Imported Upstream version 2.3.5
[platform/upstream/python-lxml.git] / src / lxml / tests / test_htmlparser.py
1 # -*- coding: utf-8 -*-
2
3 """
4 HTML parser test cases for etree
5 """
6
7 import unittest
8 import tempfile, os, os.path, sys
9
# Put the directory containing this test module at the front of sys.path so
# that the sibling helper module imported just below (common_imports) can be
# found with a plain absolute import.
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
    sys.path.insert(0, this_dir)  # needed for Py3 (per the original author's note)
13
14 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file
16
# Py2/Py3 compatibility: expose a ``unicode`` name on both major versions.
#
# A plain name lookup is used instead of ``__builtins__["unicode"]`` because
# ``__builtins__`` is a dict only when this module is imported; when the file
# is executed as ``__main__`` it is the builtins *module*, and subscripting a
# module raises TypeError, which the previous except clause did not catch.
try:
    unicode
except NameError:
    # Python 3: the text type is simply ``str``.
    unicode = str
21
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases.

    Exercises the HTML entry points of the ``etree`` module under test:
    ``etree.HTML()``, ``etree.parse()`` with an ``HTMLParser``, element
    creation through ``HTMLParser.makeelement``, ``iterparse(html=True)``
    and the parser target interface.

    Per-test notes are written as comments rather than docstrings so that
    unittest's verbose output keeps showing the test method names.
    """
    # Module under test; kept as a class attribute so tests use self.etree.
    etree = etree

    # Well-formed reference document and its pretty-printed serialisation.
    html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
    # Same document with unclosed/mismatched tags; the recovering parser is
    # expected to turn it back into html_str.
    broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>")
    # Text (non-bytes) document containing non-ASCII characters.
    uhtml_str = _str("<html><head><title>test Ã¡\uF8D2</title></head><body><h1>page Ã¡\uF8D2 title</h1></body></html>")

    def tearDown(self):
        # Always restore the default parser: some tests replace it globally.
        super(HtmlParserTestCase, self).tearDown()
        self.etree.set_default_parser()

    def test_module_HTML(self):
        # Parsing and re-serialising a clean document is a round trip.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_unicode(self):
        # Text input with non-ASCII characters survives a UTF-8 round trip.
        element = self.etree.HTML(self.uhtml_str)
        self.assertEqual(unicode(self.etree.tostring(element, method="html",
                                                     encoding='UTF8'), 'UTF8'),
                         unicode(self.uhtml_str.encode('UTF8'), 'UTF8'))

    def test_module_HTML_pretty_print(self):
        # pretty_print=True yields the line-broken reference serialisation.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True),
                         self.html_str_pretty)

    def test_module_parse_html_error(self):
        # With recovery disabled, a stray end tag is a syntax error.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        # _bytes() keeps the input a byte string on every Python version,
        # matching the byte fixtures defined on the class.
        f = BytesIO(_bytes("<html></body>"))
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    def test_html_element_name_empty(self):
        # A namespace without a local name is rejected on creation and on
        # assignment to .tag.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        el = Element('name')
        self.assertRaises(ValueError, Element, '{}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{}')

        self.assertRaises(ValueError, Element, '{test}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{test}')

    def test_html_element_name_colon(self):
        # Unlike XML elements, HTML elements may carry a colon in the tag.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        pname = Element('p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = Element('{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

        pname = Element('name')
        pname.tag = 'p:name'
        self.assertEqual(pname.tag, 'p:name')

    def test_html_element_name_quote(self):
        # Quote characters are never valid in a tag name.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, 'p"name')
        self.assertRaises(ValueError, Element, "na'me")
        self.assertRaises(ValueError, Element, '{test}"name')
        self.assertRaises(ValueError, Element, "{test}name'")

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
        self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
        # A rejected assignment must leave the original tag untouched.
        self.assertEqual(el.tag, "name")

    def test_html_element_name_space(self):
        # Whitespace is never valid in a tag name.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, ' name ')
        self.assertRaises(ValueError, Element, 'na me')
        self.assertRaises(ValueError, Element, '{test} name')

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
        # A rejected assignment must leave the original tag untouched.
        self.assertEqual(el.tag, "name")

    def test_html_subelement_name_empty(self):
        # SubElement applies the same empty-name validation as Element.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, '{}')
        self.assertRaises(ValueError, SubElement, el, '{test}')

    def test_html_subelement_name_colon(self):
        # Children of an HTML element may also carry a colon in the tag.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement
        SubElement = self.etree.SubElement

        el = Element('name')
        pname = SubElement(el, 'p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = SubElement(el, '{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

    def test_html_subelement_name_quote(self):
        # SubElement rejects quote characters in tag names.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement
        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, "name'")
        self.assertRaises(ValueError, SubElement, el, 'na"me')
        self.assertRaises(ValueError, SubElement, el, "{test}na'me")
        self.assertRaises(ValueError, SubElement, el, '{test}"name')

    def test_html_subelement_name_space(self):
        # SubElement rejects whitespace in tag names.
        parser = self.etree.HTMLParser()
        Element = parser.makeelement
        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, ' name ')
        self.assertRaises(ValueError, SubElement, el, 'na me')
        self.assertRaises(ValueError, SubElement, el, '{test} name')

    def test_module_parse_html_norecover(self):
        # With recovery disabled, the broken fixture must fail to parse.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = BytesIO(self.broken_html_str)
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    def test_parse_encoding_8bit_explicit(self):
        # An explicit parser encoding decodes Latin-1 input correctly.
        text = _str('Søk på nettet')
        html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')

        tree = self.etree.parse(
            BytesIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_parse_encoding_8bit_override(self):
        # The document claims UTF-8 in its <meta> tag but is really Latin-1.
        text = _str('Søk på nettet')
        wrong_head = _str('''
        <head>
          <meta http-equiv="Content-Type"
                content="text/html; charset=UTF-8" />
        </head>''')
        html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
                                                                        text)
                      ).encode('iso-8859-1')

        # Parsing with the default parser is expected to fail ...
        self.assertRaises(self.etree.ParseError,
                          self.etree.parse,
                          BytesIO(html_latin1))

        # ... while an explicit parser encoding overrides the wrong <meta>.
        tree = self.etree.parse(
            BytesIO(html_latin1),
            self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_module_HTML_broken(self):
        # The recovering parser repairs the broken fixture into html_str.
        element = self.etree.HTML(self.broken_html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_cdata(self):
        # by default, libxml2 generates CDATA nodes for raw-text element
        # content (<script>, and presumably <style> as used here); the text
        # must still be readable through the normal .text property
        html = _bytes('<html><head><style>foo</style></head></html>')
        element = self.etree.HTML(html)
        self.assertEqual(element[0][0].text, "foo")

    def test_module_HTML_access(self):
        # Children are reachable by index: html -> head -> title.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(element[0][0].tag, 'title')

    def test_module_parse_html(self):
        # Parse from a real file object opened on a temporary file.
        parser = self.etree.HTMLParser()
        # mkstemp() creates the file atomically; mktemp() only returns a name
        # and leaves a window in which another process could claim it.
        handle, filename = tempfile.mkstemp(suffix=".html")
        os.close(handle)
        try:
            write_to_file(filename, self.html_str, 'wb')
            f = open(filename, 'rb')
            try:
                tree = self.etree.parse(f, parser)
            finally:
                # Close even when parsing fails, so the file can be removed
                # below on platforms that refuse to delete open files.
                f.close()
            self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                             self.html_str)
        finally:
            os.remove(filename)

    def test_module_parse_html_filelike(self):
        # Parse from a minimal file-like object instead of a real file.
        parser = self.etree.HTMLParser()
        f = SillyFileLike(self.html_str)
        tree = self.etree.parse(f, parser)
        html = self.etree.tostring(tree.getroot(),
                                   method="html", encoding='UTF-8')
        self.assertEqual(html, self.html_str)

    # Disabled test, kept from the original source for reference:
##     def test_module_parse_html_filelike_unicode(self):
##         parser = self.etree.HTMLParser()
##         f = SillyFileLike(self.uhtml_str)
##         tree = self.etree.parse(f, parser)
##         html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
##         self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

    def test_html_file_error(self):
        # A missing input file surfaces as IOError.
        parser = self.etree.HTMLParser()
        parse = self.etree.parse
        self.assertRaises(IOError,
                          parse, "__some_hopefully_nonexisting_file__.html",
                          parser)

    def test_default_parser_HTML_broken(self):
        # The default (XML) parser rejects the broken HTML fixture.
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

        # After installing an HTMLParser as default, it parses and recovers.
        self.etree.set_default_parser( self.etree.HTMLParser() )

        tree = self.etree.parse(BytesIO(self.broken_html_str))
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)

        # Resetting the default parser restores the strict behaviour.
        self.etree.set_default_parser()

        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

    def test_html_iterparse(self):
        # Default iterparse events are 'end' events in document order.
        iterparse = self.etree.iterparse
        f = BytesIO(_bytes(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>'))

        iterator = iterparse(f, html=True)
        # The root is only available once parsing has started.
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(
            [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
             ('end', root[1]), ('end', root)],
            events)

    def test_html_iterparse_file(self):
        # iterparse over a file from the test directory, given by path.
        iterparse = self.etree.iterparse
        iterator = iterparse(fileInTestDir("css_shakespear.html"),
                             html=True)

        self.assertEqual(None, iterator.root)
        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(249, len(events))
        # Without an explicit events argument only 'end' events are reported.
        self.assertEqual(
            [],
            [ event for (event, element) in events if event != 'end' ])

    def test_html_iterparse_start(self):
        # Requesting only 'start' events yields elements in opening order.
        iterparse = self.etree.iterparse
        f = BytesIO(_bytes(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>'))

        iterator = iterparse(f, html=True, events=('start',))
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(
            [('start', root), ('start', root[0]), ('start', root[0][0]),
                ('start', root[1]), ('start', root[1][0])],
            events)

    def test_html_parser_target_tag(self):
        # A parser target receives start/end callbacks; close() supplies the
        # value returned by parser.close().
        assertFalse = self.assertFalse
        events = []
        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)
            def end(self, tag):
                events.append(("end", tag))
            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())

        parser.feed("<html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_empty(self):
        # An empty <!DOCTYPE> is reported to the target as three None values.
        assertFalse = self.assertFalse
        events = []
        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)
            def end(self, tag):
                events.append(("end", tag))
            def doctype(self, *args):
                events.append(("doctype", args))
            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed("<!DOCTYPE><html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", (None, None, None)),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_html(self):
        # <!DOCTYPE html> reports only the root name; both IDs are None.
        assertFalse = self.assertFalse
        events = []
        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)
            def end(self, tag):
                events.append(("end", tag))
            def doctype(self, *args):
                events.append(("doctype", args))
            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed("<!DOCTYPE html><html><body></body></html>")
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", ("html", None, None)),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

    def test_html_parser_target_doctype_html_full(self):
        # A full DOCTYPE reports root name, public ID and system ID.
        assertFalse = self.assertFalse
        events = []
        class Target(object):
            def start(self, tag, attrib):
                events.append(("start", tag))
                assertFalse(attrib)
            def end(self, tag):
                events.append(("end", tag))
            def doctype(self, *args):
                events.append(("doctype", args))
            def close(self):
                return "DONE"

        parser = self.etree.HTMLParser(target=Target())
        parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                    '<html><body></body></html>')
        done = parser.close()

        self.assertEqual("DONE", done)
        self.assertEqual([
            ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
            ("start", "html"), ("start", "body"),
            ("end", "body"), ("end", "html")], events)

401
402
def test_suite():
    """Build and return the unittest suite for this module.

    Returns:
        A ``unittest.TestSuite`` holding every test method of
        ``HtmlParserTestCase``.
    """
    suite = unittest.TestSuite()
    # loadTestsFromTestCase() replaces unittest.makeSuite(), which is
    # deprecated and has been removed from recent Python versions.
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite
407
# This module is not meant to be executed directly; when it is, only print a
# hint pointing at the test.py runner script.
if __name__ == '__main__':
    print('to test use test.py %s' % __file__)