a193d9944ad6ca6b524cc6fe7f27a5369ed4633a
[platform/upstream/python-lxml.git] / src / lxml / html / tests / test_clean.py
1 import unittest
2 from lxml.tests.common_imports import make_doctest
3
4 import lxml.html
5 from lxml.html.clean import Cleaner, clean_html
6
7
class CleanerTest(unittest.TestCase):
    """Unit tests for ``Cleaner`` options and the ``clean_html`` helper."""

    def test_allow_tags(self):
        # The document below contains twelve elements:
        # html, head, body, p, table, 2 x tr, 4 x td and img.
        markup = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        document = lxml.html.document_fromstring(markup)
        cleaned = Cleaner(
            remove_unknown_tags=False,
            allow_tags=['table', 'tr', 'td'],
        ).clean_html(document)

        # 12 elements minus the 5 that are not in allow_tags (html, head,
        # body, p, img).  NOTE(review): the "+1" presumably accounts for a
        # root element that is still present in the result -- confirm
        # against the Cleaner implementation.
        remaining = sum(1 for _ in cleaned.iter())
        self.assertEqual(12-5+1, remaining)

    def test_safe_attrs_included(self):
        # With 'style' added to the safe attribute set, the markup is
        # expected to come back unchanged.
        markup = """<p><span style="color: #00ffff;">Cyan</span></p>"""

        permitted = set(lxml.html.defs.safe_attrs).union(('style',))
        attr_cleaner = Cleaner(safe_attrs_only=True, safe_attrs=permitted)

        self.assertEqual(markup, attr_cleaner.clean_html(markup))

    def test_safe_attrs_excluded(self):
        # With an empty safe attribute set, the style attribute is
        # expected to be stripped.
        markup = """<p><span style="color: #00ffff;">Cyan</span></p>"""
        stripped = """<p><span>Cyan</span></p>"""

        attr_cleaner = Cleaner(safe_attrs_only=True, safe_attrs=set())

        self.assertEqual(stripped, attr_cleaner.clean_html(markup))

    def test_clean_invalid_root_tag(self):
        # Smoke test only: cleaning input whose root tag is invalid must
        # work at all and keep the text content.
        cases = (
            ('parent <invalid tag>child</another>', 'parent child'),
            ('<invalid tag>child</another>', 'child'),
        )
        for markup, expected_text in cases:
            fragment = lxml.html.fromstring(markup)
            self.assertEqual(expected_text,
                             clean_html(fragment).text_content())
70
71
def test_suite():
    """Build the test suite for this module.

    The suite contains, in order, the doctest files ``test_clean.txt`` and
    ``test_clean_embed.txt`` (loaded through ``make_doctest``) followed by
    the ``CleanerTest`` unit tests.

    Returns:
        A ``unittest.TestSuite`` holding all of the above.
    """
    suite = unittest.TestSuite()
    suite.addTests([make_doctest('test_clean.txt')])
    suite.addTests([make_doctest('test_clean_embed.txt')])
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13; the default loader collects the same "test*" methods and is
    # available on every Python version.
    suite.addTests(
        unittest.defaultTestLoader.loadTestsFromTestCase(CleanerTest))
    return suite
77     return suite