2824f64cee2f4f8ab82e892138ca5d13318a83e4
[platform/upstream/python-lxml.git] / src / lxml / html / tests / test_clean.txt
1 >>> import re  # used further down to strip control characters before printing the fixture
2 >>> from lxml.html import fromstring, tostring  # parse/serialize helpers for the uncleaned round trip
3 >>> from lxml.html.clean import clean, clean_html, Cleaner  # only Cleaner is exercised by the examples visible here
4 >>> from lxml.html import usedoctest  # NOTE(review): presumably switches doctest to lxml's HTML-aware output comparison - confirm
5
6 >>> doc = '''<html>
7 ...   <head>
8 ...     <script type="text/javascript" src="evil-site"></script>
9 ...     <link rel="alternate" type="text/rss" src="evil-rss">
10 ...     <link rel="alternate" type="text/rss" href="http://example.com">
11 ...     <link rel="stylesheet" type="text/rss" href="http://example.com">
12 ...     <style>
13 ...       body {background-image: url(javascript:do_evil)};
14 ...       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
15 ...       div {color: expression(evil)};
16 ...     </style>
17 ...   </head>
18 ...   <body onload="evil_function()">
19 ...     <!-- I am interpreted for EVIL! -->
20 ...     <a href="javascript:evil_function()">a link</a>
21 ...     <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
22 ...     <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
23 ...     <a href="#" onclick="evil_function()">another link</a>
24 ...     <p onclick="evil_function()">a paragraph</p>
25 ...     <div style="display: none">secret EVIL!</div>
26 ...     <object> of EVIL! </object>
27 ...     <iframe src="evil-site"></iframe>
28 ...     <form action="evil-site">
29 ...       Password: <input type="password" name="password">
30 ...     </form>
31 ...     <a href="evil-site">spam spam SPAM!</a>
32 ...     <a href="http://example.com" rel="author">Author</a>
33 ...     <a href="http://example.com" rel="nofollow">Text</a>
34 ...     <img src="evil!">
35 ...   </body>
36 ... </html>'''  # shared fixture: hostile constructs (script, javascript:/data: URLs, on* handlers, object/iframe/form) mixed with ordinary links, fed to every run below
37
38 >>> print(re.sub('[\x00-\x07\x0E]', '', doc))  # show the raw fixture with the \x00-\x07 and \x0E control characters stripped so the expected text is printable
39 <html>
40   <head>
41     <script type="text/javascript" src="evil-site"></script>
42     <link rel="alternate" type="text/rss" src="evil-rss">
43     <link rel="alternate" type="text/rss" href="http://example.com">
44     <link rel="stylesheet" type="text/rss" href="http://example.com">
45     <style>
46       body {background-image: url(javascript:do_evil)};
47       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
48       div {color: expression(evil)};
49     </style>
50   </head>
51   <body onload="evil_function()">
52     <!-- I am interpreted for EVIL! -->
53     <a href="javascript:evil_function()">a link</a>
54     <a href="javascrip t%20:evil_function()">a control char link</a>
55     <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
56     <a href="#" onclick="evil_function()">another link</a>
57     <p onclick="evil_function()">a paragraph</p>
58     <div style="display: none">secret EVIL!</div>
59     <object> of EVIL! </object>
60     <iframe src="evil-site"></iframe>
61     <form action="evil-site">
62       Password: <input type="password" name="password">
63     </form>
64     <a href="evil-site">spam spam SPAM!</a>
65     <a href="http://example.com" rel="author">Author</a>
66     <a href="http://example.com" rel="nofollow">Text</a>
67     <img src="evil!">
68   </body>
69 </html>
70
71 >>> print(tostring(fromstring(doc)).decode("utf-8"))  # parse + serialize without cleaning: nothing is removed; only the control-char href changes (control chars absent, the space shown as %20)
72 <html>
73   <head>
74     <script type="text/javascript" src="evil-site"></script>
75     <link rel="alternate" type="text/rss" src="evil-rss">
76     <link rel="alternate" type="text/rss" href="http://example.com">
77     <link rel="stylesheet" type="text/rss" href="http://example.com">
78     <style>
79       body {background-image: url(javascript:do_evil)};
80       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
81       div {color: expression(evil)};
82     </style>
83   </head>
84   <body onload="evil_function()">
85     <!-- I am interpreted for EVIL! -->
86     <a href="javascript:evil_function()">a link</a>
87     <a href="javascrip%20t%20:evil_function()">a control char link</a>
88     <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
89     <a href="#" onclick="evil_function()">another link</a>
90     <p onclick="evil_function()">a paragraph</p>
91     <div style="display: none">secret EVIL!</div>
92     <object> of EVIL! </object>
93     <iframe src="evil-site"></iframe>
94     <form action="evil-site">
95       Password: <input type="password" name="password">
96     </form>
97     <a href="evil-site">spam spam SPAM!</a>
98     <a href="http://example.com" rel="author">Author</a>
99     <a href="http://example.com" rel="nofollow">Text</a>
100     <img src="evil!">
101   </body>
102 </html>
103
104 >>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))  # other options left unset: script, <link>s, comment and on* handlers are gone, javascript:/data: hrefs emptied, object/iframe/form/input tags dropped (text kept), <style> body blanked; the inline style attribute survives
105 <html>
106   <head>
107     <style>/* deleted */</style>
108   </head>
109   <body>
110     <a href="">a link</a>
111     <a href="">a control char link</a>
112     <a href="">data</a>
113     <a href="#">another link</a>
114     <p>a paragraph</p>
115     <div style="display: none">secret EVIL!</div>
116     of EVIL!
117     Password:
118     <a href="evil-site">spam spam SPAM!</a>
119     <a href="http://example.com" rel="author">Author</a>
120     <a href="http://example.com" rel="nofollow">Text</a>
121     <img src="evil!">
122   </body>
123 </html>
124
125 >>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))  # with these options the <style> element is removed entirely, the div loses its style attribute, and links with a real target carry nofollow in rel (appended to an existing rel, not duplicated); the emptied and "#" links get no rel
126 <html>
127   <head>
128   </head>
129   <body>
130     <a href="">a link</a>
131     <a href="">a control char link</a>
132     <a href="">data</a>
133     <a href="#">another link</a>
134     <p>a paragraph</p>
135     <div>secret EVIL!</div>
136     of EVIL!
137     Password:
138     <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
139     <a href="http://example.com" rel="author nofollow">Author</a>
140     <a href="http://example.com" rel="nofollow">Text</a>
141     <img src="evil!">
142   </body>
143 </html>
144
145 >>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))  # inline_style=False with style=True: the <style> element is still removed while the div keeps its style attribute; nofollow handling as shown in the expected rel values
146 <html>
147   <head>
148   </head>
149   <body>
150     <a href="">a link</a>
151     <a href="">a control char link</a>
152     <a href="">data</a>
153     <a href="#">another link</a>
154     <p>a paragraph</p>
155     <div style="display: none">secret EVIL!</div>
156     of EVIL!
157     Password:
158     <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
159     <a href="http://example.com" rel="author nofollow">Author</a>
160     <a href="http://example.com" rel="nofollow">Text</a>
161     <img src="evil!">
162   </body>
163 </html>
164
165 >>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))  # with links=False the three <link> elements are kept; <style> body is blanked rather than removed; the div's style attribute is gone (NOTE(review): presumably because safe_attrs_only is not overridden here - confirm)
166 <html>
167   <head>
168     <link rel="alternate" type="text/rss" src="evil-rss">
169     <link rel="alternate" type="text/rss" href="http://example.com">
170     <link rel="stylesheet" type="text/rss" href="http://example.com">
171     <style>/* deleted */</style>
172   </head>
173   <body>
174     <a href="">a link</a>
175     <a href="">a control char link</a>
176     <a href="">data</a>
177     <a href="#">another link</a>
178     <p>a paragraph</p>
179     <div>secret EVIL!</div>
180     of EVIL!
181     Password:
182     <a href="evil-site">spam spam SPAM!</a>
183     <a href="http://example.com" rel="author">Author</a>
184     <a href="http://example.com" rel="nofollow">Text</a>
185     <img src="evil!">
186   </body>
187 </html>