2 >>> from lxml.html import fromstring, tostring
3 >>> from lxml.html.clean import clean, clean_html, Cleaner
4 >>> from lxml.html import usedoctest
8 ... <script type="text/javascript" src="evil-site"></script>
9 ... <link rel="alternate" type="text/rss" src="evil-rss">
10 ... <link rel="alternate" type="text/rss" href="http://example.com">
11 ... <link rel="stylesheet" type="text/rss" href="http://example.com">
13 ... body {background-image: url(javascript:do_evil)};
14 ... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
15 ... div {color: expression(evil)};
18 ... <body onload="evil_function()">
19 ... <!-- I am interpreted for EVIL! -->
20 ... <a href="javascript:evil_function()">a link</a>
21 ... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
22 ... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
23 ... <a href="#" onclick="evil_function()">another link</a>
24 ... <p onclick="evil_function()">a paragraph</p>
25 ... <div style="display: none">secret EVIL!</div>
26 ... <object> of EVIL! </object>
27 ... <iframe src="evil-site"></iframe>
28 ... <form action="evil-site">
29 ... Password: <input type="password" name="password">
31 ... <a href="evil-site">spam spam SPAM!</a>
32 ... <a href="http://example.com" rel="author">Author</a>
33 ... <a href="http://example.com" rel="nofollow">Text</a>
38 >>> print(re.sub('[\x00-\x07\x0E]', '', doc))
41 <script type="text/javascript" src="evil-site"></script>
42 <link rel="alternate" type="text/rss" src="evil-rss">
43 <link rel="alternate" type="text/rss" href="http://example.com">
44 <link rel="stylesheet" type="text/rss" href="http://example.com">
46 body {background-image: url(javascript:do_evil)};
47 div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
48 div {color: expression(evil)};
51 <body onload="evil_function()">
52 <!-- I am interpreted for EVIL! -->
53 <a href="javascript:evil_function()">a link</a>
54 <a href="javascrip t%20:evil_function()">a control char link</a>
55 <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
56 <a href="#" onclick="evil_function()">another link</a>
57 <p onclick="evil_function()">a paragraph</p>
58 <div style="display: none">secret EVIL!</div>
59 <object> of EVIL! </object>
60 <iframe src="evil-site"></iframe>
61 <form action="evil-site">
62 Password: <input type="password" name="password">
64 <a href="evil-site">spam spam SPAM!</a>
65 <a href="http://example.com" rel="author">Author</a>
66 <a href="http://example.com" rel="nofollow">Text</a>
71 >>> print(tostring(fromstring(doc)).decode("utf-8"))
74 <script type="text/javascript" src="evil-site"></script>
75 <link rel="alternate" type="text/rss" src="evil-rss">
76 <link rel="alternate" type="text/rss" href="http://example.com">
77 <link rel="stylesheet" type="text/rss" href="http://example.com">
79 body {background-image: url(javascript:do_evil)};
80 div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
81 div {color: expression(evil)};
84 <body onload="evil_function()">
85 <!-- I am interpreted for EVIL! -->
86 <a href="javascript:evil_function()">a link</a>
87 <a href="javascrip%20t%20:evil_function()">a control char link</a>
88 <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
89 <a href="#" onclick="evil_function()">another link</a>
90 <p onclick="evil_function()">a paragraph</p>
91 <div style="display: none">secret EVIL!</div>
92 <object> of EVIL! </object>
93 <iframe src="evil-site"></iframe>
94 <form action="evil-site">
95 Password: <input type="password" name="password">
97 <a href="evil-site">spam spam SPAM!</a>
98 <a href="http://example.com" rel="author">Author</a>
99 <a href="http://example.com" rel="nofollow">Text</a>
104 >>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
107 <style>/* deleted */</style>
110 <a href="">a link</a>
111 <a href="">a control char link</a>
113 <a href="#">another link</a>
115 <div style="display: none">secret EVIL!</div>
118 <a href="evil-site">spam spam SPAM!</a>
119 <a href="http://example.com" rel="author">Author</a>
120 <a href="http://example.com" rel="nofollow">Text</a>
125 >>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
130 <a href="">a link</a>
131 <a href="">a control char link</a>
133 <a href="#">another link</a>
135 <div>secret EVIL!</div>
138 <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
139 <a href="http://example.com" rel="author nofollow">Author</a>
140 <a href="http://example.com" rel="nofollow">Text</a>
145 >>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
150 <a href="">a link</a>
151 <a href="">a control char link</a>
153 <a href="#">another link</a>
155 <div style="display: none">secret EVIL!</div>
158 <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
159 <a href="http://example.com" rel="author nofollow">Author</a>
160 <a href="http://example.com" rel="nofollow">Text</a>
165 >>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
168 <link rel="alternate" type="text/rss" src="evil-rss">
169 <link rel="alternate" type="text/rss" href="http://example.com">
170 <link rel="stylesheet" type="text/rss" href="http://example.com">
171 <style>/* deleted */</style>
174 <a href="">a link</a>
175 <a href="">a control char link</a>
177 <a href="#">another link</a>
179 <div>secret EVIL!</div>
182 <a href="evil-site">spam spam SPAM!</a>
183 <a href="http://example.com" rel="author">Author</a>
184 <a href="http://example.com" rel="nofollow">Text</a>