1 We'll define a link translation function:
3 >>> base_href = 'http://old/base/path.html'
4 >>> try: import urlparse
5 ... except ImportError: import urllib.parse as urlparse
6 >>> def relocate_href(link):
7 ... link = urlparse.urljoin(base_href, link)
8 ... if link.startswith('http://old'):
9 ... return 'https://new' + link[len('http://old'):]
13 Now for content. First, to make things easier for ourselves, we need to
14 trim the normalized HTML that these functions return::
18 >>> from lxml.html import usedoctest, tostring
19 >>> from lxml.html import rewrite_links
20 >>> print(rewrite_links(
21 ... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
22 <a href="https://new/blah/blah.html">link</a>
23 >>> print(rewrite_links(
24 ... '<script src="http://old/foo.js"></script>', relocate_href))
25 <script src="https://new/foo.js"></script>
26 >>> print(rewrite_links(
27 ... '<link href="foo.css">', relocate_href))
28 <link href="https://new/base/foo.css">
29 >>> print(rewrite_links('''\
30 ... <base href="http://blah/stuff/index.html">
31 ... <link href="foo.css">
32 ... <a href="http://old/bar.html">x</a>\
33 ... ''', relocate_href))
34 <link href="http://blah/stuff/foo.css">
35 <a href="https://new/bar.html">x</a>
37 Links in CSS are also handled::
39 >>> print(rewrite_links('''
41 ... body {background-image: url(http://old/image.gif)};
42 ... @import "http://old/other-style.css";
43 ... </style>''', relocate_href))
45 body {background-image: url(https://new/image.gif)};
46 @import "https://new/other-style.css";
47 </style></head></html>
48 >>> print(rewrite_links('''
50 ... body {background-image: url("http://old/image.gif")};
51 ... @import "http://old/other-style.css";
52 ... </style>''', relocate_href))
54 body {background-image: url("https://new/image.gif")};
55 @import "https://new/other-style.css";
56 </style></head></html>
58 Links in ``style`` attributes are also rewritten::
60 >>> print(rewrite_links('''
61 ... <div style="background-image: url(http://old/image.gif)">text</div>
62 ... ''', relocate_href))
63 <div style="background-image: url(https://new/image.gif)">text</div>
65 The ``<base href>`` tag is respected as well (and then removed from the output)::
67 >>> print(rewrite_links('''
69 ... <base href="http://old/">
72 ... <a href="foo.html">link</a>
73 ... </body></html>''', relocate_href))
77 <a href="https://new/foo.html">link</a>
81 The ``iterlinks`` method (and function) gives you all the links in
82 the document, along with the element and attribute the link comes
83 from. This makes it fairly easy to see what resources the document
84 references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
85 is something embedded). It returns a generator of ``(element, attrib,
86 link, pos)``, which is awkward to test here, so we'll make a printer::
88 >>> from lxml.html import iterlinks, document_fromstring, tostring
89 >>> def print_iter(seq):
90 ... for element, attrib, link, pos in seq:
92 ... extra = '@%s' % pos
95 ... print('%s %s="%s"%s' % (element.tag, attrib, link, extra))
96 >>> print_iter(iterlinks('''
99 ... <link rel="stylesheet" href="style.css">
100 ... <style type="text/css">
102 ... background-image: url(/bg.gif);
104 ... @import "/other-styles.css";
106 ... <script src="/js-funcs.js"></script>
111 ... <li><a href="/test.html">Test stuff</a></li>
112 ... <li><a href="/other.html">Other stuff</a></li>
114 ... <td style="background-image: url(/td-bg.png)">
115 ... <img src="/logo.gif">
119 ... </body></html>'''))
120 link href="style.css"
121 style None="/bg.gif"@40
122 style None="/other-styles.css"@69
123 script src="/js-funcs.js"
126 td style="/td-bg.png"@22
129 An application of ``iterlinks()`` is ``make_links_absolute()``::
131 >>> from lxml.html import make_links_absolute
132 >>> print(make_links_absolute('''
135 ... <link rel="stylesheet" href="style.css">
136 ... <style type="text/css">
138 ... background-image: url(/bg.gif);
140 ... @import "/other-styles.css";
142 ... <script src="/js-funcs.js"></script>
147 ... <li><a href=" /test.html">Test stuff</a></li>
148 ... <li><a href="/other.html ">Other stuff</a></li>
150 ... <tr><td style="background-image: url( /td-bg.png )">
151 ... <img src="logo.gif">
155 ... </body></html>''',
156 ... base_url="http://my.little.server/url/"))
159 <link rel="stylesheet" href="http://my.little.server/url/style.css">
160 <style type="text/css">
162 background-image: url(http://my.little.server/bg.gif);
164 @import "http://my.little.server/other-styles.css";
166 <script src="http://my.little.server/js-funcs.js"></script>
171 <li><a href="http://my.little.server/test.html">Test stuff</a></li>
172 <li><a href="http://my.little.server/other.html">Other stuff</a></li>
175 <td style="background-image: url(http://my.little.server/td-bg.png)">
176 <img src="http://my.little.server/url/logo.gif">