From b9b5be7c4c78e02f83464d8e332a1bf819853b60 Mon Sep 17 00:00:00 2001 From: Jon Parise Date: Tue, 3 Sep 2013 16:16:46 -0700 Subject: [PATCH] Improved content encoding detection. get_encodings_from_content() can now detect HTML in-document content encoding declarations in the following formats: - HTML5 - HTML4 - XHTML 1.x served with text/html MIME type - XHTML 1.x served as XML Ref: http://www.w3.org/International/questions/qa-html-encoding-declarations --- requests/utils.py | 6 +++++- test_requests.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/requests/utils.py b/requests/utils.py index 37aa19e..ac5f59d 100644 --- a/requests/utils.py +++ b/requests/utils.py @@ -265,8 +265,12 @@ def get_encodings_from_content(content): """ charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I) + pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I) + xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') - return charset_re.findall(content) + return (charset_re.findall(content) + + pragma_re.findall(content) + + xml_re.findall(content)) def get_encoding_from_headers(headers): diff --git a/test_requests.py b/test_requests.py index b6e4659..e62d923 100755 --- a/test_requests.py +++ b/test_requests.py @@ -639,6 +639,50 @@ class RequestsTestCase(unittest.TestCase): self.assertEqual(r.url, url) +class TestContentEncodingDetection(unittest.TestCase): + + def test_none(self): + encodings = requests.utils.get_encodings_from_content('') + self.assertEqual(len(encodings), 0) + + def test_html_charset(self): + """HTML5 meta charset attribute""" + content = '<meta charset="UTF-8">' + encodings = requests.utils.get_encodings_from_content(content) + self.assertEqual(len(encodings), 1) + self.assertEqual(encodings[0], 'UTF-8') + + def test_html4_pragma(self): + """HTML4 pragma directive""" + content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8">' + encodings = requests.utils.get_encodings_from_content(content) + self.assertEqual(len(encodings), 1) + self.assertEqual(encodings[0], 'UTF-8') + + def 
test_xhtml_pragma(self): + """XHTML 1.x served with text/html MIME type""" + content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8" />' + encodings = requests.utils.get_encodings_from_content(content) + self.assertEqual(len(encodings), 1) + self.assertEqual(encodings[0], 'UTF-8') + + def test_xml(self): + """XHTML 1.x served as XML""" + content = '<?xml version="1.0" encoding="UTF-8"?>' + encodings = requests.utils.get_encodings_from_content(content) + self.assertEqual(len(encodings), 1) + self.assertEqual(encodings[0], 'UTF-8') + + def test_precedence(self): + content = ''' + <?xml version="1.0" encoding="XML"?> + <meta charset="HTML5"> + <meta http-equiv="Content-type" content="text/html;charset=HTML4" /> + '''.strip() + encodings = requests.utils.get_encodings_from_content(content) + self.assertEqual(encodings, ['HTML5', 'HTML4', 'XML']) + + class TestCaseInsensitiveDict(unittest.TestCase): def test_mapping_init(self): -- 2.7.4