Improved content encoding detection.

author Jon Parise <jon@indelible.org>

Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)

committer Jon Parise <jon@indelible.org>

Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)
author Jon Parise <jon@indelible.org>
Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)
committer Jon Parise <jon@indelible.org>
Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)
diff --git a/requests/utils.py b/requests/utils.py

index 37aa19e..ac5f59d 100644 (file)
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
      """
  
      charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
  
-    return charset_re.findall(content)
+    return (charset_re.findall(content) +
+            pragma_re.findall(content) +
+            xml_re.findall(content))
  
  
  def get_encoding_from_headers(headers):
diff --git a/test_requests.py b/test_requests.py

index b6e4659..e62d923 100755 (executable)
--- a/test_requests.py
+++ b/test_requests.py
@@ -639,6 +639,50 @@ class RequestsTestCase(unittest.TestCase):
          self.assertEqual(r.url, url)
  
  
+class TestContentEncodingDetection(unittest.TestCase):
+
+    def test_none(self):
+        encodings = requests.utils.get_encodings_from_content('')
+        self.assertEqual(len(encodings), 0)
+
+    def test_html_charset(self):
+        """HTML5 meta charset attribute"""
+        content = '<meta charset="UTF-8">'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_html4_pragma(self):
+        """HTML4 pragma directive"""
+        content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8">'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_xhtml_pragma(self):
+        """XHTML 1.x served with text/html MIME type"""
+        content = '<meta http-equiv="Content-type" content="text/html;charset=UTF-8" />'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_xml(self):
+        """XHTML 1.x served as XML"""
+        content = '<?xml version="1.0" encoding="UTF-8"?>'
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(len(encodings), 1)
+        self.assertEqual(encodings[0], 'UTF-8')
+
+    def test_precedence(self):
+        content = '''
+        <?xml version="1.0" encoding="XML"?>
+        <meta charset="HTML5">
+        <meta http-equiv="Content-type" content="text/html;charset=HTML4" />
+        '''.strip()
+        encodings = requests.utils.get_encodings_from_content(content)
+        self.assertEqual(encodings, ['HTML5', 'HTML4', 'XML'])
+
+
  class TestCaseInsensitiveDict(unittest.TestCase):
  
      def test_mapping_init(self):
author	Jon Parise <jon@indelible.org>
	Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)
committer	Jon Parise <jon@indelible.org>
	Tue, 3 Sep 2013 23:16:46 +0000 (16:16 -0700)
requests/utils.py		patch | blob | history
test_requests.py		patch | blob | history