Return the r.content as unicode.
author Rick Mak <rick.mak@gmail.com>
Wed, 17 Aug 2011 04:49:19 +0000 (12:49 +0800)
committer Rick Mak <rick.mak@gmail.com>
Wed, 17 Aug 2011 04:49:19 +0000 (12:49 +0800)
requests/models.py
test_unicode.py [new file with mode: 0644]

index a277555042973e8fcd36279440f33ce44c5e7022..2e35c7ec46bfafbec8c1897580b048eec799c082 100644 (file)
@@ -10,6 +10,7 @@ import urllib
 import urllib2
 import socket
 import zlib
+import cgi
 
 from urllib2 import HTTPError
 from urlparse import urlparse, urlunparse, urljoin
@@ -400,8 +401,54 @@ class Response(object):
                     self._content = zlib.decompress(self._content, 16+zlib.MAX_WBITS)
                 except zlib.error:
                     pass
-            return self._content
+            return self.unicode_content(self._content)
+    
+
+    def get_content_type(self):
+        """Return ``(content_type, params)`` parsed from the content-type header."""
+        # A missing header is parsed as "" and yields ('', {}) instead of raising.
+        return cgi.parse_header(self.headers.get("content-type") or "")
+
+    def get_encoding_from_content_type(self):
+        """Return the charset given in the content-type header, or None if it has none."""
+        params = self.get_content_type()[1]
+        return params["charset"].strip("'\"") if "charset" in params else None
+
+    def get_encodings_from_content(self, content):
+        """Return every charset named by a <meta ... charset=...> tag in ``content``."""
+        import re  # local import: no module-level ``import re`` is visible here
+        if getattr(self, "_charset_re", None) is None:  # compile once per instance
+            self._charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+        return self._charset_re.findall(content)
+    
+    def unicode_content(self, content):
+        """Return ``content`` decoded to unicode.
+
+        Encodings are tried in order: the content-type charset, then every
+        charset from a <meta ... charset=XXX> tag. If none decodes cleanly,
+        the last usable one (utf-8 if none) is used with bad bytes replaced.
+        """
+        def candidates():  # generator: <meta> is scanned only if the header fails
+            header_encoding = self.get_encoding_from_content_type()
+            if header_encoding:
+                yield header_encoding
+            for meta_encoding in self.get_encodings_from_content(content):
+                yield meta_encoding
+        self.tried_encodings = []  # reset so earlier calls do not leak in
+        fallback = "utf-8"
+        for encoding in candidates():
+            if encoding in self.tried_encodings:
+                continue
+            try:
+                return unicode(content, encoding)
+            except LookupError:  # charset name Python has no codec for
+                self.tried_encodings.append(encoding)
+            except UnicodeError:  # known codec, but the bytes do not fit it
+                self.tried_encodings.append(encoding)
+                fallback = encoding
 
+        # Fall back: decode leniently instead of raising.
+        return unicode(content, fallback, errors="replace")
 
     def raise_for_status(self):
         """Raises stored :class:`HTTPError` or :class:`URLError`, if one occured."""
diff --git a/test_unicode.py b/test_unicode.py
new file mode 100644 (file)
index 0000000..38a1dfc
--- /dev/null
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+
+import unittest
+import cookielib
+
+try:
+    import omnijson as json
+except ImportError:
+    import json
+
+import requests
+
+
+
+HTTPBIN_URL = 'http://httpbin.org/'  # base url used by httpbin()
+HTTPSBIN_URL = 'https://httpbin.ep.io/'  # base url used by httpsbin()
+
+# HTTPBIN_URL = 'http://staging.httpbin.org/'
+# HTTPSBIN_URL = 'https://httpbin-staging.ep.io/'
+
+
+def httpbin(*suffix):
+    """Build an HTTPBIN url: the base url followed by the '/'-joined suffix parts."""
+    path = '/'.join(suffix)
+    return HTTPBIN_URL + path
+
+
+def httpsbin(*suffix):
+    """Build an HTTPSBIN url: the base url followed by the '/'-joined suffix parts."""
+    path = '/'.join(suffix)
+    return HTTPSBIN_URL + path
+
+
+SERVICES = (httpbin, httpsbin)  # both url builders; not referenced in the visible part of this file
+
+
+
+class RequestsTestSuite(unittest.TestCase):
+    """Checks that ``Response.content`` is returned as unicode.
+
+    NOTE(review): both tests fetch live urls, so they need network access.
+    """
+
+    def setUp(self):
+        """No fixture is required."""
+
+    def tearDown(self):
+        """Nothing to clean up."""
+
+    def test_HTTP_200_OK_GET_ON_ISO88591(self):
+        r = requests.get("http://www.qypedeals.de/Verzehrgutschein+für+Jellyfish")
+        self.assertEqual(r.status_code, 200)
+        # assertIsInstance needs Python 2.7; this form also runs on 2.5/2.6.
+        self.assertTrue(isinstance(r.content, unicode))
+
+    def test_HTTP_200_OK_GET_ON_BIG5(self):
+        r = requests.get("http://google.com.hk/")
+        self.assertEqual(r.status_code, 200)
+        self.assertTrue(isinstance(r.content, unicode))
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the test suite when this file is executed as a script