From: Rick Mak
Date: Wed, 17 Aug 2011 04:49:19 +0000 (+0800)
Subject: Return the r.content as unicode.
X-Git-Tag: v0.6.4^2~5^2~5^2~40^2~3^2~2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8c737069a53933566403ad115234e9e6c809b3d6;p=services%2Fpython-requests.git

Return the r.content as unicode.
---

diff --git a/requests/models.py b/requests/models.py
index a277555..2e35c7e 100644
--- a/requests/models.py
+++ b/requests/models.py
@@ -10,6 +10,7 @@ import urllib
 import urllib2
 import socket
 import zlib
+import cgi
 
 from urllib2 import HTTPError
 from urlparse import urlparse, urlunparse, urljoin
@@ -400,8 +401,54 @@ class Response(object):
                     self._content = zlib.decompress(self._content, 16+zlib.MAX_WBITS)
                 except zlib.error:
                     pass
-            return self._content
+            return self.unicode_content(self._content)
+
+
+    def get_content_type(self):
+        content_type = self.headers.get("content-type")
+        content_type, params = cgi.parse_header(content_type)
+        return content_type, params
+
+    def get_encoding_from_content_type(self):
+        content_type, params = self.get_content_type()
+        if "charset" in params:
+            return params["charset"].strip("'\"")
+
+    def get_encodings_from_content(self, content):
+        if self._charset_re is None:
+            self._charset_re = re.compile(
+                r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+            )
+        return self._charset_re.findall(content)
+
+    def unicode_content(self, content):
+        """
+        Returns the requested content back in unicode.
+        Tried:
+        1. charset from content-type
+        2. every encodings from <meta ... charset=XXX>
+        3. fall back and replace all unicode characters
+        """
+        # Try charset from content-type
+        encoding = self.get_encoding_from_content_type()
+        if encoding:
+            try:
+                return unicode(content, encoding)
+            except UnicodeError:
+                self.tried_encodings.append(encoding)
+
+        # Try every encodings from <meta ... charset=XXX>
+        encodings = self.get_encodings_from_content(content)
+        for encoding in encodings:
+            if encoding in self.tried_encodings:
+                continue
+            try:
+                return unicode(content, encoding)
+            except UnicodeError:
+                self.tried_encodings.append(encoding)
 
+        # Fall back:
+        return unicode(content, encoding, errors="replace")
 
     def raise_for_status(self):
         """Raises stored :class:`HTTPError` or :class:`URLError`, if one occured."""
diff --git a/test_unicode.py b/test_unicode.py
new file mode 100644
index 0000000..38a1dfc
--- /dev/null
+++ b/test_unicode.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+
+import unittest
+import cookielib
+
+try:
+    import omnijson as json
+except ImportError:
+    import json
+
+import requests
+
+
+
+HTTPBIN_URL = 'http://httpbin.org/'
+HTTPSBIN_URL = 'https://httpbin.ep.io/'
+
+# HTTPBIN_URL = 'http://staging.httpbin.org/'
+# HTTPSBIN_URL = 'https://httpbin-staging.ep.io/'
+
+
+def httpbin(*suffix):
+    """Returns url for HTTPBIN resource."""
+
+    return HTTPBIN_URL + '/'.join(suffix)
+
+
+def httpsbin(*suffix):
+    """Returns url for HTTPSBIN resource."""
+
+    return HTTPSBIN_URL + '/'.join(suffix)
+
+
+SERVICES = (httpbin, httpsbin)
+
+
+
+class RequestsTestSuite(unittest.TestCase):
+    """Requests test cases."""
+
+
+    def setUp(self):
+        pass
+
+
+    def tearDown(self):
+        """Teardown."""
+        pass
+
+
+    def test_HTTP_200_OK_GET_ON_ISO88591(self):
+        r = requests.get("http://www.qypedeals.de/Verzehrgutschein+für+Jellyfish")
+        self.assertEqual(r.status_code, 200)
+        self.assertIsInstance(r.content, unicode)
+
+    def test_HTTP_200_OK_GET_ON_BIG5(self):
+        r = requests.get("http://google.com.hk/")
+        self.assertEqual(r.status_code, 200)
+        self.assertIsInstance(r.content, unicode)
+
+
+if __name__ == '__main__':
+    unittest.main()