From: Martijn Pieters
Date: Thu, 25 Oct 2012 15:43:52 +0000 (+0200)
Subject: Use a JSON-specific encoding detection when no encoding has been specified.
X-Git-Tag: v0.14.2~2^2~5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4decc7986e32bb8f3511df3dd0c9b1c1d57453c1;p=services%2Fpython-requests.git

Use a JSON-specific encoding detection when no encoding has been specified.

JSON *must* be encoded using UTF-8, UTF-16 or UTF-32 (see the [RFC][1]); detect
the encoding based on the fact that JSON always starts with 2 ASCII characters.

[1]: http://tools.ietf.org/html/rfc4627#section-3
---

diff --git a/requests/models.py b/requests/models.py
index c19d3cc..06c8a71 100644
--- a/requests/models.py
+++ b/requests/models.py
@@ -31,7 +31,8 @@ from .exceptions import (
 from .utils import (
     get_encoding_from_headers, stream_untransfer, guess_filename, requote_uri,
     stream_decode_response_unicode, get_netrc_auth, get_environ_proxies,
-    to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices)
+    to_key_val_list, DEFAULT_CA_BUNDLE_PATH, parse_header_links, iter_slices,
+    guess_json_utf)
 from .compat import (
     cookielib, urlparse, urlunparse, urljoin, urlsplit, urlencode, str, bytes,
     StringIO, is_py2, chardet, json, builtin_str)
@@ -842,6 +843,18 @@ class Response(object):
     @property
     def json(self):
         """Returns the json-encoded content of a response, if any."""
+
+        if not self.encoding and len(self.content) > 3:
+            # No encoding set. JSON RFC 4627 section 3 states we should expect
+            # UTF-8, -16 or -32. Detect which one to use; If the detection or
+            # decoding fails, fall back to `self.text` (using chardet to make
+            # a best guess).
+            encoding = guess_json_utf(self.content)
+            if encoding is not None:
+                try:
+                    return json.loads(self.content.decode(encoding))
+                except (ValueError, UnicodeDecodeError):
+                    pass
         try:
             return json.loads(self.text or self.content)
         except ValueError:
diff --git a/requests/utils.py b/requests/utils.py
index 7e9f631..ec9f4d2 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -579,3 +579,47 @@ def parse_header_links(value):
         links.append(link)
 
     return links
+
+
+# Null bytes; no need to recreate these on each call to guess_json_utf
+_null = '\x00'.encode('ascii')  # encoding to ASCII for Python 3
+_null2 = _null * 2
+_null3 = _null * 3
+
+
+def guess_json_utf(data):
+    """Guess the UTF codec of a JSON byte string from its first four bytes.
+
+    :param data: raw JSON document as a byte string.
+    :returns: a codec name usable with ``data.decode()``, or ``None`` when
+        the leading bytes match neither a BOM nor a UTF-8/16/32 null pattern.
+    """
+    # JSON always starts with two ASCII characters, so detection is as
+    # easy as counting the nulls and from their location and count
+    # determine the encoding. Also detect a BOM, if present.
+    sample = data[:4]
+    # The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32-LE
+    # BOM starts with the UTF-16-LE BOM. Note that codecs.BOM32_BE is an
+    # alias for the 2-byte UTF-16 BOM, so BOM_UTF32_BE is required here.
+    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+        return 'utf-32'  # BOM included
+    if sample[:3] == codecs.BOM_UTF8:
+        return 'utf-8-sig'  # BOM included, MS style (discouraged)
+    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+        return 'utf-16'  # BOM included
+    nullcount = sample.count(_null)
+    if nullcount == 0:
+        return 'utf-8'
+    if nullcount == 2:
+        if sample[::2] == _null2:  # 1st and 3rd are null
+            return 'utf-16-be'
+        if sample[1::2] == _null2:  # 2nd and 4th are null
+            return 'utf-16-le'
+        # Did not detect 2 valid UTF-16 ascii-range characters
+    if nullcount == 3:
+        if sample[:3] == _null3:
+            return 'utf-32-be'
+        if sample[1:] == _null3:
+            return 'utf-32-le'
+        # Did not detect a valid UTF-32 ascii-range character
+    return None
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..27fa18e
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import unittest
+import random
+
+# Path hack.
+sys.path.insert(0, os.path.abspath('..'))
+import requests.utils
+from requests.compat import is_py3, bytes
+
+
+class GuessJSONUTFTests(unittest.TestCase):
+    """Tests for requests.utils.guess_json_utf encoding detection."""
+
+    codecs = (
+        'utf-8', 'utf-8-sig',
+        'utf-16', 'utf-16-le', 'utf-16-be',
+        'utf-32', 'utf-32-le', 'utf-32-be'
+    )
+
+    def test_guess_encoding(self):
+        # Throw 4-character ASCII strings (encoded to a UTF encoding)
+        # at the guess routine; it should correctly guess all codecs.
+        to_text = chr if is_py3 else unichr  # Py2 builtin; unused on Py3
+        guess = requests.utils.guess_json_utf
+        for c in range(33, 127):  # printable only
+            sample = to_text(c) * 4
+            for codec in self.codecs:
+                res = guess(sample.encode(codec))
+                self.assertEqual(res, codec)
+
+    def test_smoke_encoding(self):
+        # Throw random 4-byte strings at the guess function.
+        # Any guess for a UTF encoding is verified, a decode exception
+        # is a test failure.
+        to_byte = (lambda c: bytes([c])) if is_py3 else chr  # int -> 1 byte
+        guess = requests.utils.guess_json_utf
+        for i in range(1000):
+            sample = bytes().join(
+                [to_byte(random.randrange(256)) for _ in range(4)])
+            res = guess(sample)
+            if res is not None and res != 'utf-8':
+                # This should decode without errors if this is *really*
+                # something in this encoding. Skip UTF-8, it is more
+                # picky about valid data.
+                sample.decode(res)
+
+
+if __name__ == '__main__':
+    unittest.main()