From: Martijn Pieters Date: Fri, 26 Oct 2012 10:15:27 +0000 (+0200) Subject: Fix the smoke test in the face of UTF-16 surrogate pairs. X-Git-Tag: v0.14.2~2^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e26ccb34eb1c0d3948bfd9e50ffe333605ae554d;p=services%2Fpython-requests.git Fix the smoke test in the face of UTF-16 surrogate pairs. If the random data starts with a UTF-16 BOM *and* the next two bytes encode a code unit in the `\ud800`-`\udfff` (surrogate) range, decoding fails. The chance is small, but it is still possible. Extend the test to check the UTF-8 error as well. The goal is to test that the guesser is *mostly* correct and, for the cases where it is not, to verify that the failure is an expected one. Most of all, the test checks that the function doesn't buckle under wildly unexpected data. --- diff --git a/tests/test_utils.py b/tests/test_utils.py index c0560ec..5cd0684 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import codecs import os import sys import unittest @@ -47,12 +48,30 @@ class GuessJSONUTFTests(unittest.TestCase): sample = bytes().join( [byteschr(random.randrange(256)) for _ in range(4)]) res = guess(sample) - if res is not None and res != 'utf-8': + if res is not None: # This should decode without errors if this is *really* - # something in this encoding. Skip UTF-8, it is more - # picky about valid data. - sample.decode(res) - + # something in this encoding. However, UTF-8 is a lot + # more picky, so we expect errors there. 
UTF-16 surrogate + # pairs also fail + try: + sample.decode(res) + except UnicodeDecodeError as e: + self.assertEqual(e.args[0].replace('-', '').lower(), + res.replace('-', '').lower()) + if res == 'utf-8': + self.assertTrue(e.args[-1], ( + 'invalid continuation byte', + 'invalid start byte')) + continue + if res == 'utf-16': + self.assertEqual(e.args[-1], 'unexpected end of data') + self.assertTrue(sample[:2] in ( + codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) + # the second two bytes are in the range \ud800-\udfff + # if someone wants to add tests for that as well. I don't + # see the need; we are not testing UTF decoding here. + continue + raise if __name__ == '__main__': unittest.main()