From e26ccb34eb1c0d3948bfd9e50ffe333605ae554d Mon Sep 17 00:00:00 2001 From: Martijn Pieters Date: Fri, 26 Oct 2012 12:15:27 +0200 Subject: [PATCH] Fix the smoke test in the face of UTF-16 surrogate pairs. If the random data starts with a UTF-16 BOM *and* the next two bytes are for a character in the `\ud800`-`\udfff` range, decoding would fail. Small chance, but still possible. Extend it to check the UTF-8 error as well. The goal is to test that the guesser was *mostly* correct, and to verify that, in the cases where it wasn't, the failure was to be expected. Most of all, it checks that the function doesn't buckle under wildly unexpected data. --- tests/test_utils.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c0560ec..5cd0684 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import codecs import os import sys import unittest @@ -47,12 +48,30 @@ class GuessJSONUTFTests(unittest.TestCase): sample = bytes().join( [byteschr(random.randrange(256)) for _ in range(4)]) res = guess(sample) - if res is not None and res != 'utf-8': + if res is not None: # This should decode without errors if this is *really* - # something in this encoding. Skip UTF-8, it is more - # picky about valid data. - sample.decode(res) - + # something in this encoding. However, UTF-8 is a lot + # more picky, so we expect errors there. 
UTF-16 surrogate + # pairs also fail + try: + sample.decode(res) + except UnicodeDecodeError as e: + self.assertEqual(e.args[0].replace('-', '').lower(), + res.replace('-', '').lower()) + if res == 'utf-8': + self.assertTrue(e.args[-1], ( + 'invalid continuation byte', + 'invalid start byte')) + continue + if res == 'utf-16': + self.assertEqual(e.args[-1], 'unexpected end of data') + self.assertTrue(sample[:2] in ( + codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) + # the second two bytes are in the range \ud800-\udfff + # if someone wants to add tests for that as well. I don't + # see the need; we are not testing UTF decoding here. + continue + raise if __name__ == '__main__': unittest.main() -- 2.34.1