From e26ccb34eb1c0d3948bfd9e50ffe333605ae554d Mon Sep 17 00:00:00 2001
From: Martijn Pieters <mj@zopatista.com>
Date: Fri, 26 Oct 2012 12:15:27 +0200
Subject: [PATCH] Fix the smoke test in the face of UTF-16 surrogate pairs.

If the random data starts with a UTF-16 BOM *and* the next two bytes encode a code unit in the `\ud800`-`\udfff` (surrogate) range, decoding would fail. The chance is small, but it is still possible.

Extend the smoke test to check the UTF-8 errors as well. The goal is to test that the guesser was *mostly* correct and, for the cases where it wasn't, to verify that the failure was to be expected. Most of all, the test checks that the function doesn't buckle under wildly unexpected data.
---
 tests/test_utils.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index c0560ec..5cd0684 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import codecs
 import os
 import sys
 import unittest
@@ -47,12 +48,30 @@ class GuessJSONUTFTests(unittest.TestCase):
             sample = bytes().join(
                 [byteschr(random.randrange(256)) for _ in range(4)])
             res = guess(sample)
-            if res is not None and res != 'utf-8':
+            if res is not None:
                 # This should decode without errors if this is *really*
-                # something in this encoding. Skip UTF-8, it is more
-                # picky about valid data.
-                sample.decode(res)
-
+                # something in this encoding. However, UTF-8 is a lot
+                # more picky, so we expect errors there. UTF-16 surrogate
+                # pairs also fail
+                try:
+                    sample.decode(res)
+                except UnicodeDecodeError as e:
+                    self.assertEqual(e.args[0].replace('-', '').lower(),
+                                     res.replace('-', '').lower())
+                    if res == 'utf-8':
+                        self.assertTrue(e.args[-1], (
+                            'invalid continuation byte',
+                            'invalid start byte'))
+                        continue
+                    if res == 'utf-16':
+                        self.assertEqual(e.args[-1], 'unexpected end of data')
+                        self.assertTrue(sample[:2] in (
+                            codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))
+                        # the second two bytes are in the range \ud800-\udfff
+                        # if someone wants to add tests for that as well. I don't
+                        # see the need; we are not testing UTF decoding here.
+                        continue
+                    raise
 
 if __name__ == '__main__':
     unittest.main()
-- 
2.34.1