support surrogates in unicode string literals in Py3.3

author Stefan Behnel <stefan_ml@behnel.de>

Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)

committer Stefan Behnel <stefan_ml@behnel.de>

Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)
author Stefan Behnel <stefan_ml@behnel.de>
Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)
committer Stefan Behnel <stefan_ml@behnel.de>
Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py

index 5297f93..119373d 100755 (executable)
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1187,7 +1187,7 @@ class UnicodeNode(ConstNode):
          self.constant_result = self.value
  
      def as_sliced_node(self, start, stop, step=None):
-        if _string_contains_surrogates(self.value[:stop]):
+        if StringEncoding.string_contains_surrogates(self.value[:stop]):
              # this is unsafe as it may give different results in different runtimes
              return None
          value = StringEncoding.EncodedString(self.value[start:stop:step])
@@ -1236,11 +1236,30 @@ class UnicodeNode(ConstNode):
          return BoolNode(self.pos, value=bool_value, constant_result=bool_value)
  
      def contains_surrogates(self):
-        return _string_contains_surrogates(self.value)
+        return StringEncoding.string_contains_surrogates(self.value)
  
      def generate_evaluation_code(self, code):
          if self.type.is_pyobject:
-            self.result_code = code.get_py_string_const(self.value)
+            if self.contains_surrogates():
+                # surrogates are not really portable and cannot be
+                # decoded by the UTF-8 codec in Py3.3
+                self.result_code = code.get_py_const(py_object_type, 'ustring_')
+                data_cname = code.get_pyunicode_ptr_const(self.value)
+                code = code.get_cached_constants_writer()
+                code.mark_pos(self.pos)
+                code.putln(
+                    "%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
+                        self.result_code,
+                        data_cname,
+                        data_cname,
+                        code.error_goto_if_null(self.result_code, self.pos)))
+                code.putln("#if CYTHON_PEP393_ENABLED")
+                code.putln(
+                    code.error_goto_if_neg(
+                        "PyUnicode_READY(%s)" % self.result_code, self.pos))
+                code.putln("#endif")
+            else:
+                self.result_code = code.get_py_string_const(self.value)
          else:
              self.result_code = code.get_pyunicode_ptr_const(self.value)
  
@@ -1271,7 +1290,7 @@ class StringNode(PyConstNode):
          value = type(self.value)(self.value[start:stop:step])
          value.encoding = self.value.encoding
          if self.unicode_value is not None:
-            if _string_contains_surrogates(self.unicode_value[:stop]):
+            if StringEncoding.string_contains_surrogates(self.unicode_value[:stop]):
                  # this is unsafe as it may give different results in different runtimes
                  return None
              unicode_value = StringEncoding.EncodedString(
@@ -1316,26 +1335,6 @@ class IdentifierStringNode(StringNode):
      is_identifier = True
  
  
-def _string_contains_surrogates(ustring):
-    """
-    Check if the unicode string contains surrogate code points
-    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
-    Unicode, i.e. characters that would be spelled as two
-    separate code units on a narrow platform.
-    """
-    for c in map(ord, ustring):
-        if c > 65535: # can only happen on wide platforms
-            return True
-        # We only look for the first code unit (D800-DBFF) of a
-        # surrogate pair - if we find one, the other one
-        # (DC00-DFFF) is likely there, too.  If we don't find it,
-        # any second code unit cannot make for a surrogate pair by
-        # itself.
-        if 0xD800 <= c <= 0xDBFF:
-            return True
-    return False
-
-
  class ImagNode(AtomicExprNode):
      #  Imaginary number literal
      #
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py

index 1eb77b5..4d84afa 100644 (file)
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -126,9 +126,28 @@ class EncodedString(_unicode):
          assert self.encoding is None
          return self.encode("UTF-8")
  
+    @property
      def is_unicode(self):
          return self.encoding is None
-    is_unicode = property(is_unicode)
+
+    def contains_surrogates(self):
+        return string_contains_surrogates(self)
+
+
+def string_contains_surrogates(ustring):
+    """
+    Check if the unicode string contains surrogate code points
+    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+    Unicode, i.e. characters that would be spelled as two
+    separate code units on a narrow platform.
+    """
+    for c in map(ord, ustring):
+        if c > 65535:  # can only happen on wide platforms
+            return True
+        if 0xD800 <= c <= 0xDFFF:
+            return True
+    return False
+
  
  class BytesLiteral(_bytes):
      # bytes subclass that is compatible with EncodedString
@@ -155,6 +174,7 @@ class BytesLiteral(_bytes):
  
      is_unicode = False
  
+
  char_from_escape_sequence = {
      r'\a' : u'\a',
      r'\b' : u'\b',
diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx

index 0eb90e3..19da131 100644 (file)
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -17,6 +17,10 @@ __doc__ = br"""
      u'\x03g\xf8\uf8d2S\xf8k ik'
      >>> f
      u'\xf8'
+    >>> g
+    u'\udc00'
+    >>> h
+    u'\ud800'
      >>> add
      u'S\xf8k ik\xfc\xd6\xe4abc'
      >>> null
@@ -36,6 +40,10 @@ __doc__ = br"""
      10
      >>> len(f)
      1
+    >>> len(g)
+    1
+    >>> len(h)
+    1
      >>> len(add)
      12
      >>> len(null)
@@ -63,6 +71,10 @@ __doc__ = br"""
      True
      >>> f == u'\\xf8' # unescaped by Python
      True
+    >>> g == u'\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
+    True
      >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
      True
      >>> add == u'Søk ik' + u'üÖä' + 'abc'
@@ -95,6 +107,8 @@ c = u'Søk ik'
  d = u'üÖä'
  e = u'\x03\x67\xf8\uf8d2Søk ik'
  f = u'\xf8'
+g = u'\udc00'   # lone trail surrogate
+h = u'\ud800'   # lone lead surrogate
  k = u'\N{SNOWMAN}'
  
  add = u'Søk ik' + u'üÖä' + u'abc'
author	Stefan Behnel <stefan_ml@behnel.de>
	Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)
committer	Stefan Behnel <stefan_ml@behnel.de>
	Fri, 15 Mar 2013 19:14:19 +0000 (20:14 +0100)
Cython/Compiler/ExprNodes.py		patch \| blob \| history
Cython/Compiler/StringEncoding.py		patch \| blob \| history
tests/run/unicodeliterals.pyx		patch \| blob \| history