undo Py3.3 surrogates support fixes - breaks too many special cases with strings

author Stefan Behnel <stefan_ml@behnel.de>

Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)

committer Stefan Behnel <stefan_ml@behnel.de>

Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)
author Stefan Behnel <stefan_ml@behnel.de>
Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)
committer Stefan Behnel <stefan_ml@behnel.de>
Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)
diff --git a/CHANGES.rst b/CHANGES.rst

index c5ff3ad..9c79b80 100644 (file)
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -26,13 +26,6 @@ Features added
  Bugs fixed
  ----------
  
-* Surrogate code points in Unicode string literals failed to compile and/or
-  load in CPython 3.3.  To work around this change introduced by CPython,
-  Cython switched from UTF-8 to Python Unicode escapes ('\u0101') internally
-  for storing literal Unicode strings in C code.  This may add a slight
-  initialisation overhead if a large number of non-Latin1 characters are
-  used in the code.
-
  Other changes
  -------------
  
diff --git a/Cython/Compiler/Code.py b/Cython/Compiler/Code.py

index 9399fcb..c348b16 100644 (file)
--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -991,7 +991,7 @@ class GlobalState(object):
      def get_string_const(self, text, py_version=None):
          # return a C string constant, creating a new one if necessary
          if text.is_unicode:
-            byte_string = text.escapeencode()
+            byte_string = text.utf8encode()
          else:
              byte_string = text.byteencode()
          try:
@@ -1006,7 +1006,7 @@ class GlobalState(object):
          # return a Python string constant, creating a new one if necessary
          py3str_cstring = None
          if is_str and unicode_value is not None \
-               and unicode_value.escapeencode() != text.byteencode():
+               and unicode_value.utf8encode() != text.byteencode():
              py3str_cstring = self.get_string_const(unicode_value, py_version=3)
              c_string = self.get_string_const(text, py_version=2)
          else:
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py

index 8fc37fc..1ca490e 100644 (file)
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -12,8 +12,6 @@ else:
      _unicode, _str, _bytes = unicode, str, str
      IS_PYTHON3 = False
  
-IS_PYTHON24 = sys.version_info[:2] < (2,5)
-
  empty_bytes = _bytes()
  empty_unicode = _unicode()
  
@@ -128,13 +126,6 @@ class EncodedString(_unicode):
          assert self.encoding is None
          return self.encode("UTF-8")
  
-    def escapeencode(self):
-        assert self.encoding is None
-        if IS_PYTHON24:
-            # work around bug in Py24 encoder
-            return self.replace(u'\\', u'\\\\').encode('unicode_escape')
-        return self.encode('unicode_escape')
-
      def is_unicode(self):
          return self.encoding is None
      is_unicode = property(is_unicode)
@@ -156,9 +147,6 @@ class BytesLiteral(_bytes):
      def utf8encode(self):
          assert False, "this is not a unicode string: %r" % self
  
-    def escapeencode(self):
-        assert False, "this is not a unicode string: %r" % self
-
      def __str__(self):
          """Fake-decode the byte string to unicode to support %
          formatting of unicode strings.
@@ -177,8 +165,6 @@ char_from_escape_sequence = {
      r'\v' : u'\v',
      }.get
  
-_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
-
  def _to_escape_sequence(s):
      if s in '\n\r\t':
          return repr(s)[1:-1]
@@ -190,22 +176,19 @@ def _to_escape_sequence(s):
          # within a character sequence, oct passes much better than hex
          return ''.join(['\\%03o' % ord(c) for c in s])
  
-def _build_specials_replacer():
+_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
+_c_special_replacements = [(orig.encode('ASCII'),
+                            _to_escape_sequence(orig).encode('ASCII'))
+                           for orig in _c_special ]
+
+def _build_specials_test():
      subexps = []
-    replacements = {}
      for special in _c_special:
          regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
          subexps.append(regexp)
-        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
-
-    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
-    def replace_specials(m):
-        return replacements[m.group(1)]
-    def replace(s):
-        return sub(replace_specials, s)
-    return replace
+    return re.compile('|'.join(subexps).encode('ASCII')).search
  
-_replace_specials = _build_specials_replacer()
+_has_specials = _build_specials_test()
  
  def escape_char(c):
      if IS_PYTHON3:
@@ -227,7 +210,10 @@ def escape_byte_string(s):
      encoded as ISO-8859-1, will result in the correct byte sequence
      being written.
      """
-    s = _replace_specials(s)
+    if _has_specials(s):
+        for special, replacement in _c_special_replacements:
+            if special in s:
+                s = s.replace(special, replacement)
      try:
          return s.decode("ASCII") # trial decoding: plain ASCII => done
      except UnicodeDecodeError:
diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c

index 11464c2..53dfc77 100644 (file)
--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -17,7 +17,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
      while (t->p) {
          #if PY_MAJOR_VERSION < 3
          if (t->is_unicode) {
-            *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
          } else if (t->intern) {
              *t->p = PyString_InternFromString(t->s);
          } else {
@@ -25,13 +25,12 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
          }
          #else  /* Python 3+ has unicode identifiers */
          if (t->is_unicode | t->is_str) {
-            if (unlikely(t->encoding)) {
+            if (t->intern) {
+                *t->p = PyUnicode_InternFromString(t->s);
+            } else if (t->encoding) {
                  *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
              } else {
-                *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
-            }
-            if (t->intern && likely(*t->p)) {
-                PyUnicode_InternInPlace(t->p);
+                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
              }
          } else {
              *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx

index 19da131..0eb90e3 100644 (file)
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -17,10 +17,6 @@ __doc__ = br"""
      u'\x03g\xf8\uf8d2S\xf8k ik'
      >>> f
      u'\xf8'
-    >>> g
-    u'\udc00'
-    >>> h
-    u'\ud800'
      >>> add
      u'S\xf8k ik\xfc\xd6\xe4abc'
      >>> null
@@ -40,10 +36,6 @@ __doc__ = br"""
      10
      >>> len(f)
      1
-    >>> len(g)
-    1
-    >>> len(h)
-    1
      >>> len(add)
      12
      >>> len(null)
@@ -71,10 +63,6 @@ __doc__ = br"""
      True
      >>> f == u'\\xf8' # unescaped by Python
      True
-    >>> g == u'\\udc00' # unescaped by Python (required by doctest)
-    True
-    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
-    True
      >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
      True
      >>> add == u'Søk ik' + u'üÖä' + 'abc'
@@ -107,8 +95,6 @@ c = u'Søk ik'
  d = u'üÖä'
  e = u'\x03\x67\xf8\uf8d2Søk ik'
  f = u'\xf8'
-g = u'\udc00'   # lone trail surrogate
-h = u'\ud800'   # lone lead surrogate
  k = u'\N{SNOWMAN}'
  
  add = u'Søk ik' + u'üÖä' + u'abc'
author	Stefan Behnel <stefan_ml@behnel.de>
	Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)
committer	Stefan Behnel <stefan_ml@behnel.de>
	Thu, 10 Jan 2013 21:09:37 +0000 (22:09 +0100)
CHANGES.rst		patch | blob | history
Cython/Compiler/Code.py		patch | blob | history
Cython/Compiler/StringEncoding.py		patch | blob | history
Cython/Utility/StringTools.c		patch | blob | history
tests/run/unicodeliterals.pyx		patch | blob | history