From f989876bd4e3df666f53941cf355cc20cd96d5fc Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 10 Jan 2013 22:09:37 +0100 Subject: [PATCH] undo Py3.3 surrogates support fixes - breaks too many special cases with strings --- CHANGES.rst | 7 ------- Cython/Compiler/Code.py | 4 ++-- Cython/Compiler/StringEncoding.py | 38 ++++++++++++-------------------------- Cython/Utility/StringTools.c | 11 +++++------ tests/run/unicodeliterals.pyx | 14 -------------- 5 files changed, 19 insertions(+), 55 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c5ff3ad..9c79b80 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -26,13 +26,6 @@ Features added Bugs fixed ---------- -* Surrogate code points in Unicode string literals failed to compile and/or - load in CPython 3.3. To work around this change introduced by CPython, - Cython switched from UTF-8 to Python Unicode escapes ('\u0101') internally - for storing literal Unicode strings in C code. This may add a slight - initialisation overhead if a large number of non-Latin1 characters are - used in the code. - Other changes ------------- diff --git a/Cython/Compiler/Code.py b/Cython/Compiler/Code.py index 9399fcb..c348b16 100644 --- a/Cython/Compiler/Code.py +++ b/Cython/Compiler/Code.py @@ -991,7 +991,7 @@ class GlobalState(object): def get_string_const(self, text, py_version=None): # return a C string constant, creating a new one if necessary if text.is_unicode: - byte_string = text.escapeencode() + byte_string = text.utf8encode() else: byte_string = text.byteencode() try: @@ -1006,7 +1006,7 @@ class GlobalState(object): # return a Python string constant, creating a new one if necessary py3str_cstring = None if is_str and unicode_value is not None \ - and unicode_value.escapeencode() != text.byteencode(): + and unicode_value.utf8encode() != text.byteencode(): py3str_cstring = self.get_string_const(unicode_value, py_version=3) c_string = self.get_string_const(text, py_version=2) else: diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py index 8fc37fc..1ca490e 100644 --- a/Cython/Compiler/StringEncoding.py +++ b/Cython/Compiler/StringEncoding.py @@ -12,8 +12,6 @@ else: _unicode, _str, _bytes = unicode, str, str IS_PYTHON3 = False -IS_PYTHON24 = sys.version_info[:2] < (2,5) - empty_bytes = _bytes() empty_unicode = _unicode() @@ -128,13 +126,6 @@ class EncodedString(_unicode): assert self.encoding is None return self.encode("UTF-8") - def escapeencode(self): - assert self.encoding is None - if IS_PYTHON24: - # work around bug in Py24 encoder - return self.replace(u'\\', u'\\\\').encode('unicode_escape') - return self.encode('unicode_escape') - def is_unicode(self): return self.encoding is None is_unicode = property(is_unicode) @@ -156,9 +147,6 @@ class BytesLiteral(_bytes): def utf8encode(self): assert False, "this is not a unicode string: %r" % self - def escapeencode(self): - assert False, "this is not a unicode string: %r" % self - def __str__(self): """Fake-decode the byte string to unicode to support % formatting of unicode strings. @@ -177,8 +165,6 @@ char_from_escape_sequence = { r'\v' : u'\v', }.get -_c_special = ('\\', '??', '"') + tuple(map(chr, range(32))) - def _to_escape_sequence(s): if s in '\n\r\t': return repr(s)[1:-1] @@ -190,22 +176,19 @@ def _to_escape_sequence(s): # within a character sequence, oct passes much better than hex return ''.join(['\\%03o' % ord(c) for c in s]) -def _build_specials_replacer(): +_c_special = ('\\', '??', '"') + tuple(map(chr, range(32))) +_c_special_replacements = [(orig.encode('ASCII'), + _to_escape_sequence(orig).encode('ASCII')) + for orig in _c_special ] + +def _build_specials_test(): subexps = [] - replacements = {} for special in _c_special: regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special]) subexps.append(regexp) - replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII') - - sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub - def replace_specials(m): - return replacements[m.group(1)] - def replace(s): - return sub(replace_specials, s) - return replace + return re.compile('|'.join(subexps).encode('ASCII')).search -_replace_specials = _build_specials_replacer() +_has_specials = _build_specials_test() def escape_char(c): if IS_PYTHON3: @@ -227,7 +210,10 @@ def escape_byte_string(s): encoded as ISO-8859-1, will result in the correct byte sequence being written. """ - s = _replace_specials(s) + if _has_specials(s): + for special, replacement in _c_special_replacements: + if special in s: + s = s.replace(special, replacement) try: return s.decode("ASCII") # trial decoding: plain ASCII => done except UnicodeDecodeError: diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c index 11464c2..53dfc77 100644 --- a/Cython/Utility/StringTools.c +++ b/Cython/Utility/StringTools.c @@ -17,7 +17,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { while (t->p) { #if PY_MAJOR_VERSION < 3 if (t->is_unicode) { - *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL); + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); } else if (t->intern) { *t->p = PyString_InternFromString(t->s); } else { @@ -25,13 +25,12 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { } #else /* Python 3+ has unicode identifiers */ if (t->is_unicode | t->is_str) { - if (unlikely(t->encoding)) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); } else { - *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL); - } - if (t->intern && likely(*t->p)) { - PyUnicode_InternInPlace(t->p); + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); } } else { *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx index 19da131..0eb90e3 100644 --- a/tests/run/unicodeliterals.pyx +++ b/tests/run/unicodeliterals.pyx @@ -17,10 +17,6 @@ __doc__ = br""" u'\x03g\xf8\uf8d2S\xf8k ik' >>> f u'\xf8' - >>> g - u'\udc00' - >>> h - u'\ud800' >>> add u'S\xf8k ik\xfc\xd6\xe4abc' >>> null @@ -40,10 +36,6 @@ __doc__ = br""" 10 >>> len(f) 1 - >>> len(g) - 1 - >>> len(h) - 1 >>> len(add) 12 >>> len(null) @@ -71,10 +63,6 @@ __doc__ = br""" True >>> f == u'\\xf8' # unescaped by Python True - >>> g == u'\\udc00' # unescaped by Python (required by doctest) - True - >>> h == u'\\ud800' # unescaped by Python (required by doctest) - True >>> k == u'\\N{SNOWMAN}' == u'\\u2603' True >>> add == u'Søk ik' + u'üÖä' + 'abc' @@ -107,8 +95,6 @@ c = u'Søk ik' d = u'üÖä' e = u'\x03\x67\xf8\uf8d2Søk ik' f = u'\xf8' -g = u'\udc00' # lone trail surrogate -h = u'\ud800' # lone lead surrogate k = u'\N{SNOWMAN}' add = u'Søk ik' + u'üÖä' + u'abc' -- 2.7.4