From: Stefan Behnel Date: Sun, 6 Jan 2013 19:19:41 +0000 (+0100) Subject: implement \N{...} Unicode escapes for literals X-Git-Tag: 0.18b1~19 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2a9d8d459f09b9dcc8d3c0322a2227a3fed70b45;p=platform%2Fupstream%2Fpython-cython.git implement \N{...} Unicode escapes for literals --- diff --git a/CHANGES.rst b/CHANGES.rst index 0e3b115..c5ff3ad 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,8 @@ Cython Changelog Features added -------------- +* Named Unicode escapes ("\N{...}") are supported. + * Python functions/classes provide the special attribute "__qualname__" as defined by PEP 3155. diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py index cb716b4..7195f13 100644 --- a/Cython/Compiler/Lexicon.py +++ b/Cython/Compiler/Lexicon.py @@ -66,6 +66,7 @@ def make_lexicon(): two_hex = hexdigit + hexdigit four_hex = two_hex + two_hex escapeseq = Str("\\") + (two_oct | three_oct | + Str('N{') + Rep(AnyBut('}')) + Str('}') | Str('u') + four_hex | Str('x') + two_hex | Str('U') + four_hex + four_hex | AnyChar) diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 91ea03d..0850ed4 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -8,6 +8,7 @@ import cython cython.declare(Nodes=object, ExprNodes=object, EncodedString=object) import re +import unicodedata from Cython.Compiler.Scanning import PyrexScanner, FileSourceDescriptor import Nodes @@ -803,23 +804,27 @@ def p_string_literal(s, kind_override=None): StringEncoding.char_from_escape_sequence(systr)) elif c == u'\n': pass - elif c == u'x': + elif c == u'x': # \xXX if len(systr) == 4: chars.append_charval( int(systr[2:], 16) ) else: s.error("Invalid hex escape '%s'" % systr) - elif c in u'Uu': - if kind in ('u', ''): - if len(systr) in (6,10): - chrval = int(systr[2:], 16) - if chrval > 1114111: # sys.maxunicode: - s.error("Invalid unicode escape '%s'" % systr) - else: + elif c in u'NUu' and kind in ('u', ''): # \uxxxx, \Uxxxxxxxx, \N{...} + chrval = -1 + if c == u'N': + try: + chrval = ord(unicodedata.lookup(systr[3:-1])) + except KeyError: + s.error("Unknown Unicode character name %r" % systr[3:-1]) + elif len(systr) in (6,10): + chrval = int(systr[2:], 16) + if chrval > 1114111: # sys.maxunicode: s.error("Invalid unicode escape '%s'" % systr) + chrval = -1 else: - # unicode escapes in byte strings are not unescaped - chrval = None - chars.append_uescape(chrval, systr) + s.error("Invalid unicode escape '%s'" % systr) + if chrval >= 0: + chars.append_uescape(chrval, systr) else: chars.append(u'\\' + systr[1:]) if is_python3_source and not has_non_ASCII_literal_characters \ diff --git a/tests/run/strliterals.pyx b/tests/run/strliterals.pyx index f50ca49..a2d68ab 100644 --- a/tests/run/strliterals.pyx +++ b/tests/run/strliterals.pyx @@ -132,9 +132,9 @@ __doc__ = ur""" >>> len(bytes_uescape) 28 - >>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 3 or - ... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 4 or - ... sys.version_info[0] < 3 and len(str_uescape) == 17 or + >>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 4 or + ... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 5 or + ... sys.version_info[0] < 3 and len(str_uescape) == 28 or ... len(str_uescape)) True >>> (sys.version_info[0] >= 3 and str_uescape[0] == 'c' or @@ -143,6 +143,10 @@ __doc__ = ur""" True >>> print(str_uescape[-1]) B + >>> (sys.version_info[0] >= 3 and ord(str_uescape[-2]) == 0x2603 or + ... sys.version_info[0] < 3 and str_uescape[-12:-1] == b'\\N{SNOWMAN}' or + ... sys.version_info[0] >= 3 and ord(str_uescape[-2]) or str_uescape[-12:-1]) + True >>> newlines == "Aaa\n" True @@ -185,7 +189,7 @@ bresc = br'\12\'\"\\' uresc = ur'\12\'\"\\' bytes_uescape = b'\u1234\U12345678\u\u1\u12\uX' -str_uescape = '\u0063\U00012345\x42' +str_uescape = '\u0063\U00012345\N{SNOWMAN}\x42' newlines = "Aaa\n" diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx index 7588d70..19da131 100644 --- a/tests/run/unicodeliterals.pyx +++ b/tests/run/unicodeliterals.pyx @@ -75,6 +75,8 @@ __doc__ = br""" True >>> h == u'\\ud800' # unescaped by Python (required by doctest) True + >>> k == u'\\N{SNOWMAN}' == u'\\u2603' + True >>> add == u'Søk ik' + u'üÖä' + 'abc' True >>> null == u'\\x00' # unescaped by Python (required by doctest) @@ -107,6 +109,7 @@ e = u'\x03\x67\xf8\uf8d2Søk ik' f = u'\xf8' g = u'\udc00' # lone trail surrogate h = u'\ud800' # lone lead surrogate +k = u'\N{SNOWMAN}' add = u'Søk ik' + u'üÖä' + u'abc' null = u'\x00' diff --git a/tests/run/unicodeliteralslatin1.pyx b/tests/run/unicodeliteralslatin1.pyx index 46f708a..f5b9e53 100644 --- a/tests/run/unicodeliteralslatin1.pyx +++ b/tests/run/unicodeliteralslatin1.pyx @@ -55,6 +55,8 @@ __doc__ = br""" True >>> f == u'\\xf8' # unescaped by Python True + >>> k == u'ä' == u'\\N{LATIN SMALL LETTER A WITH DIAERESIS}' + True >>> add == u'Søk ik' + u'üÖä' + 'abc' True >>> null == u'\\x00' # unescaped by Python (required by doctest) @@ -75,6 +77,7 @@ c = u'S d = u'üÖä' e = u'\x03\x67\xf8\uf8d2Søk ik' f = u'\xf8' +k = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}' add = u'Søk ik' + u'üÖä' + u'abc' null = u'\x00'