def get_string_const(self, text, py_version=None):
# return a C string constant, creating a new one if necessary
if text.is_unicode:
- byte_string = text.escapeencode()
+ byte_string = text.utf8encode()
else:
byte_string = text.byteencode()
try:
# return a Python string constant, creating a new one if necessary
py3str_cstring = None
if is_str and unicode_value is not None \
- and unicode_value.escapeencode() != text.byteencode():
+ and unicode_value.utf8encode() != text.byteencode():
py3str_cstring = self.get_string_const(unicode_value, py_version=3)
c_string = self.get_string_const(text, py_version=2)
else:
_unicode, _str, _bytes = unicode, str, str
IS_PYTHON3 = False
-IS_PYTHON24 = sys.version_info[:2] < (2,5)
-
empty_bytes = _bytes()
empty_unicode = _unicode()
assert self.encoding is None
return self.encode("UTF-8")
- def escapeencode(self):
- assert self.encoding is None
- if IS_PYTHON24:
- # work around bug in Py24 encoder
- return self.replace(u'\\', u'\\\\').encode('unicode_escape')
- return self.encode('unicode_escape')
-
# Getter for the read-only ``is_unicode`` property: true exactly when this
# string carries no explicit encoding (``self.encoding is None``).
def is_unicode(self):
return self.encoding is None
# Rebind the name to a property so that callers read ``obj.is_unicode`` as a
# plain attribute instead of calling it.
is_unicode = property(is_unicode)
def utf8encode(self):
    """Refuse to UTF-8 encode this object.

    This method exists only to fail loudly: per its error message, the
    object it is called on is not a unicode string.

    Raises:
        AssertionError: always, with a message that includes ``repr(self)``.
    """
    # Raise explicitly instead of using ``assert False``: assert statements
    # are removed when Python runs with -O, which would turn this guard into
    # a silent ``return None``.
    raise AssertionError("this is not a unicode string: %r" % self)
- def escapeencode(self):
- assert False, "this is not a unicode string: %r" % self
-
def __str__(self):
"""Fake-decode the byte string to unicode to support %
formatting of unicode strings.
r'\v' : u'\v',
}.get
-_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
-
def _to_escape_sequence(s):
if s in '\n\r\t':
return repr(s)[1:-1]
# within a character sequence, oct passes much better than hex
return ''.join(['\\%03o' % ord(c) for c in s])
-def _build_specials_replacer():
+_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
+_c_special_replacements = [(orig.encode('ASCII'),
+ _to_escape_sequence(orig).encode('ASCII'))
+ for orig in _c_special ]
+
+def _build_specials_test():
subexps = []
- replacements = {}
for special in _c_special:
regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
subexps.append(regexp)
- replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
-
- sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
- def replace_specials(m):
- return replacements[m.group(1)]
- def replace(s):
- return sub(replace_specials, s)
- return replace
+ return re.compile('|'.join(subexps).encode('ASCII')).search
-_replace_specials = _build_specials_replacer()
+_has_specials = _build_specials_test()
def escape_char(c):
if IS_PYTHON3:
encoded as ISO-8859-1, will result in the correct byte sequence
being written.
"""
- s = _replace_specials(s)
+ if _has_specials(s):
+ for special, replacement in _c_special_replacements:
+ if special in s:
+ s = s.replace(special, replacement)
try:
return s.decode("ASCII") # trial decoding: plain ASCII => done
except UnicodeDecodeError:
while (t->p) {
#if PY_MAJOR_VERSION < 3
if (t->is_unicode) {
- *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
+ *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
} else if (t->intern) {
*t->p = PyString_InternFromString(t->s);
} else {
}
#else /* Python 3+ has unicode identifiers */
if (t->is_unicode | t->is_str) {
- if (unlikely(t->encoding)) {
+ if (t->intern) {
+ *t->p = PyUnicode_InternFromString(t->s);
+ } else if (t->encoding) {
*t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
} else {
- *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
- }
- if (t->intern && likely(*t->p)) {
- PyUnicode_InternInPlace(t->p);
+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
}
} else {
*t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
u'\x03g\xf8\uf8d2S\xf8k ik'
>>> f
u'\xf8'
- >>> g
- u'\udc00'
- >>> h
- u'\ud800'
>>> add
u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null
10
>>> len(f)
1
- >>> len(g)
- 1
- >>> len(h)
- 1
>>> len(add)
12
>>> len(null)
True
>>> f == u'\\xf8' # unescaped by Python
True
- >>> g == u'\\udc00' # unescaped by Python (required by doctest)
- True
- >>> h == u'\\ud800' # unescaped by Python (required by doctest)
- True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True
>>> add == u'Søk ik' + u'üÖä' + 'abc'
d = u'üÖä'
e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8'
-g = u'\udc00' # lone trail surrogate
-h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}'
add = u'Søk ik' + u'üÖä' + u'abc'