optimise bytearray.decode()
author Stefan Behnel <stefan_ml@behnel.de>
Sun, 3 Nov 2013 12:59:20 +0000 (13:59 +0100)
committer Stefan Behnel <stefan_ml@behnel.de>
Sun, 3 Nov 2013 12:59:20 +0000 (13:59 +0100)
--HG--
rename : tests/run/bytesmethods.pyx => tests/run/bytearraymethods.pyx

Cython/Compiler/Optimize.py
Cython/Utility/StringTools.c
runtests.py
tests/run/bytearraymethods.pyx [new file with mode: 0644]

index 1a2ca8a..1d8f3e6 100644 (file)
@@ -2788,11 +2788,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
             string_node = string_node.arg
 
         string_type = string_node.type
-        if string_type is Builtin.bytes_type:
+        if string_type in (Builtin.bytes_type, Builtin.bytearray_type):
             if is_unbound_method:
                 string_node = string_node.as_none_safe_node(
                     "descriptor '%s' requires a '%s' object but received a 'NoneType'",
-                    format_args = ['decode', 'bytes'])
+                    format_args = ['decode', string_type.name])
             else:
                 string_node = string_node.as_none_safe_node(
                     "'NoneType' object has no attribute '%s'",
@@ -2862,12 +2862,15 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
             helper_func_type = self._decode_cpp_string_func_type
             utility_code_name = 'decode_cpp_string'
         else:
-            # Python bytes object
+            # Python bytes/bytearray object
             if not stop:
                 stop = ExprNodes.IntNode(node.pos, value='PY_SSIZE_T_MAX',
                                          constant_result=ExprNodes.not_a_constant)
             helper_func_type = self._decode_bytes_func_type
-            utility_code_name = 'decode_bytes'
+            if string_type is Builtin.bytes_type:
+                utility_code_name = 'decode_bytes'
+            else:
+                utility_code_name = 'decode_bytearray'
 
         node = ExprNodes.PythonCapiCallNode(
             node.pos, '__Pyx_%s' % utility_code_name, helper_func_type,
@@ -2880,6 +2883,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
             node = UtilNodes.EvalWithTempExprNode(temp, node)
         return node
 
+    _handle_simple_method_bytearray_decode = _handle_simple_method_bytes_decode
+
     def _find_special_codec_name(self, encoding):
         try:
             requested_codec = codecs.getencoder(encoding)
@@ -2957,6 +2962,18 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
             node, function, args, is_unbound_method, 'bytes', 'startswith',
             bytes_tailmatch_utility_code, -1)
 
+    '''   # disabled for now, enable when we consider it worth it (see StringTools.c)
+    def _handle_simple_method_bytearray_endswith(self, node, function, args, is_unbound_method):
+        return self._inject_tailmatch(
+            node, function, args, is_unbound_method, 'bytearray', 'endswith',
+            bytes_tailmatch_utility_code, +1)
+
+    def _handle_simple_method_bytearray_startswith(self, node, function, args, is_unbound_method):
+        return self._inject_tailmatch(
+            node, function, args, is_unbound_method, 'bytearray', 'startswith',
+            bytes_tailmatch_utility_code, -1)
+    '''
+
     ### helpers
 
     def _substitute_method_call(self, node, function, name, func_type,
index 1a3a043..1bae48f 100644 (file)
@@ -279,41 +279,15 @@ static CYTHON_INLINE Py_UCS4 __Pyx_GetItemInt_Unicode_Generic(PyObject* ustring,
 }
 
 /////////////// decode_cpp_string.proto ///////////////
-//@requires IncludeCppStringH
-
-static CYTHON_INLINE PyObject* __Pyx_decode_cpp_string(
-         std::string cppstring, Py_ssize_t start, Py_ssize_t stop,
-         const char* encoding, const char* errors,
-         PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors));
-
-/////////////// decode_cpp_string ///////////////
+//@requires: IncludeCppStringH
+//@requires: decode_c_bytes
 
 static CYTHON_INLINE PyObject* __Pyx_decode_cpp_string(
          std::string cppstring, Py_ssize_t start, Py_ssize_t stop,
          const char* encoding, const char* errors,
          PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) {
-    const char* cstring = cppstring.data();
-    Py_ssize_t length = cppstring.size();
-
-    if (unlikely(start < 0)) {
-        start += length;
-        if (unlikely(start < 0))
-            start = 0;
-    }
-    if (unlikely(stop < 0))
-        stop += length;
-    else if (stop >= length)
-        stop = length;
-    if (unlikely(start >= stop))
-        return PyUnicode_FromUnicode(NULL, 0);
-    cstring += start;
-    length = stop - start;
-
-    if (decode_func) {
-        return decode_func(cstring, length, errors);
-    } else {
-        return PyUnicode_Decode(cstring, length, encoding, errors);
-    }
+    return __Pyx_decode_c_bytes(
+        cppstring.data(), cppstring.size(), start, stop, encoding, errors, decode_func);
 }
 
 /////////////// decode_c_string.proto ///////////////
@@ -324,7 +298,9 @@ static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
          PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors));
 
 /////////////// decode_c_string ///////////////
-//@requires IncludeStringH
+//@requires: IncludeStringH
+
+/* duplicate code to avoid calling strlen() if start >= 0 and stop >= 0 */
 
 static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
          const char* cstring, Py_ssize_t start, Py_ssize_t stop,
@@ -352,21 +328,19 @@ static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
     }
 }
 
-/////////////// decode_bytes.proto ///////////////
+/////////////// decode_c_bytes.proto ///////////////
 
-static CYTHON_INLINE PyObject* __Pyx_decode_bytes(
-         PyObject* string, Py_ssize_t start, Py_ssize_t stop,
+static CYTHON_INLINE PyObject* __Pyx_decode_c_bytes(
+         const char* cstring, Py_ssize_t length, Py_ssize_t start, Py_ssize_t stop,
          const char* encoding, const char* errors,
          PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors));
 
-/////////////// decode_bytes ///////////////
+/////////////// decode_c_bytes ///////////////
 
-static CYTHON_INLINE PyObject* __Pyx_decode_bytes(
-         PyObject* string, Py_ssize_t start, Py_ssize_t stop,
+static CYTHON_INLINE PyObject* __Pyx_decode_c_bytes(
+         const char* cstring, Py_ssize_t length, Py_ssize_t start, Py_ssize_t stop,
          const char* encoding, const char* errors,
          PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) {
-    char* cstring;
-    Py_ssize_t length = PyBytes_GET_SIZE(string);
     if (unlikely((start < 0) | (stop < 0))) {
         if (start < 0) {
             start += length;
@@ -381,7 +355,7 @@ static CYTHON_INLINE PyObject* __Pyx_decode_bytes(
     length = stop - start;
     if (unlikely(length <= 0))
         return PyUnicode_FromUnicode(NULL, 0);
-    cstring = PyBytes_AS_STRING(string) + start;
+    cstring += start;
     if (decode_func) {
         return decode_func(cstring, length, errors);
     } else {
@@ -389,6 +363,30 @@ static CYTHON_INLINE PyObject* __Pyx_decode_bytes(
     }
 }
 
+/////////////// decode_bytes.proto ///////////////
+//@requires: decode_c_bytes
+
+static CYTHON_INLINE PyObject* __Pyx_decode_bytes(
+         PyObject* string, Py_ssize_t start, Py_ssize_t stop,
+         const char* encoding, const char* errors,
+         PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) {
+    return __Pyx_decode_c_bytes(
+        PyBytes_AS_STRING(string), PyBytes_GET_SIZE(string),
+        start, stop, encoding, errors, decode_func);
+}
+
+/////////////// decode_bytearray.proto ///////////////
+//@requires: decode_c_bytes
+
+static CYTHON_INLINE PyObject* __Pyx_decode_bytearray(
+         PyObject* string, Py_ssize_t start, Py_ssize_t stop,
+         const char* encoding, const char* errors,
+         PyObject* (*decode_func)(const char *s, Py_ssize_t size, const char *errors)) {
+    return __Pyx_decode_c_bytes(
+        PyByteArray_AS_STRING(string), PyByteArray_GET_SIZE(string),
+        start, stop, encoding, errors, decode_func);
+}
+
 /////////////// PyUnicode_Substring.proto ///////////////
 
 static CYTHON_INLINE PyObject* __Pyx_PyUnicode_Substring(
index c189447..58f6c35 100755 (executable)
@@ -235,7 +235,7 @@ VER_DEP_MODULES = {
                                           'run.pure_py', # decorators, with statement
                                           'run.purecdef',
                                           'run.struct_conversion',
-                                          'run.bytearray_coercion',
+                                          'run.bytearray',
                                           # memory views require buffer protocol
                                           'memoryview.relaxed_strides',
                                           'memoryview.cythonarray',
diff --git a/tests/run/bytearraymethods.pyx b/tests/run/bytearraymethods.pyx
new file mode 100644 (file)
index 0000000..c18c03d
--- /dev/null
@@ -0,0 +1,195 @@
+cimport cython
+
+b_a = bytearray(b'a')
+b_b = bytearray(b'b')
+
+
+'''   # disabled for now, enable when we consider it worth the code overhead
+
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+@cython.test_fail_if_path_exists(
+    "//SimpleCallNode")
+def bytearray_startswith(bytearray s, sub, start=None, stop=None):
+    """
+    >>> bytearray_startswith(b_a, b_a)
+    True
+    >>> bytearray_startswith(b_a+b_b, b_a)
+    True
+    >>> bytearray_startswith(b_a, b_b)
+    False
+    >>> bytearray_startswith(b_a+b_b, b_b)
+    False
+    >>> bytearray_startswith(b_a, (b_a, b_b))
+    True
+    >>> bytearray_startswith(b_a, b_a, 1)
+    False
+    >>> bytearray_startswith(b_a, b_a, 0, 0)
+    False
+    """
+
+    if start is None:
+      return s.startswith(sub)
+    elif stop is None:
+      return s.startswith(sub, start)
+    else:
+      return s.startswith(sub, start, stop)
+
+
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+@cython.test_fail_if_path_exists(
+    "//SimpleCallNode")
+def bytearray_endswith(bytearray s, sub, start=None, stop=None):
+    """
+    >>> bytearray_endswith(b_a, b_a)
+    True
+    >>> bytearray_endswith(b_b+b_a, b_a)
+    True
+    >>> bytearray_endswith(b_a, b_b)
+    False
+    >>> bytearray_endswith(b_b+b_a, b_b)
+    False
+    >>> bytearray_endswith(b_a, (b_a, b_b))
+    True
+    >>> bytearray_endswith(b_a, b_a, 1)
+    False
+    >>> bytearray_endswith(b_a, b_a, 0, 0)
+    False
+    """
+
+    if start is None:
+      return s.endswith(sub)
+    elif stop is None:
+      return s.endswith(sub, start)
+    else:
+      return s.endswith(sub, start, stop)
+'''
+
+
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+@cython.test_fail_if_path_exists(
+    "//SimpleCallNode")
+def bytearray_decode(bytearray s, start=None, stop=None):
+    """
+    >>> s = b_a+b_b+b_a+b_a+b_b
+    >>> print(bytearray_decode(s))
+    abaab
+
+    >>> print(bytearray_decode(s, 2))
+    aab
+    >>> print(bytearray_decode(s, -3))
+    aab
+
+    >>> print(bytearray_decode(s, None, 4))
+    abaa
+    >>> print(bytearray_decode(s, None, 400))
+    abaab
+    >>> print(bytearray_decode(s, None, -2))
+    aba
+    >>> print(bytearray_decode(s, None, -4))
+    a
+    >>> print(bytearray_decode(s, None, -5))
+    <BLANKLINE>
+    >>> print(bytearray_decode(s, None, -200))
+    <BLANKLINE>
+
+    >>> print(bytearray_decode(s, 2, 5))
+    aab
+    >>> print(bytearray_decode(s, 2, 500))
+    aab
+    >>> print(bytearray_decode(s, 2, -1))
+    aa
+    >>> print(bytearray_decode(s, 2, -3))
+    <BLANKLINE>
+    >>> print(bytearray_decode(s, 2, -300))
+    <BLANKLINE>
+    >>> print(bytearray_decode(s, -3, -1))
+    aa
+    >>> print(bytearray_decode(s, -300, 300))
+    abaab
+    >>> print(bytearray_decode(s, -300, -4))
+    a
+    >>> print(bytearray_decode(s, -300, -5))
+    <BLANKLINE>
+    >>> print(bytearray_decode(s, -300, -6))
+    <BLANKLINE>
+    >>> print(bytearray_decode(s, -300, -500))
+    <BLANKLINE>
+
+    >>> s[:'test']                       # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    TypeError:...
+    >>> print(bytearray_decode(s, 'test'))   # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    TypeError:...
+    >>> print(bytearray_decode(s, None, 'test'))    # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    TypeError:...
+    >>> print(bytearray_decode(s, 'test', 'test'))  # doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    TypeError:...
+
+    >>> print(bytearray_decode(None))
+    Traceback (most recent call last):
+    AttributeError: 'NoneType' object has no attribute 'decode'
+    >>> print(bytearray_decode(None, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    >>> print(bytearray_decode(None, None, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    >>> print(bytearray_decode(None, 0, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    """
+    if start is None:
+        if stop is None:
+            return s.decode('utf8')
+        else:
+            return s[:stop].decode('utf8')
+    elif stop is None:
+        return s[start:].decode('utf8')
+    else:
+        return s[start:stop].decode('utf8')
+
+
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+@cython.test_fail_if_path_exists(
+    "//SimpleCallNode")
+def bytearray_decode_unbound_method(bytearray s, start=None, stop=None):
+    """
+    >>> s = b_a+b_b+b_a+b_a+b_b
+    >>> print(bytearray_decode_unbound_method(s))
+    abaab
+    >>> print(bytearray_decode_unbound_method(s, 1))
+    baab
+    >>> print(bytearray_decode_unbound_method(s, None, 3))
+    aba
+    >>> print(bytearray_decode_unbound_method(s, 1, 4))
+    baa
+
+    >>> print(bytearray_decode_unbound_method(None))
+    Traceback (most recent call last):
+    TypeError: descriptor 'decode' requires a 'bytearray' object but received a 'NoneType'
+    >>> print(bytearray_decode_unbound_method(None, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    >>> print(bytearray_decode_unbound_method(None, None, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    >>> print(bytearray_decode_unbound_method(None, 0, 1))
+    Traceback (most recent call last):
+    TypeError: 'NoneType' object is not subscriptable
+    """
+    if start is None:
+        if stop is None:
+            return bytearray.decode(s, 'utf8')
+        else:
+            return bytearray.decode(s[:stop], 'utf8')
+    elif stop is None:
+        return bytearray.decode(s[start:], 'utf8')
+    else:
+        return bytearray.decode(s[start:stop], 'utf8')