From 6f71ec74d3e1b36aa15467100cab2a29f5d355d5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vladimir=20Plazun/AI=20Tools=20Lab=20/SRR/Engineer/?= =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Mon, 28 Jan 2019 18:17:34 +0300 Subject: [PATCH] [nnc] Increase interpreter speed (#2768) Remove vector usage from Shapes and Indices Move all getters into headers( inlining ) Increase inference speed up to 4x-6x times Tinker with Deconv2D implementation to speed it up Fix issue in broadcasting implementation Signed-off-by: Vladimir Plazun v.plazun@partner.samsung.com --- contrib/nnc/core/modelIR/Index.cpp | 20 +--- contrib/nnc/core/modelIR/Shape.cpp | 12 -- contrib/nnc/core/modelIR/TensorVariant.cpp | 29 +---- contrib/nnc/include/ADT/SmallVector.h | 142 +++++++++++++++++++++++ contrib/nnc/include/core/modelIR/Common.h | 31 +++++ contrib/nnc/include/core/modelIR/Index.h | 62 +++++++--- contrib/nnc/include/core/modelIR/Shape.h | 35 ++++-- contrib/nnc/include/core/modelIR/ShapeRange.h | 6 +- contrib/nnc/include/core/modelIR/TensorVariant.h | 28 +++-- contrib/nnc/passes/interpreter/ops/DeConv2D.cpp | 43 ++++--- 10 files changed, 302 insertions(+), 106 deletions(-) create mode 100644 contrib/nnc/include/ADT/SmallVector.h create mode 100644 contrib/nnc/include/core/modelIR/Common.h diff --git a/contrib/nnc/core/modelIR/Index.cpp b/contrib/nnc/core/modelIR/Index.cpp index ebeb182..7f4c30f 100644 --- a/contrib/nnc/core/modelIR/Index.cpp +++ b/contrib/nnc/core/modelIR/Index.cpp @@ -17,37 +17,27 @@ #include "core/modelIR/Index.h" #include +#include namespace nnc { namespace mir { -Index::Index(std::initializer_list&& l) : _indices{l} { - // DO NOTHING -} - -int32_t Index::rank(void) const { return _indices.size(); } - Index& Index::resize(int32_t size) { _indices.resize(size); return *this; } Index& Index::fill(int32_t index) { - std::fill(_indices.begin(), _indices.end(), index); + std::fill(std::begin(_indices), std::end(_indices), index); return (*this); } -int32_t& Index::at(int32_t axis) { return _indices[(axis < 0) ? (_indices.size() + axis) : axis]; } -int32_t Index::at(int32_t axis) const { - return _indices[(axis < 0) ? (_indices.size() + axis) : axis]; -} - -std::ostream& operator<<(std::ostream& s, const Index& sh) { +std::ostream& operator<<(std::ostream& s, const Index& idx) { s << "[ "; - for (int32_t i = 0; i < sh.rank(); ++i) { + for (int32_t i = 0; i < idx.rank(); ++i) { if (i != 0) s << ", "; - s << sh.at(i); + s << idx.at(i); } s << "]"; diff --git a/contrib/nnc/core/modelIR/Shape.cpp b/contrib/nnc/core/modelIR/Shape.cpp index 99765ff..af2a154 100644 --- a/contrib/nnc/core/modelIR/Shape.cpp +++ b/contrib/nnc/core/modelIR/Shape.cpp @@ -28,18 +28,6 @@ void Shape::resize(int32_t size) { _dims.resize(size); } -int32_t Shape::dim(int32_t axis) const { - auto dim = (axis < 0) ? (_dims.size() + axis) : axis; - assert(dim < _dims.size()); - return _dims.at(dim); -} - -int32_t& Shape::dim(int32_t axis) { - auto dim = (axis < 0) ? (_dims.size() + axis) : axis; - assert(dim < _dims.size()); - return _dims.at(dim); -} - int32_t Shape::numElements() const { if (rank() == 0) { return 0; diff --git a/contrib/nnc/core/modelIR/TensorVariant.cpp b/contrib/nnc/core/modelIR/TensorVariant.cpp index f0dbb53..b2722aa 100644 --- a/contrib/nnc/core/modelIR/TensorVariant.cpp +++ b/contrib/nnc/core/modelIR/TensorVariant.cpp @@ -23,7 +23,7 @@ namespace mir { TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape) - : _dtype(dtype), _strides{0}, _rank(shape.rank()), _shape(shape) { + : _dtype(dtype), _shape(shape), _strides(shape.rank()) { switch (dtype) { case DTYPE::FLOAT32: _elementSize = sizeof(float); @@ -44,7 +44,7 @@ TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape) _data.reset(new char[data_size], std::default_delete()); int stride = 1; - for (int d = _rank - 1; d >= 0; --d) + for (int d = _shape.rank() - 1; d >= 0; --d) { _strides[d] = stride; stride *= _shape.dim(d); @@ -65,10 +65,10 @@ TensorVariant::TensorVariant(DTYPE dtype, const Shape& shape, const void* data) */ TensorVariant::TensorVariant(const TensorVariant& t_old, const Shape& shape) - : _dtype(t_old._dtype), _data(t_old._data), _strides{0}, _rank(shape.rank()), + : _dtype(t_old._dtype), _data(t_old._data), _strides(static_cast(shape.rank())), _shape(shape), _elementSize(t_old._elementSize) { - int axis_old = t_old._rank - 1; - for (int d = _rank - 1; d >= 0; d--) { + int axis_old = t_old._shape.rank() - 1; + for (int d = shape.rank() - 1; d >= 0; d--) { if (t_old._shape.dim(axis_old) == 1) _strides[d] = 0; else @@ -79,24 +79,5 @@ TensorVariant::TensorVariant(const TensorVariant& t_old, } } -char* TensorVariant::at(const Index& idx) const { - return _data.get() + getOffset(idx) * _elementSize; -} - -char* TensorVariant::atOffset(int32_t offset) const { - assert(offset >= 0 && offset < getShape().numElements()); - return _data.get() + offset * _elementSize; -} - -size_t TensorVariant::getOffset(const Index &idx) const { - assert(idx.rank() == getShape().rank()); - std::size_t offset = 0; - for (size_t i = 0; i < _rank; ++i) - { - offset += idx.at(i) * _strides[i]; - } - return offset; -} - } // namespace mir } // namespace nnc diff --git a/contrib/nnc/include/ADT/SmallVector.h b/contrib/nnc/include/ADT/SmallVector.h new file mode 100644 index 0000000..3f09ad7 --- /dev/null +++ b/contrib/nnc/include/ADT/SmallVector.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _NNC_CORE_SMALL_VECTOR_H +#define _NNC_CORE_SMALL_VECTOR_H + +#include +#include +#include + +namespace nnc { +namespace adt { + +/** + * @brief vector with cheap memory allocation + * @tparam T type of elements + * @tparam Capacity maximum number of elements + * @note much like std::array, but tracks number of used elements. Stored in stack + */ +template +class small_vector { +public: + using value_type = T; + using reference = T&; + using iterator = T*; + using size_type = size_t; + + template + small_vector(It begin, It end) : _size(std::distance(begin, end)) { + assert(_size <= Capacity); + std::copy(begin, end, this->begin()); + } + + explicit small_vector(size_t size, value_type initializer = value_type()) : _size(size) { + assert(_size <= Capacity); + std::fill(begin(), end(), initializer); + } + + explicit small_vector() : _size(0) {} + + small_vector(std::initializer_list&& l) : _size(l.size()) { + assert(_size <= Capacity); + std::copy(std::begin(l), std::end(l), begin()); + } + + /** + * @return current size + */ + inline size_t size() const noexcept { + return _size; + } + + /** + * @return maximum number of elements this vector can hold + */ + constexpr size_t capacity() const { + return Capacity; + } + + /** + * @brief resize to given new size + * @note if new size is greater than current size, new elements are default-initialized + */ + void resize(size_t new_size) noexcept { + assert(new_size <= Capacity); + if (new_size > _size) { + std::fill(_storage + _size, _storage + new_size, T()); + } + _size = new_size; + } + + /** + * @return reference to the element at position idx + */ + inline reference operator[](size_t idx) noexcept { + assert(idx >= 0 && idx < _size); + return _storage[idx]; + } + + /** + * @return value of element at position idx + */ + inline constexpr value_type operator[](size_t idx) const noexcept { + //assert on the same line since c++11 does not allow multi-line constexpr functions + return assert(idx >= 0 && idx < _size), _storage[idx]; + } + + inline iterator begin() noexcept { + return std::begin(_storage); + } + + inline iterator end() noexcept { + return _storage + _size; + } + + inline void push_back(const value_type& e) noexcept { + assert(_size < Capacity); + _storage[_size++] = e; + } + + inline void push_back(value_type&& e) noexcept { + assert(_size < Capacity); + _storage[_size++] = std::move(e); + } + +private: + size_t _size; + value_type _storage[Capacity]; +}; + +template +bool operator==(const small_vector& lhs, const small_vector& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + + bool equal = true; + size_t end = lhs.size(); + for (size_t i = 0; i < end; ++i) { + equal &= (lhs[i] == rhs[i]); + } + + return equal; +} + +} // namespace adt +} // namespace nnc + +#endif //_NNC_CORE_SMALL_VECTOR_H diff --git a/contrib/nnc/include/core/modelIR/Common.h b/contrib/nnc/include/core/modelIR/Common.h new file mode 100644 index 0000000..58e1dd0 --- /dev/null +++ b/contrib/nnc/include/core/modelIR/Common.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _NNC_CORE_COMMON_H_ +#define _NNC_CORE_COMMON_H_ + +namespace nnc { +/** + * @brief maximum number of dimensions what an Index, Shape or Tensor can have + */ + constexpr size_t MAX_DIMENSION_COUNT = 8; + + inline constexpr size_t wrap_index(int32_t index, size_t limit) noexcept { + return static_cast(index >= 0 ? index : limit + index); + } +} + +#endif // _NNC_CORE_COMMON_H_ diff --git a/contrib/nnc/include/core/modelIR/Index.h b/contrib/nnc/include/core/modelIR/Index.h index 931e11a..d6ed3d7 100644 --- a/contrib/nnc/include/core/modelIR/Index.h +++ b/contrib/nnc/include/core/modelIR/Index.h @@ -18,36 +18,66 @@ #define _NNC_CORE_LINALG_INDEX_H_ #include -#include #include #include -namespace nnc -{ -namespace mir -{ +#include "core/modelIR/Common.h" -class Index -{ +#include "ADT/SmallVector.h" + +namespace nnc { +namespace mir { + +class Index { public: Index() = default; - Index(std::initializer_list&& l); - Index(std::vector&& vec); - int32_t rank(void) const; + Index(std::initializer_list&& l) noexcept : _indices(std::move(l)) { + // DO NOTHING + } + + Index(size_t rank) : _indices(rank) {} + + int32_t rank() const { return static_cast(_indices.size()); } + + + /** + * @brief resize index to given dimension number + * @param size new number of dimensions + * @return *this + * @warning if new size is greater than old, new dimensions are undefined + */ + Index& resize(int32_t size); - Index &resize(int32_t size); + /** + * @brief fill all axis with `index` + * @return `*this` + */ + Index& fill(int32_t index); - Index &fill(int32_t index); + /** + * @brief return position on given axis + * @param axis index of axis to get index on. If axis is negative returns axis from the last + * @return + */ + int32_t& at(int32_t axis) { + return _indices[wrap_index(axis, _indices.size())]; + } - int32_t &at(int32_t axis); - int32_t at(int32_t axis) const; + /** + * @brief return position on given axis + * @param axis index of axis to get index on. If axis is negative returns axis from the last + * @return + */ + int32_t at(int32_t axis) const { + return _indices[wrap_index(axis, _indices.size())]; + } private: - std::vector _indices; + adt::small_vector _indices; }; -std::ostream &operator<<(std::ostream &s, const Index &sh); +std::ostream &operator<<(std::ostream &s, const Index &idx); } // namespace mir } // namespace nnc diff --git a/contrib/nnc/include/core/modelIR/Shape.h b/contrib/nnc/include/core/modelIR/Shape.h index f495270..13028d3 100644 --- a/contrib/nnc/include/core/modelIR/Shape.h +++ b/contrib/nnc/include/core/modelIR/Shape.h @@ -21,6 +21,10 @@ #include #include #include +#include + +#include "ADT/SmallVector.h" +#include "core/modelIR/Common.h" namespace nnc { namespace mir { @@ -31,28 +35,40 @@ public: Shape() = default; - explicit Shape(int32_t rank) : _dims(static_cast(rank)) {} + explicit Shape(int32_t rank) : _dims(rank) {} - Shape(std::initializer_list dims) : _dims(dims) {} + Shape(std::initializer_list&& dims) : _dims(std::move(dims)) {} - explicit Shape(const std::vector& dims) : _dims(dims) {} + explicit Shape(const std::vector& dims) : _dims(std::begin(dims), std::end(dims)) {} - int32_t rank() const { return static_cast(_dims.size()); } + int32_t rank() const { + return static_cast(_dims.size()); + } void resize(int32_t size); - int32_t& dim(int32_t axis); + int32_t& dim(int32_t axis) noexcept { + auto dim = wrap_index(axis, _dims.size()); + return _dims[dim]; + }; - int32_t dim(int32_t axis) const; + int32_t dim(int32_t axis) const noexcept { + auto dim = wrap_index(axis, _dims.size()); + return _dims[dim]; + } int32_t numElements() const; - bool operator==(const Shape& rhs) const { return _dims == rhs._dims; } + bool operator==(const Shape& rhs) const { + return _dims == rhs._dims; + } - bool operator!=(const Shape& rhs) const { return _dims != rhs._dims; } + bool operator!=(const Shape& rhs) const { + return !(*this == rhs); + } private: - std::vector _dims; + adt::small_vector _dims; }; std::ostream& operator<<(std::ostream& s, const Shape& sh); @@ -61,3 +77,4 @@ std::ostream& operator<<(std::ostream& s, const Shape& sh); } // namespace nnc #endif // _NNC_CORE_LINALG_SHAPE_H_ + diff --git a/contrib/nnc/include/core/modelIR/ShapeRange.h b/contrib/nnc/include/core/modelIR/ShapeRange.h index 12bf83b..e49a235 100644 --- a/contrib/nnc/include/core/modelIR/ShapeRange.h +++ b/contrib/nnc/include/core/modelIR/ShapeRange.h @@ -36,7 +36,7 @@ class ShapeIter : int32_t rank = _shape.rank(); int32_t c = rank - 1; pidx[c]++; - while( (pidx[c] > pshape[c] - 1) && (c > 0) ) { + while (pidx[c] >= pshape[c] && c > 0 ) { pidx[c] = 0; pidx[--c]++; } @@ -44,7 +44,7 @@ class ShapeIter : return *this; } - ShapeIter operator++(int) { + const ShapeIter operator++(int) { ShapeIter it = *this; ++*this; return it; @@ -54,7 +54,7 @@ class ShapeIter : return _index; } - bool operator!=(ShapeIter& iter) { + bool operator!=(const ShapeIter& iter) const { assert(iter._index.rank() == _index.rank()); assert(iter._shape == _shape); return _pos != iter._pos; diff --git a/contrib/nnc/include/core/modelIR/TensorVariant.h b/contrib/nnc/include/core/modelIR/TensorVariant.h index 8544c1f..8bf3a1d 100644 --- a/contrib/nnc/include/core/modelIR/TensorVariant.h +++ b/contrib/nnc/include/core/modelIR/TensorVariant.h @@ -24,14 +24,13 @@ #include "core/modelIR/Index.h" #include "core/modelIR/Shape.h" #include "core/modelIR/DataType.h" +#include "core/modelIR/Common.h" namespace nnc { namespace mir { -constexpr int MAX_DIMENSIONS = 32; - class TensorVariant { public: TensorVariant(DTYPE dtype, const Shape& shape); @@ -42,20 +41,31 @@ public: virtual ~TensorVariant() = default; - char* at(const Index& idx) const; - char* atOffset(int32_t offset) const; - size_t getOffset(const Index &idx) const; + char* at(const Index& idx) const { + return _data.get() + getOffset(idx) * _elementSize; + } - virtual const Shape &getShape() const { return _shape; } - DTYPE getDataType() const { return _dtype; } + char* atOffset(int32_t offset) const { + assert(offset >= 0 && offset < getShape().numElements()); + return _data.get() + offset * _elementSize; + } + size_t getOffset(const Index &idx) const { + assert(idx.rank() == getShape().rank()); + std::size_t offset = 0; + for (int i = 0; i < _shape.rank(); ++i) + offset += idx.at(i) * _strides[i]; + return offset; + } + + const Shape &getShape() const { return _shape; } + DTYPE getDataType() const { return _dtype; } size_t getElementSize() const { return _elementSize; } private: DTYPE _dtype; std::shared_ptr _data; - int_fast32_t _strides[MAX_DIMENSIONS]; - size_t _rank; + adt::small_vector _strides; Shape _shape; size_t _elementSize; diff --git a/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp b/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp index 5b0afd8..8208613 100644 --- a/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp +++ b/contrib/nnc/passes/interpreter/ops/DeConv2D.cpp @@ -52,42 +52,49 @@ std::vector nnc::DeConv2D::operator()() { Index input_idx; input_idx.resize(in_shape.rank()); - for (auto &out_idx : out_range) { - for (auto &kernel_idx_r : kernel_range) { - auto kernel_idx = kernel_idx_r; + Index kernel_idx; + kernel_idx.resize(k_shape.rank()); + for (auto& out_idx : out_range) { + auto out_region = res_accesor.getRegion(out_idx); + assert(out_region.size() == num_kernels); + + for (auto& kernel_idx_r : kernel_range) { // rotate kernel 180 deg around last axis // by index transform for (int32_t d = 0; d < 2; ++d) { - kernel_idx.at(d) = kernel.getShape().dim(d) - kernel_idx.at(d) - 1; + kernel_idx.at(d) = kernel.getShape().dim(d) - kernel_idx_r.at(d) - 1; } + kernel_idx.at(2) = kernel_idx_r.at(2); + kernel_idx.at(3) = kernel_idx_r.at(3); // flag that keeps info on whether the current input element is from input // or is from dilation by stride bool is_from_input = true; for (int32_t d = 1; d < input_idx.rank() - 1; ++d) { const auto num = (out_idx.at(d) + pads.at(d - 1) - kernel_idx.at(d - 1)); - const auto div_res = num / strides.dim(d - 1); - const auto rem = num % strides.dim(d - 1); + auto stride = strides.dim(d - 1 ); + const auto div_res = num / stride; + const auto rem = num - div_res*stride; is_from_input = is_from_input && rem == 0; if (rem != 0) break; input_idx.at(d) = div_res; } - // batch is same as output's - input_idx.at(0) = out_idx.at(0); - // channel index - same as kernel's - input_idx.at(3) = kernel_idx.at(2); + if (is_from_input) { + // batch is same as output's + input_idx.at(0) = out_idx.at(0); + // channel index - same as kernel's + input_idx.at(3) = kernel_idx.at(2); - if (in_range.contains(input_idx) and is_from_input) { - auto kernel_region = kernel.getRegion(kernel_idx); - assert( kernel_region.size() == num_kernels ); + if (in_range.contains(input_idx)) { + auto kernel_region = kernel.getRegion(kernel_idx); + assert(kernel_region.size() == num_kernels); - auto out_region = res_accesor.getRegion(out_idx); - assert( out_region.size() == num_kernels ); - auto in = _input.at(input_idx); + auto in = _input.at(input_idx); - for (int32_t kernel_index = 0; kernel_index < num_kernels; kernel_index++) { - out_region.base()[kernel_index] += in * kernel_region.base()[kernel_index]; + for (int32_t kernel_index = 0; kernel_index < num_kernels; kernel_index++) { + out_region.base()[kernel_index] += in * kernel_region.base()[kernel_index]; + } } } } -- 2.7.4