From c3113b4de6be69ffba961d4b7521ca2b69be94cf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Denis=20Maksimenko/AI=20Tools=20Lab=20/SRR/Assistant=20Engi?= =?utf8?q?neer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Tue, 24 Jul 2018 11:36:43 +0300 Subject: [PATCH] [nnc backend] Implement Conv2D_FFT methods (#636) * Added added utility functions. Added pad_input(), unpack_and_pad_kernels(). Implementes most of operator(). * Added fft() implementation. Added concrete implementation of FFT using in-place Cooley-Tukey algorithm. * Added elementwise_product(). Added a function that performs convolution in Fourier domain. Also, fixed some interfaces. * Added ifft() implementation. implemented ifft(), which performs inverse FFT and concludes class implementation. Added multiplying by channels in ifft() indexing. * Restored ShapeRange loop. unpack_and_pad_kernels now uses for (... : kernelRange) again to make code more understandable. Signed-off-by: Denis Maksimenko --- .../core/include/interpreter/ops/conv_FFT.h | 40 +++- .../backend/interpreter/core/src/ops/conv_FFT.cpp | 245 +++++++++++++++++++-- 2 files changed, 256 insertions(+), 29 deletions(-) diff --git a/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h b/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h index 6985c70..1e5f068 100644 --- a/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h +++ b/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h @@ -53,19 +53,47 @@ public: protected: /// - /// Perform Fast Fourier transform on tensor and return the result. - /// No support for complex tensors yet, so we have to use something else + /// Pad input (with zeroes) according to selected padding type (paddings are calculated in ShapeInference) /// - std::vector fft(const Tensor &tensor); + std::vector pad_input(const Index &pads); /// - /// Perform Inverse Fast Fourier transform on tensor and return the result. + /// Unpack kernels for each out_channel and pad them with zeroes to input size /// - TensorVariant ifft(const std::vector &spectre, - const Shape &out_shape, + std::vector> unpack_and_pad_kernels(const Shape &paddedInputShape, const uint64_t spectreSize); + + /// + /// This function performs elementwise product of input by each kernel + /// and writes result back to kernels vector. + /// + void elementwise_product(const std::vector &input, + std::vector> &kernels); + /// + /// Perform Inverse Fast Fourier transform on elementwise products results. Return result of the convolution. + /// + TensorVariant ifft(std::vector> &spectres, + const Shape &inShape, + const Shape &outShape, const Shape &strides, const Index &paddings); + /// + /// Separate even/odd elements to lower/upper halves of array respectively. + /// This allows in-place computation. + /// + void separate(FFT_complex* array, const uint64_t elements); + + /// + /// Concrete in-place implementation of FFT + /// (using Cooley-Tukey algorithm, hence "_CT" suffixes) + /// + void fft_CT(FFT_complex* array, const uint64_t elements); + + /// + /// Concrete in-place implementation of inverse FFT + /// + void ifft_CT(FFT_complex* array, const uint64_t elements); + private: const Tensor _input; Tensor _kernel; diff --git a/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp b/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp index 85149dc..137e40f 100644 --- a/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp +++ b/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp @@ -18,30 +18,52 @@ namespace impl using namespace nncc::core::ADT::tensor; -Index reduce(const Index &idx) -{ - Index res = idx; - res.resize(idx.rank() - 1); - return res; -} - // Mostly compatible with tensorflow implementation // Assuming input is in NHWC format with batch omitted( [in_height, in_width, in_channels] ) // Kernel is in [filter_height, filter_width, in_channels, out_channels] // Refer to https://www.tensorflow.org/api_docs/python/tf/nn/conv2d for info std::vector Conv2D_FFT::operator()() { - auto res = allocate_tensor(_out_shape); - - // TODO: implement + Index pads({(uint32_t)_op.getPadding(0), (uint32_t)_op.getPadding(1), 0u}); // - // 1. Pad input (clamp to zero, clamp to edge, wrap) - // 2. Pad kernel with zeroes to match padded input shape - // 3. FFT input and kernel + // 1. Pad input (currently only with clamp to zero, maybe clamp to edge and wrap later) + auto inputPadded = pad_input(pads); + uint64_t spectreSize = inputPadded.size(); + + Shape paddedShape = _input.getShape(); + int rank = paddedShape.rank(); + for (int i = 0; i < rank; i++) + { + paddedShape.dim(i) += 2 * pads.at(i); + } + + // Correct pads to properly index the output + if (pads.at(0) == 0) + { + pads.at(0) = _kernel.getShape().dim(0) / 2; + } + if (pads.at(1) == 0) + { + pads.at(1) = _kernel.getShape().dim(1) / 2; + } + + // 2. Unpack kernels (for separate output channels) and pad them with zeroes to match padded input shape + auto results = unpack_and_pad_kernels(paddedShape, spectreSize); + + // 3. FFT input and kernels + fft_CT(inputPadded.data(), spectreSize); + for (auto &kernel : results) + { + fft_CT(kernel.data(), spectreSize); + } + // 4. Elementwise production - // 5. IFFT input - // 6. Match output shape - + elementwise_product(inputPadded, results); + + // 5. IFFT input and match output shape + + auto res = ifft(results, paddedShape, _out_shape, _strides, pads); + return {res}; } @@ -58,27 +80,204 @@ Conv2D_FFT::Conv2D_FFT(const TensorVariant &input, const Conv2DOp &op) assert(_op.getPadding(2) == 0); } -std::vector Conv2D_FFT::fft(const Tensor &tensor) +std::vector Conv2D_FFT::pad_input(const Index &pads) { - std::vector res; + // Calculate new shape: just add paddings + uint32_t height = _input.getShape().dim(0), + width = _input.getShape().dim(1); + + Shape newShape = _input.getShape(); + int rank = newShape.rank(); + + uint64_t paddedSize = 1; + for (int i = 0; i < rank; i++) + { + newShape.dim(i) += 2 * pads.at(i); + paddedSize *= newShape.dim(i); + } - // TODO: implement + uint64_t spectreSize = 1; + while (spectreSize < paddedSize) + { + spectreSize *= 2; + } + + std::vector res(spectreSize, FFT_complex(0.0f, 0.0f)); + + ShapeRange outRange(newShape); + uint64_t i = 0; + Index unpaddedIndex; + for (auto outIdx : outRange) + { + // Fill paddings with zeroes + if (outIdx.at(0) < pads.at(0) || + outIdx.at(1) < pads.at(1) || + outIdx.at(0) >= (pads.at(0) + height) || + outIdx.at(1) >= (pads.at(1) + width)) + { + //res[i] = FFT_complex(0.0f, 0.0f); + // Already done by vector constructor + } + // Copy values from input + else + { + unpaddedIndex = outIdx; + unpaddedIndex.at(0) -= pads.at(0); + unpaddedIndex.at(1) -= pads.at(1); + res[i] = FFT_complex(_input.at(unpaddedIndex), 0.0f); + } + i++; + } return res; } -TensorVariant Conv2D_FFT::ifft(const std::vector &spectre, - const Shape &out_shape, +std::vector> Conv2D_FFT::unpack_and_pad_kernels(const Shape &paddedInputShape, const uint64_t spectreSize) +{ + const Shape &kShape = _kernel.getShape(); + uint32_t numKernels = kShape.dim(3); + + // Vector to store results to + std::vector> paddedKernels; + for (uint32_t n = 0; n < numKernels; n++) { + std::vector one_kernel(spectreSize, FFT_complex(0.0f, 0.0f)); + paddedKernels.push_back(one_kernel); + } + // Unpack kernels + int64_t shift = kShape.dim(2) - 1 + kShape.dim(2) * + ((kShape.dim(0) - 1) / 2 * paddedInputShape.dim(1) + (kShape.dim(1) - 1) / 2) ; + ShapeRange kernelRange(kShape); + for (auto &kIdx: kernelRange) + { + if (kIdx.at(0) < kShape.dim(0) && + kIdx.at(1) < kShape.dim(1) && + kIdx.at(2) < kShape.dim(2)) + { + Index kernelIdx = kIdx; + // The resulting kernel is mirrored and shifted to make output elements correspond to elements of original tensor + int64_t shifted_index = ((int64_t)spectreSize - shift + kernelIdx.at(2) + paddedInputShape.dim(2) * + (kernelIdx.at(1) + paddedInputShape.dim(1) * kernelIdx.at(0))) % spectreSize; + kernelIdx.at(0) = kShape.dim(0) - kernelIdx.at(0) - 1; + kernelIdx.at(1) = kShape.dim(1) - kernelIdx.at(1) - 1; + kernelIdx.at(2) = kShape.dim(2) - kernelIdx.at(2) - 1; + + paddedKernels[kernelIdx.at(3)][shifted_index] = _kernel.at(kernelIdx); + } + } + + + return paddedKernels; +} + +void Conv2D_FFT::elementwise_product(const std::vector &input, + std::vector> &kernels) +{ + uint32_t size = input.size(); + for (auto &kernel : kernels) + { + for (uint32_t i = 0; i < size; i++) + { + kernel[i] *= input[i]; + } + } +} + + +TensorVariant Conv2D_FFT::ifft(std::vector> &spectres, + const Shape &inShape, + const Shape &outShape, const Shape &strides, const Index &paddings) { - TensorVariant res = allocate_tensor(out_shape); + // TODO: maybe add some asserts() + + // Perform inverse FFT + for (auto &result : spectres) + { + ifft_CT(result.data(), result.size()); + } - // TODO: implement + // Allocate tensor + TensorVariant res = allocate_tensor(outShape); + auto resAccessor = Tensor(res); + + // Move our results to it + ShapeRange outRange(outShape); + uint32_t width = inShape.dim(1); + // We have to multiply by number of channels, because + // only every first of three elements corresponds to + // correct convolution by channels, the rest are + // results of shifted convolution + uint32_t inChannels = inShape.dim(2); + for (auto &outIdx : outRange) + { + resAccessor.at(outIdx) = spectres[outIdx.at(2)][inChannels * ((outIdx.at(0) * strides.dim(0) + paddings.at(0)) * width + + outIdx.at(1) * strides.dim(1) + paddings.at(1))].real(); + } return res; } +void Conv2D_FFT::separate (FFT_complex* array, + const uint64_t elements) +{ + const uint64_t half_size = elements / 2; + + // Temporary heap to store odd elements. + FFT_complex *tmp = new FFT_complex[half_size]; + for (uint64_t i = 0; i < half_size; i++) { + tmp[i] = array[i * 2 + 1]; + } + + // Copy even elements + for (uint64_t i = 0; i < half_size; i++) { + array[i] = array[i * 2]; + } + // Copy odd elements + for (uint64_t i = 0; i < half_size; i++) { + array[i + half_size] = tmp[i]; + } + + delete[] tmp; +} + +void Conv2D_FFT::fft_CT(FFT_complex* array, + const uint64_t elements) +{ + if (elements > 1) { + separate(array, elements); + fft_CT(array, elements / 2); + fft_CT(array + elements / 2, elements / 2); + for(int i = 0; i < elements / 2; i++) { + FFT_complex even = array[i]; + FFT_complex odd = array[i + elements / 2]; + FFT_complex twiddle = exp(FFT_complex(0, -2. * M_PI * i / elements)); + + array[i] = even + twiddle * odd; + array[i + elements / 2] = even - twiddle * odd; + } + } +} + +// TODO: using paddings and strides we can theoretically reduce number of elements calculated +void Conv2D_FFT::ifft_CT(FFT_complex* array, + const uint64_t elements) +{ + if (elements > 1) { + separate(array, elements); + ifft_CT(array, elements / 2); + ifft_CT(array + elements / 2, elements / 2); + for(int i = 0; i < elements / 2; i++) { + FFT_complex even = array[i]; + FFT_complex odd = array[i + elements / 2]; + FFT_complex twiddle = exp(FFT_complex(0, 2. * M_PI * i / elements)); + + array[i] = (even + twiddle * odd) / FFT_complex(2.0f, 0.0f); + array[i + elements / 2] = (even - twiddle * odd) / FFT_complex(2.0f, 0.0f); + } + } +} + } // namespace impl } // namespace interpreter } // namespace backend -- 2.7.4