From c3113b4de6be69ffba961d4b7521ca2b69be94cf Mon Sep 17 00:00:00 2001
From: =?utf8?q?Denis=20Maksimenko/AI=20Tools=20Lab=20/SRR/Assistant=20Engi?=
 =?utf8?q?neer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
 <d.maksimenko@partner.samsung.com>
Date: Tue, 24 Jul 2018 11:36:43 +0300
Subject: [PATCH] [nnc backend] Implement Conv2D_FFT methods (#636)

* Added added utility functions.

Added pad_input(), unpack_and_pad_kernels(). Implementes most of operator().

* Added fft() implementation.

Added concrete implementation of FFT using in-place Cooley-Tukey algorithm.

* Added elementwise_product().

Added a function that performs convolution in Fourier domain. Also, fixed some interfaces.

* Added ifft() implementation.

implemented ifft(), which performs inverse FFT and concludes class implementation.

Added multiplying by channels in ifft() indexing.

* Restored ShapeRange loop.

unpack_and_pad_kernels now uses for (... : kernelRange) again to make code more understandable.

Signed-off-by: Denis Maksimenko <d.maksimenko@partner.samsung.com>
---
 .../core/include/interpreter/ops/conv_FFT.h        |  40 +++-
 .../backend/interpreter/core/src/ops/conv_FFT.cpp  | 245 +++++++++++++++++++--
 2 files changed, 256 insertions(+), 29 deletions(-)
diff --git a/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h b/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h
index 6985c70..1e5f068 100644
--- a/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h
+++ b/contrib/nnc/libs/backend/interpreter/core/include/interpreter/ops/conv_FFT.h
@@ -53,19 +53,47 @@ public:
 
 protected:
   ///
-  /// Perform Fast Fourier transform on tensor and return the result.
-  /// No support for complex tensors yet, so we have to use something else
+  /// Pad input (with zeroes) according to selected padding type (paddings are calculated in ShapeInference)
   ///
-  std::vector<FFT_complex> fft(const Tensor<float> &tensor);
+  std::vector<FFT_complex> pad_input(const Index &pads);
 
   ///
-  /// Perform Inverse Fast Fourier transform on tensor and return the result.
+  /// Unpack kernels for each out_channel and pad them with zeroes to input size
   ///
-  TensorVariant ifft(const std::vector<FFT_complex> &spectre,
-                     const Shape &out_shape,
+  std::vector<std::vector<FFT_complex>> unpack_and_pad_kernels(const Shape &paddedInputShape, const uint64_t spectreSize);
+
+  ///
+  /// This function performs elementwise product of input by each kernel
+  /// and writes result back to kernels vector.
+  ///
+  void elementwise_product(const std::vector<FFT_complex> &input,
+                           std::vector<std::vector<FFT_complex>> &kernels);
+  ///
+  /// Perform Inverse Fast Fourier transform on elementwise products results. Return result of the convolution.
+  ///
+  TensorVariant ifft(std::vector<std::vector<FFT_complex>> &spectres,
+                     const Shape &inShape,
+                     const Shape &outShape,
                      const Shape &strides,
                      const Index &paddings);
 
+  ///
+  /// Separate even/odd elements to lower/upper halves of array respectively.
+  /// This allows in-place computation.
+  ///
+  void separate(FFT_complex* array, const uint64_t elements);
+
+  ///
+  /// Concrete in-place implementation of FFT
+  /// (using Cooley-Tukey algorithm, hence "_CT" suffixes)
+  ///
+  void fft_CT(FFT_complex* array, const uint64_t elements);
+
+  ///
+  ///  Concrete in-place implementation of inverse FFT
+  ///
+  void ifft_CT(FFT_complex* array, const uint64_t elements);
+
 private:
   const Tensor<float> _input;
   Tensor<float> _kernel;
diff --git a/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp b/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp
index 85149dc..137e40f 100644
--- a/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp
+++ b/contrib/nnc/libs/backend/interpreter/core/src/ops/conv_FFT.cpp
@@ -18,30 +18,52 @@ namespace impl
 
 using namespace nncc::core::ADT::tensor;
 
-Index reduce(const Index &idx)
-{
-  Index res = idx;
-  res.resize(idx.rank() - 1);
-  return res;
-}
-
 // Mostly compatible with tensorflow implementation
 // Assuming input is in NHWC format with batch omitted( [in_height, in_width, in_channels] )
 // Kernel is in [filter_height, filter_width, in_channels, out_channels]
 // Refer to https://www.tensorflow.org/api_docs/python/tf/nn/conv2d for info
 std::vector<TensorVariant> Conv2D_FFT::operator()()
 {
-  auto res = allocate_tensor(_out_shape);
-  
-  // TODO: implement
+  Index pads({(uint32_t)_op.getPadding(0), (uint32_t)_op.getPadding(1), 0u});
   //
-  // 1. Pad input (clamp to zero, clamp to edge, wrap)
-  // 2. Pad kernel with zeroes to match padded input shape
-  // 3. FFT input and kernel
+  // 1. Pad input (currently only with clamp to zero, maybe clamp to edge and wrap later)
+  auto inputPadded = pad_input(pads);
+  uint64_t spectreSize = inputPadded.size();
+
+  Shape paddedShape = _input.getShape();
+  int rank = paddedShape.rank();
+  for (int i = 0; i < rank; i++)
+  {
+    paddedShape.dim(i) += 2 * pads.at(i);
+  }
+
+  // Correct pads to properly index the output
+  if (pads.at(0) == 0)
+  {
+    pads.at(0) = _kernel.getShape().dim(0) / 2;
+  }
+  if (pads.at(1) == 0)
+  {
+    pads.at(1) = _kernel.getShape().dim(1) / 2;
+  }
+
+  // 2. Unpack kernels (for separate output channels) and pad them with zeroes to match padded input shape
+  auto results = unpack_and_pad_kernels(paddedShape, spectreSize);
+
+  // 3. FFT input and kernels
+  fft_CT(inputPadded.data(), spectreSize);
+  for (auto &kernel : results)
+  {
+    fft_CT(kernel.data(), spectreSize);
+  }
+
   // 4. Elementwise production
-  // 5. IFFT input
-  // 6. Match output shape
-  
+  elementwise_product(inputPadded, results);
+
+  // 5. IFFT input and match output shape
+
+  auto res = ifft(results, paddedShape, _out_shape, _strides, pads);
+
   return {res};
 }
 
@@ -58,27 +80,204 @@ Conv2D_FFT::Conv2D_FFT(const TensorVariant &input, const Conv2DOp &op)
   assert(_op.getPadding(2) == 0);
 }
 
-std::vector<FFT_complex> Conv2D_FFT::fft(const Tensor<float> &tensor)
+std::vector<FFT_complex> Conv2D_FFT::pad_input(const Index &pads)
 {
-  std::vector<FFT_complex> res;
+  // Calculate new shape: just add paddings
+  uint32_t height = _input.getShape().dim(0),
+           width  = _input.getShape().dim(1);
+
+  Shape newShape = _input.getShape();
+  int rank = newShape.rank();
+
+  uint64_t paddedSize = 1;
+  for (int i = 0; i < rank; i++)
+  {
+    newShape.dim(i) += 2 * pads.at(i);
+    paddedSize *= newShape.dim(i);
+  }
 
-  // TODO: implement
+  uint64_t spectreSize = 1;
+  while (spectreSize < paddedSize)
+  {
+    spectreSize *= 2;
+  }
+
+  std::vector<FFT_complex> res(spectreSize, FFT_complex(0.0f, 0.0f));
+
+  ShapeRange outRange(newShape);
+  uint64_t i = 0;
+  Index unpaddedIndex;
+  for (auto outIdx : outRange)
+  {
+    // Fill paddings with zeroes
+    if (outIdx.at(0) < pads.at(0) ||
+        outIdx.at(1) < pads.at(1) ||
+        outIdx.at(0) >= (pads.at(0) + height) ||
+        outIdx.at(1) >= (pads.at(1) + width))
+    {
+      //res[i] = FFT_complex(0.0f, 0.0f);
+      // Already done by vector constructor
+    }
+      // Copy values from input
+    else
+    {
+      unpaddedIndex = outIdx;
+      unpaddedIndex.at(0) -= pads.at(0);
+      unpaddedIndex.at(1) -= pads.at(1);
+      res[i] = FFT_complex(_input.at(unpaddedIndex), 0.0f);
+    }
+    i++;
+  }
 
   return res;
 }
 
-TensorVariant Conv2D_FFT::ifft(const std::vector<FFT_complex> &spectre,
-                               const Shape &out_shape,
+std::vector<std::vector<FFT_complex>> Conv2D_FFT::unpack_and_pad_kernels(const Shape &paddedInputShape, const uint64_t spectreSize)
+{
+  const Shape &kShape = _kernel.getShape();
+  uint32_t numKernels = kShape.dim(3);
+
+  // Vector to store results to
+  std::vector<std::vector<FFT_complex>> paddedKernels;
+  for (uint32_t n = 0; n < numKernels; n++) {
+    std::vector<FFT_complex> one_kernel(spectreSize, FFT_complex(0.0f, 0.0f));
+    paddedKernels.push_back(one_kernel);
+  }
+  // Unpack kernels
+  int64_t shift = kShape.dim(2) - 1 + kShape.dim(2) *
+                                    ((kShape.dim(0) - 1) / 2 * paddedInputShape.dim(1) + (kShape.dim(1) - 1) / 2) ;
+  ShapeRange kernelRange(kShape);
+  for (auto &kIdx: kernelRange)
+  {
+    if (kIdx.at(0) < kShape.dim(0) &&
+        kIdx.at(1) < kShape.dim(1) &&
+        kIdx.at(2) < kShape.dim(2))
+    {
+      Index kernelIdx = kIdx;
+      // The resulting kernel is mirrored and shifted to make output elements correspond to elements of original tensor
+      int64_t shifted_index = ((int64_t)spectreSize - shift + kernelIdx.at(2) + paddedInputShape.dim(2) *
+                                                             (kernelIdx.at(1) + paddedInputShape.dim(1) * kernelIdx.at(0))) % spectreSize;
+      kernelIdx.at(0) = kShape.dim(0) - kernelIdx.at(0) - 1;
+      kernelIdx.at(1) = kShape.dim(1) - kernelIdx.at(1) - 1;
+      kernelIdx.at(2) = kShape.dim(2) - kernelIdx.at(2) - 1;
+
+      paddedKernels[kernelIdx.at(3)][shifted_index] = _kernel.at(kernelIdx);
+    }
+  }
+
+
+  return paddedKernels;
+}
+
+void Conv2D_FFT::elementwise_product(const std::vector<FFT_complex> &input,
+                                     std::vector<std::vector<FFT_complex>> &kernels)
+{
+  uint32_t size = input.size();
+  for (auto &kernel : kernels)
+  {
+    for (uint32_t i = 0; i < size; i++)
+    {
+      kernel[i] *= input[i];
+    }
+  }
+}
+
+
+TensorVariant Conv2D_FFT::ifft(std::vector<std::vector<FFT_complex>> &spectres,
+                               const Shape &inShape,
+                               const Shape &outShape,
                                const Shape &strides,
                                const Index &paddings)
 {
-  TensorVariant res = allocate_tensor(out_shape);
+  // TODO: maybe add some asserts()
+
+  // Perform inverse FFT
+  for (auto &result : spectres)
+  {
+    ifft_CT(result.data(), result.size());
+  }
 
-  // TODO: implement
+  // Allocate tensor
+  TensorVariant res = allocate_tensor(outShape);
+  auto resAccessor = Tensor<float>(res);
+
+  // Move our results to it
+  ShapeRange outRange(outShape);
+  uint32_t width = inShape.dim(1);
+  // We have to multiply by number of channels, because
+  // only every first of three elements corresponds to
+  // correct convolution by channels, the rest are
+  // results of shifted convolution
+  uint32_t inChannels = inShape.dim(2);
+  for (auto &outIdx : outRange)
+  {
+    resAccessor.at(outIdx) = spectres[outIdx.at(2)][inChannels * ((outIdx.at(0) * strides.dim(0) + paddings.at(0)) * width +
+                                                                   outIdx.at(1) * strides.dim(1) + paddings.at(1))].real();
+  }
 
   return res;
 }
 
+void Conv2D_FFT::separate (FFT_complex* array,
+                           const uint64_t elements)
+{
+  const uint64_t half_size = elements / 2;
+
+  // Temporary heap to store odd elements.
+  FFT_complex *tmp = new FFT_complex[half_size];
+  for (uint64_t i = 0; i < half_size; i++) {
+    tmp[i] = array[i * 2 + 1];
+  }
+
+  // Copy even elements
+  for (uint64_t i = 0; i < half_size; i++) {
+    array[i] = array[i * 2];
+  }
+  // Copy odd elements
+  for (uint64_t i = 0; i < half_size; i++) {
+    array[i + half_size] = tmp[i];
+  }
+
+  delete[] tmp;
+}
+
+void Conv2D_FFT::fft_CT(FFT_complex* array,
+                        const uint64_t elements)
+{
+  if (elements > 1) {
+    separate(array, elements);
+    fft_CT(array, elements / 2);
+    fft_CT(array + elements / 2, elements / 2);
+    for(int i = 0; i < elements / 2; i++) {
+      FFT_complex even = array[i];
+      FFT_complex odd  = array[i + elements / 2];
+      FFT_complex twiddle = exp(FFT_complex(0, -2. * M_PI * i / elements));
+
+      array[i] = even + twiddle * odd;
+      array[i + elements / 2] = even - twiddle * odd;
+    }
+  }
+}
+
+// TODO: using paddings and strides we can theoretically reduce number of elements calculated
+void Conv2D_FFT::ifft_CT(FFT_complex* array,
+                         const uint64_t elements)
+{
+  if (elements > 1) {
+    separate(array, elements);
+    ifft_CT(array, elements / 2);
+    ifft_CT(array + elements / 2, elements / 2);
+    for(int i = 0; i < elements / 2; i++) {
+      FFT_complex even = array[i];
+      FFT_complex odd  = array[i + elements / 2];
+      FFT_complex twiddle = exp(FFT_complex(0, 2. * M_PI * i / elements));
+
+      array[i] = (even + twiddle * odd) / FFT_complex(2.0f, 0.0f);
+      array[i + elements / 2] = (even - twiddle * odd) / FFT_complex(2.0f, 0.0f);
+    }
+  }
+}
+
 } // namespace impl
 } // namespace interpreter
 } // namespace backend
-- 
2.7.4