Fix arm compute cl kernel for transpose conv (#5543)
author오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Wed, 3 Jul 2019 09:29:03 +0000 (18:29 +0900)
committer이춘석/On-Device Lab(SR)/Staff Engineer/삼성전자 <chunseok.lee@samsung.com>
Wed, 3 Jul 2019 09:29:03 +0000 (18:29 +0900)
* Fix arm compute cl kernel for transpose conv

Fix arm compute cl kernel for transpose conv
- Padding calculation
- Allow asymmetric padding
- Get and fix upsample layer and kernel
- Rename DeconvolutionXXX to TransposeConvXXX
- Enable all transpose conv tests

Signed-off-by: Hyeongseok Oh <hseok82.oh@samsung.com>
* Fix pacl

14 files changed:
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h [new file with mode: 0644]
libs/ARMComputeEx/arm_compute/core/UtilsEx.h
libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h [new file with mode: 0644]
libs/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h [moved from libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLDeconvolutionLayerEx.h with 53% similarity]
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp [new file with mode: 0644]
libs/ARMComputeEx/src/core/UtilsEx.cpp
libs/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp [moved from libs/ARMComputeEx/src/runtime/CL/functions/CLDeconvolutionLayerEx.cpp with 69% similarity]
libs/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp [new file with mode: 0644]
runtimes/neurun/backend/acl_cl/StageGenerator.cc
runtimes/pure_arm_compute/src/compilation.cc
tests/framework/tests/transpose_conv/config.sh
tests/nnapi/nnapi_gtest.skip.armv7l-linux

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
new file mode 100644 (file)
index 0000000..c5ef730
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL.
+ */
+class CLTransposeConvLayerUpsampleKernel : public ICLKernel
+{
+public:
+  /** Constructor */
+  CLTransposeConvLayerUpsampleKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLTransposeConvLayerUpsampleKernel &
+  operator=(const CLTransposeConvLayerUpsampleKernel &) = delete;
+  /** Default Move Constructor. */
+  CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default;
+  /** Default move assignment operator */
+  CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default;
+  /** Default destructor */
+  ~CLTransposeConvLayerUpsampleKernel() = default;
+
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input        Source tensor. Data types supported: QASYMM8/F16/F32.
+   * @param[out] output       Destination tensor. Data types supported: same as @p input. All but
+   * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only
+   * performed within the XY-plane.
+   * @param[in]  inner_border Top and right inner border sizes. These rows and columns will be
+   * filled with zero.
+   * @param[in]  info         Contains padding and stride information described in @ref
+   * PadStrideInfo.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+                 const PadStrideInfo &info);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLTransposeConvLayerUpsample
+   *
+   * @param[in] input        Source tensor info. Data types supported: QASYMM8/F16/F32.
+   * @param[in] output       Destination tensor info. Data types supported: same as @p input. All
+   * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is
+   * only performed within the XY-plane.
+   * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled
+   * with zero.
+   * @param[in] info         Contains padding and stride information described in @ref
+   * PadStrideInfo.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const BorderSize &inner_border, const PadStrideInfo &info);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;
+  ICLTensor *_output;
+  BorderSize _inner_border;
+  PadStrideInfo _info;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */
index 3fb3955..39026e6 100644 (file)
 
 #include <utility>
 
+#include "arm_compute/core/Types.h"
+
 namespace arm_compute
 {
 
-/** Returns expected width and height of the deconvolution's output tensor.
+/** Returns expected width and height of the transpose convolution's output tensor.
  *
  * @note This function was copied in order to fix a bug computing to wrong output dimensions.
- *       The formula for computing the output dimension is: o = s*(i - 1) + a + k - 2*p
- *         k: kernel size
- *         s: stride
- *         i: input size
- *         o: output size
- *         p: padding
- *         a: inner border
- *       Refer to : https://github.com/ARM-software/ComputeLibrary/issues/523#issuecomment-414606797
  *
  * @param[in] in_width      Width of input tensor (Number of columns)
  * @param[in] in_height     Height of input tensor (Number of rows)
  * @param[in] kernel_width  Kernel width.
  * @param[in] kernel_height Kernel height.
- * @param[in] padx          X axis padding.
- * @param[in] pady          Y axis padding.
- * @param[in] stride_x      X axis input stride.
- * @param[in] stride_y      Y axis input stride.
- * @param[in] inner_border_right  The number of zeros added to right edge of the input.
- * @param[in] inner_border_top    The number of zeros added to top edge of the input.
+ * @param[in] info          padding and stride info.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_top   The number of zeros added to top edge of the output.
  *
  * @return A pair with the new width in the first position and the new height in the second.
  */
-const std::pair<unsigned int, unsigned int> deconvolution_output_dimensions_ex(
-    unsigned int in_width, unsigned int in_height, unsigned int kernel_width,
-    unsigned int kernel_height, unsigned int padx, unsigned int pady, unsigned int stride_x,
-    unsigned int stride_y, unsigned int inner_border_right = 0, unsigned int inner_border_top = 0);
+const std::pair<unsigned int, unsigned int>
+transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+                                unsigned int kernel_width, unsigned int kernel_height,
+                                const PadStrideInfo &info, unsigned int invalid_right,
+                                unsigned int invalid_top);
 }
 #endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
new file mode 100644 (file)
index 0000000..367129e
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+
+/** Calculate the upsampled output shape used for transpose convolution
+ *
+ * @param[in] input              Input tensor info
+ * @param[in] weights            Weights tensor shape
+ * @param[in] info               Padding and stride info
+ * @param[in] out_dims           Output shape dimensions
+ * @param[in] pad_left           Padding on left
+ * @param[in] pad_right          Padding on right
+ * @param[in] pad_top            Padding on top
+ * @param[in] pad_bottom         Padding on bottom
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_transposeconv_upsampled_shape(
+    const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+    std::pair<unsigned int, unsigned int> &out_dims, unsigned int &pad_left,
+    unsigned int &pad_right, unsigned int &pad_top, unsigned int &pad_bottom)
+{
+  unsigned int sx = info.stride().first;
+  unsigned int sy = info.stride().second;
+  const DataLayout data_layout = input.data_layout();
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  // Find the upsampled dimensions
+  // transpose conv out:
+  //    tconv_out + pad = 1 + (in - 1) * stride + invalid
+  //    tconv_out = 1 + (in - 1) * stride + invalid - pad
+  // upsample out:
+  //    upsample_out = 1 + (in - 1) * stride
+  unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
+  unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+
+  // Find the padding needed for the convolution with stride 1 in order to match output shape
+  // upsample+pad out:
+  //    upsample_out + pad = tconv_out + kernel - 1
+  //    pad = tconv_out + kernel - 1 - upsample_out
+  unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
+  unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
+  out_x += padx;
+  out_y += pady;
+
+  unsigned int padx_all = padx + info.pad_left() + info.pad_right();
+  unsigned int pady_all = pady + info.pad_top() + info.pad_bottom();
+  pad_left = (padx_all + 1) / 2 - info.pad_left();
+  pad_right = padx_all / 2 - info.pad_right();
+  pad_top = (pady_all + 1) / 2 - info.pad_top();
+  pad_bottom = pady_all / 2 - info.pad_bottom();
+
+  TensorShape scale_out_shape(input.tensor_shape());
+  scale_out_shape.set(idx_w, out_x);
+  scale_out_shape.set(idx_h, out_y);
+
+  return scale_out_shape;
+}
+
+/** Calculate the output shape of the transpose convolution layer
+ *
+ * @param[in] out_dims Output x and y shape dimensions
+ * @param[in] input    Input tensor info
+ * @param[in] weights  Weights tensor shape
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+                                   const ITensorInfo &input, const ITensorInfo &weights)
+{
+  const TensorShape input_shape{input.tensor_shape()};
+  const TensorShape weights_shape{weights.tensor_shape()};
+
+  const DataLayout data_layout = input.data_layout();
+  const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const int channel_idx =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+  const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+  TensorShape out_shape{input_shape};
+  out_shape.set(width_idx, out_dims.first);
+  out_shape.set(height_idx, out_dims.second);
+  out_shape.set(channel_idx, weights_shape[batch_idx]);
+  return out_shape;
+}
+
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
index 277e1e7..3b7fcde 100644 (file)
@@ -20,7 +20,6 @@
 #include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h>
 #include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
 #include <arm_compute/runtime/CL/functions/CLCast.h>
-#include <arm_compute/runtime/CL/functions/CLDeconvolutionLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
 #include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
 #include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
@@ -38,5 +37,6 @@
 #include <arm_compute/runtime/CL/functions/CLSplit.h>
 #include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h>
 #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
+#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
 
 #endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLDECONVOLUTIONLAYEREX_H__
-#define __ARM_COMPUTE_CLDECONVOLUTIONLAYEREX_H__
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
 
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
 
 #include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
 
 namespace arm_compute
 {
 class ICLTensor;
-/** Function to run the deconvolution layer.
+/** Function to run the transpose convolution layer.
  *
  * @note This layer was copied in order to fix a bug computing to wrong output dimensions.
  *
- * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
  * depending on the stride and pad info and then perform a 1x1
  * convolution pass. Input stride defines how many zeroes we should put between each element of the
  * input, pad is the amount of padding and finally a is a user
@@ -53,10 +53,10 @@ class ICLTensor;
  *
  *  The relation between input to output is as follows:
  *  \f[
- *       width\_output = (width\_input - 1) \cdot stride\_x - \cdot padding\_x + kernel\_x
+ *       width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
  *  \f]
  *  \f[
- *       height\_output = (height\_input - 1) \cdot stride\_y - \cdot padding\_y + kernel\_y
+ *       height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
  *  \f]
  *
  *  where:
@@ -74,69 +74,69 @@ class ICLTensor;
  *
  * This function calls the following OpenCL kernels/functions:
  *
- * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLTransposeConvLayerUpsample
  * -# @ref CLConvolutionLayer
  *
  */
-class CLDeconvolutionLayerEx : public IFunction
+class CLTransposeConvLayer : public IFunction
 {
 public:
   /** Constructor */
-  CLDeconvolutionLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
   /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLDeconvolutionLayerEx(const CLDeconvolutionLayerEx &) = delete;
+  CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
   /** Default move constructor */
-  CLDeconvolutionLayerEx(CLDeconvolutionLayerEx &&) = default;
+  CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
   /** Prevent instances of this class from being copied (As this class contains pointers) */
-  CLDeconvolutionLayerEx &operator=(const CLDeconvolutionLayerEx &) = delete;
+  CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
   /** Default move assignment operator */
-  CLDeconvolutionLayerEx &operator=(CLDeconvolutionLayerEx &&) = default;
+  CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
   /** Set the input, weights, biases and output tensors.
    *
-   * @param[in,out] input              Input tensor. 3 lower dimensions represent a single input,
-   * and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8/F16/F32.
-   * @param[in]     weights            The 4d weights with dimensions [width, height, IFM, OFM].
-   * Data type supported: Same as @p input.
-   * @param[in]     bias               (Optional) The biases have one dimension. Data type
-   * supported: Same as @p input.
-   * @param[out]    output             Output tensor. The output has the same number of dimensions
-   * as the @p input.
-   * @param[in]     info               Contains padding and policies to be used in the
-   * deconvolution, this is decribed in @ref PadStrideInfo.
-   * @param[in]     inner_border_right The number of zeros added to right edge of the input.
-   * @param[in]     inner_border_top   The number of zeros added to top edge of the input.
-   * @param[in]     weights_info       (Optional) Weights information needed for @ref
-   * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
-   * CLWeightsReshapeKernel.
-   *
+   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input,
+   *                               and an optional 4th dimension for batch of inputs.
+   *                               Data types supported: QASYMM8/F16/F32.
+   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM].
+   *                               Data type supported: Same as @p input.
+   * @param[in]     bias           (Optional) The biases have one dimension. Data type supported:
+   *                               Same as @p input.
+   * @param[out]    output         Output tensor. The output has the same number of dimensions
+   *                               as the @p input.
+   * @param[in]     info           Contains padding and policies to be used in the
+   *                               deconvolution, this is described in @ref PadStrideInfo.
+   * @param[in]     invalid_right  The number of zeros added to right edge of the input.
+   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the input.
+   * @param[in]     weights_info   (Optional) Weights information needed for @ref
+   *                               CLConvolutionLayer, specifies if the weights tensor has been
+   *                               reshaped with @ref CLWeightsReshapeKernel.
    */
   void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                 const PadStrideInfo &info, unsigned int inner_border_right,
-                 unsigned int inner_border_top, const WeightsInfo &weights_info = WeightsInfo());
+                 const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+                 const WeightsInfo &weights_info = WeightsInfo());
   /** Static function to check if given info will lead to a valid configuration of @ref
-   * CLDeconvolutionLayerEx
-   *
-   * @param[in] input              Input tensor info. 3 lower dimensions represent a single input,
-   * and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8/F16/F32.
-   * @param[in] weights            The 4d weights info with dimensions [width, height, IFM, OFM].
-   * Data type supported: Same as @p input.
-   * @param[in] bias               (Optional) The biases have one dimension. Data type supported:
-   * Same as @p input.
-   * @param[in] output             Output tensor info. The output has the same number of dimensions
-   * as the @p input.
-   * @param[in] info               Contains padding and policies to be used in the deconvolution,
-   * this is decribed in @ref PadStrideInfo.
-   * @param[in] inner_border_right The number of zeros added to right edge of the input.
-   * @param[in] inner_border_top   The number of zeros added to top edge of the input.
-   * @param[in] weights_info       (Optional) Weights information needed for @ref
-   * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
-   * CLWeightsReshapeKernel.
+   * CLTransposeConvLayer
    *
+   * @param[in] input           Input tensor info. 3 lower dimensions represent a single input,
+   *                            and an optional 4th dimension for batch of inputs.
+   *                            Data types supported: QASYMM8/F16/F32.
+   * @param[in] weights         The 4d weights info with dimensions [width, height, IFM, OFM].
+   *                            Data type supported: Same as @p input.
+   * @param[in] bias            (Optional) The biases have one dimension. Data type supported:
+   *                            Same as @p input.
+   * @param[in] output          Output tensor info. The output has the same number of dimensions
+   *                            as the @p input.
+   * @param[in] info            Contains padding and policies to be used in the deconvolution,
+   *                            this is described in @ref PadStrideInfo.
+   * @param[in] innvalid_right  The number of zeros added to right edge of the input.
+   * @param[in] invalid_bottom  The number of zeros added to bottom edge of the input.
+   * @param[in] weights_info    (Optional) Weights information needed for @ref CLConvolutionLayer,
+   *                            specifies if the weights tensor has been reshaped with @ref
+   *                            CLWeightsReshapeKernel.
    * @return a status
    */
   static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
                          const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                         unsigned int inner_border_right, unsigned int inner_border_top,
+                         unsigned int innvalid_right, unsigned int invalid_bottom,
                          const WeightsInfo &weights_info = WeightsInfo());
 
   // Inherited methods overridden:
@@ -145,7 +145,7 @@ public:
 
 private:
   CLMemoryGroup _memory_group;
-  CLDeconvolutionLayerUpsample _scale_f;
+  CLTransposeConvLayerUpsample _scale_f;
   CLConvolutionLayer _conv_f;
   CPPFlipWeightsKernel _flip_weights;
   CLTensor _scaled_output;
@@ -154,4 +154,4 @@ private:
   bool _is_prepared;
 };
 }
-#endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYEREX_H__ */
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
new file mode 100644 (file)
index 0000000..4ae0e18
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
+class CLTransposeConvLayerUpsample : public IFunction
+{
+public:
+  /** Default constructor */
+  CLTransposeConvLayerUpsample();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
+  /** Allow instances of this class to be moved */
+  CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
+  /** Allow instances of this class to be moved */
+  CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
+  /** Default destructor */
+  virtual ~CLTransposeConvLayerUpsample() = default;
+
+  /** Initialize the function's source, destination, interpolation type and border_mode.
+   *
+   * @param[in, out] input        Source tensor. Data type supported: QASYMM8/F16/F32.
+   * @param[out]     output       Destination tensor. Data type supported: same as @p input.
+   * @param[in]      inner_border The number of zeros added to right and top edges of the input.
+   * @param[in]      info         Contains padding and policies to be used in the deconvolution.
+   */
+  void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+                 const PadStrideInfo &info);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLTransposeConvLayerUpsample
+   *
+   * @param[in] input        Source tensor info. Data type supported: QASYMM8/F16/F32.
+   * @param[in] output       Destination tensor info. Data type supported: same as @p input.
+   * @param[in] inner_border The number of zeros added to right and top edges of the input.
+   * @param[in] info         Contains padding and policies to be used in the deconvolution.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const BorderSize &inner_border, const PadStrideInfo &info);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  CLTransposeConvLayerUpsampleKernel _upsample;
+  ICLTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
new file mode 100644 (file)
index 0000000..6cc8d9d
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
+    : _input(nullptr), _output(nullptr), _inner_border(), _info()
+{
+}
+
+Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
+                                                    const ITensorInfo *output,
+                                                    const BorderSize &inner_border,
+                                                    const PadStrideInfo &info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+
+  const DataLayout data_layout = input->data_layout();
+
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+  for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
+                                  "inner_border_right must be smaller than stride_x");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
+                                  "inner_border_top must be smaller than stride_y");
+
+  return Status{};
+}
+
+void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                                   const BorderSize &inner_border,
+                                                   const PadStrideInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  _input = input;
+  _output = output;
+  _inner_border = inner_border;
+  _info = info;
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
+      input->info(), output->info(), inner_border, info));
+
+  // Create kernel
+  CLBuildOptions build_opts;
+  build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
+
+  constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const DataLayout data_layout = _input->info()->data_layout();
+
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  const int out_start_x = _info.pad_left();
+  const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
+                        _info.pad_right() + _info.stride().first - 1;
+  const int out_step_x = _info.stride().first;
+
+  const int out_start_y = _inner_border.top + _info.pad_top();
+  const int out_end_y =
+      _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
+  const int out_step_y = _info.stride().second;
+
+  switch (data_layout)
+  {
+    case DataLayout::NCHW:
+    {
+      Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+      Window slice_out = collapsed.first_slice_window_3D();
+      slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+      slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+      Window slice_in = collapsed.first_slice_window_3D();
+
+      do
+      {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+      } while (collapsed.slide_window_slice_3D(slice_in) &&
+               collapsed.slide_window_slice_3D(slice_out));
+      break;
+    }
+    case DataLayout::NHWC:
+    {
+      // NOTE: not collapsing in NHWC
+      Window slice_out = window.first_slice_window_3D();
+      slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+      slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+      Window slice_in = window.first_slice_window_3D();
+
+      do
+      {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+      } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+      break;
+    }
+    default:
+      ARM_COMPUTE_ERROR("Unsupported data layout");
+  }
+}
index 3322796..94242b5 100644 (file)
 
 using namespace arm_compute;
 
-const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions_ex(
-    unsigned int in_width, unsigned int in_height, unsigned int kernel_width,
-    unsigned int kernel_height, unsigned int padx, unsigned int pady, unsigned int stride_x,
-    unsigned int stride_y, unsigned int inner_border_right, unsigned int inner_border_top)
+const std::pair<unsigned int, unsigned int>
+arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+                                             unsigned int kernel_width, unsigned int kernel_height,
+                                             const PadStrideInfo &info, unsigned int invalid_right,
+                                             unsigned int invalid_bottom)
 {
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+  const unsigned int padx = info.pad_left() + info.pad_right();
+  const unsigned int pady = info.pad_top() + info.pad_bottom();
+
   ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
-  ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width) < 2 * padx);
-  ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height) < 2 * pady);
-  const int w = stride_x * (in_width - 1) + kernel_width - 2 * padx + inner_border_right;
-  const int h = stride_y * (in_height - 1) + kernel_height - 2 * pady + inner_border_top;
+  ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
+  ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
+
+  // Find the transpose conv out dimensions
+  // transpose conv out:
+  //    tconv_out + pad = 1 + (in - 1) * stride + invalid
+  //    tconv_out = 1 + (in - 1) * stride + invalid - pad
+  const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
+  const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
 
   return std::make_pair<unsigned int, unsigned int>(w, h);
 }
@@ -22,7 +22,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerEx.h"
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
@@ -38,8 +39,7 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLDeconvolutionLayerEx::CLDeconvolutionLayerEx(
-    std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
       _scale_f(),
       _conv_f(),
@@ -51,11 +51,10 @@ CLDeconvolutionLayerEx::CLDeconvolutionLayerEx(
 {
 }
 
-Status CLDeconvolutionLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                                        const ITensorInfo *bias, ITensorInfo *output,
-                                        const PadStrideInfo &info, unsigned int inner_border_right,
-                                        unsigned int inner_border_top,
-                                        const WeightsInfo &weights_info)
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                      const ITensorInfo *bias, ITensorInfo *output,
+                                      const PadStrideInfo &info, unsigned int invalid_right,
+                                      unsigned int invalid_bottom, const WeightsInfo &weights_info)
 {
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
@@ -70,24 +69,21 @@ Status CLDeconvolutionLayerEx::validate(const ITensorInfo *input, const ITensorI
 
   ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
   ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
-  const unsigned int stride_x = info.stride().first;
-  const unsigned int stride_y = info.stride().second;
+  const unsigned int kernel_x = weights->dimension(idx_w);
+  const unsigned int kernel_y = weights->dimension(idx_h);
 
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1,
-                                  "inner_border_right must be smaller than stride_x");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1,
-                                  "inner_border_top must be smaller than stride_y");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+                                  "invalid_right must be smaller than kernel_x");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+                                  "invalid_bottom must be smaller than kernel_y");
 
-  // NOTE From the existing CLDeconvolutionLayer, inner_border_right and inner_border_top were
-  // added.
-  auto out_dims = deconvolution_output_dimensions_ex(
+  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
+  auto out_dims = transposeconv_output_dimensions(
       input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
-      weights->dimension(idx_h), info.pad().first, info.pad().second, stride_x, stride_y,
-      inner_border_right, inner_border_top);
+      weights->dimension(idx_h), info, invalid_right, invalid_bottom);
 
-  const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
+  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
 
   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
 
@@ -111,11 +107,12 @@ Status CLDeconvolutionLayerEx::validate(const ITensorInfo *input, const ITensorI
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
                                   "Output's depth is invalid.");
 
-  unsigned int padx = 0;
-  unsigned int pady = 0;
-  const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
-      *input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx,
-      pady);
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input, *weights, info, out_dims, pad_left, pad_right, pad_top, pad_bottom);
   TensorInfo scale_out_info(input->clone()
                                 ->set_is_resizable(true)
                                 .reset_padding()
@@ -123,19 +120,18 @@ Status CLDeconvolutionLayerEx::validate(const ITensorInfo *input, const ITensorI
                                 .set_data_layout(data_layout));
   const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
-  ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(
-      input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
   ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
                                                            conv_info, weights_info));
 
   return Status{};
 }
 
-void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
-                                       ICLTensor *output, const PadStrideInfo &info,
-                                       unsigned int inner_border_right,
-                                       unsigned int inner_border_top,
-                                       const WeightsInfo &weights_info)
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+                                     ICLTensor *output, const PadStrideInfo &info,
+                                     unsigned int invalid_right, unsigned int invalid_bottom,
+                                     const WeightsInfo &weights_info)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -151,15 +147,15 @@ void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, con
   _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
   _flip_weights.configure(weights, &_weights_flipped);
 
-  // NOTE From the existing CLDeconvolutionLayer, inner_border_right and inner_border_top were
+  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were
   // added.
-  auto out_dims = deconvolution_output_dimensions_ex(
+  auto out_dims = transposeconv_output_dimensions(
       input->info()->dimension(idx_w), input->info()->dimension(idx_h),
-      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info.pad().first,
-      info.pad().second, stride_x, stride_y, inner_border_right, inner_border_top);
+      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+      invalid_bottom);
 
   const TensorShape output_shape =
-      compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
 
   // Output auto initialization if not yet initialized
   auto_init_if_empty(
@@ -167,9 +163,9 @@ void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, con
       input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
   // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerEx::validate(
+  ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
       input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
-      info, inner_border_right, inner_border_top));
+      info, invalid_right, invalid_bottom));
 
   _is_prepared = weights_info.retain_internal_weights();
 
@@ -177,11 +173,12 @@ void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, con
 
   // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
   // to match output shape
-  unsigned int padx = 0;
-  unsigned int pady = 0;
-  const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
-      *input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top,
-      out_dims, padx, pady);
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input->info(), *weights->info(), info, out_dims, pad_left, pad_right, pad_top, pad_bottom);
 
   TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
                             input->info()->quantization_info());
@@ -189,9 +186,9 @@ void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, con
   _scaled_output.allocator()->init(scale_out_info);
 
   // configure scale function
-  const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
-  _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right),
-                     upsample_info);
+  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                    DimensionRoundingType::FLOOR);
+  _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
 
   // setup the function to convolve the upscaled output
   const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
@@ -199,7 +196,7 @@ void CLDeconvolutionLayerEx::configure(ICLTensor *input, ICLTensor *weights, con
   _scaled_output.allocator()->allocate();
 }
 
-void CLDeconvolutionLayerEx::run()
+void CLTransposeConvLayer::run()
 {
   prepare();
 
@@ -211,7 +208,7 @@ void CLDeconvolutionLayerEx::run()
   _memory_group.release();
 }
 
-void CLDeconvolutionLayerEx::prepare()
+void CLTransposeConvLayer::prepare()
 {
   if (!_is_prepared)
   {
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
new file mode 100644 (file)
index 0000000..0ce3e67
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
+    : _upsample(),
+      _output(nullptr)
+{
+}
+
+Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                              const BorderSize &inner_border,
+                                              const PadStrideInfo &info)
+{
+  return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
+                                             const BorderSize &inner_border,
+                                             const PadStrideInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  _output = output;
+  _upsample.configure(input, _output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::run()
+{
+  _output->map(CLScheduler::get().queue(), true);
+  if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
+  {
+    const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+    std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+  }
+  else
+  {
+    memset(_output->buffer(), 0, _output->info()->total_size());
+  }
+  _output->unmap(CLScheduler::get().queue());
+
+  CLScheduler::get().enqueue(_upsample, false);
+}
index 9ba4c1a..7304c34 100644 (file)
@@ -2403,6 +2403,8 @@ void StageGenerator::visit(const model::operation::TransposeConvNode &node)
 
     model::ExplicitPadding padding;
     model::Stride stride;
+    uint32_t invalid_horizontal;
+    uint32_t invalid_vertical;
   };
 
   Param param;
@@ -2417,6 +2419,18 @@ void StageGenerator::visit(const model::operation::TransposeConvNode &node)
          (node.param().padding.type == model::PaddingType::VALID));
   param.padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape,
                                                  param.stride, ker_shape.W, ker_shape.H);
+  if (node.param().padding.type == model::PaddingType::VALID)
+  {
+    param.invalid_horizontal =
+        ofm_shape.W - (1 + (ifm_shape.W - 1) * param.stride.horizontal) - (ker_shape.W - 1);
+    param.invalid_vertical =
+        ofm_shape.H - (1 + (ifm_shape.H - 1) * param.stride.vertical) - (ker_shape.H - 1);
+  }
+  else
+  {
+    param.invalid_horizontal = 0;
+    param.invalid_vertical = 0;
+  }
 
   auto tensors = _tensor_builder;
 
@@ -2425,21 +2439,14 @@ void StageGenerator::visit(const model::operation::TransposeConvNode &node)
     auto ifm_alloc = tensors->at(param.ifm_index).get();
     auto ker_alloc = tensors->at(param.ker_index).get();
 
-    std::unique_ptr<::arm_compute::IFunction> fn;
-
-    auto l = nnfw::cpp14::make_unique<::arm_compute::CLDeconvolutionLayerEx>();
+    const auto tconv_info = acl_common::asPadStrideInfo(param.padding, param.stride);
 
-    auto padding = param.padding;
-    auto inner_border_right = padding.right - padding.left;
-    auto inner_border_top = padding.bottom - padding.top;
+    std::unique_ptr<::arm_compute::IFunction> fn;
 
-    padding.left = padding.right;
-    padding.top = padding.bottom;
-    auto symmetric_tconv_info =
-        ::neurun::backend::acl_common::asPadStrideInfo(padding, param.stride);
+    auto l = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>();
 
-    l->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(),
-                 symmetric_tconv_info, inner_border_right, inner_border_top);
+    l->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
+                 param.invalid_horizontal, param.invalid_vertical);
 
     fn = std::move(l);
 
index b2b6505..666d27e 100644 (file)
@@ -4011,6 +4011,8 @@ void Planner::visit(const ::internal::tflite::op::TransposeConv::Node &node)
     int ker_index;
     Padding padding;
     Stride stride;
+    uint32_t invalid_horizontal;
+    uint32_t invalid_vertical;
   };
 
   Param param;
@@ -4026,6 +4028,15 @@ void Planner::visit(const ::internal::tflite::op::TransposeConv::Node &node)
                       ? same_padding(ofm_shape, ifm_shape, param.stride, ker_shape.W, ker_shape.H)
                       : valid_padding();
 
+  param.invalid_horizontal =
+      (padding_type == ANEURALNETWORKS_PADDING_SAME)
+          ? 0
+          : ofm_shape.W - (1 + (ifm_shape.W - 1) * param.stride.horizontal) - (ker_shape.W - 1);
+  param.invalid_vertical =
+      (padding_type == ANEURALNETWORKS_PADDING_SAME)
+          ? 0
+          : ofm_shape.H - (1 + (ifm_shape.H - 1) * param.stride.vertical) - (ker_shape.H - 1);
+
   auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
     auto ofm_alloc = ctx.at(::internal::tflite::operand::Index{param.ofm_index});
     auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
@@ -4036,19 +4047,13 @@ void Planner::visit(const ::internal::tflite::op::TransposeConv::Node &node)
 
     if (::internal::arm_compute::isGpuMode())
     {
-      auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDeconvolutionLayerEx>();
-
-      auto padding = param.padding;
-      auto inner_border_right = padding.right - padding.left;
-      auto inner_border_top = padding.bottom - padding.top;
+      auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>();
 
-      padding.left = padding.right;
-      padding.top = padding.bottom;
-      auto symmetric_tconv_info = asPadStrideInfo(padding, param.stride);
+      auto symmetric_tconv_info = asPadStrideInfo(param.padding, param.stride);
 
       // TODO Support WeightInfo in some cases in order to performance improvement
       fn->configure(CAST_CL(ifm_alloc), CAST_CL(ker_alloc), nullptr, CAST_CL(ofm_alloc),
-                    symmetric_tconv_info, inner_border_right, inner_border_top);
+                    symmetric_tconv_info, param.invalid_horizontal, param.invalid_vertical);
       builder.append("TransposeConv", std::move(fn));
     }
     else
index ee76672..88ea9df 100644 (file)
@@ -11,7 +11,6 @@ GeneratedTests.prelu_ex_quant8_1
 GeneratedTests.prelu_ex_broadcast_quant8_1
 # Unexpected result
 GeneratedTests.pack*
-GeneratedTests.transpose_conv_ex_float_4
 # Not support broadcast
 GeneratedTests.logical_or_ex_broadcast_4D_2D
 # Unsupported optional input that has shape