Add CLkernel for Generic Padding (#3428)
author Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 <shub98.gupta@samsung.com>
Wed, 28 Nov 2018 01:34:54 +0000 (07:04 +0530)
committer 이춘석/동작제어Lab(SR)/Staff Engineer/삼성전자 <chunseok.lee@samsung.com>
Wed, 28 Nov 2018 01:34:54 +0000 (10:34 +0900)
This patch will add cl kernel to execute pad op on GPU

Signed-off-by: shubham <shub98.gupta@samsung.com>
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h [new file with mode: 0644]
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayer.h [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp [new file with mode: 0644]
libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayer.cpp [new file with mode: 0644]
runtimes/pure_arm_compute/src/compilation.cc

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernel.h
new file mode 100644 (file)
index 0000000..cbaa2ad
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform PAD operation */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLPadLayerKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLPadLayerKernel(CLPadLayerKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+  /** Default destructor */
+  ~CLPadLayerKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input    Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output   Output tensor. Data types supported: same as @p input.
+   * @param[in]  pad_size Padding size tensor; read (mapped) at run time, so it must stay
+   *                      valid until the kernel has executed. Data types supported: S32
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input; /**< Source tensor */
+  ICLTensor *_output;      /**< Destination tensor */
+  ICLTensor *_pad_size;    /**< Padding Size tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayer.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayer.h
new file mode 100644 (file)
index 0000000..d8ad0e1
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYER_H__
+#define __ARM_COMPUTE_CLPADLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPadLayerKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The output tensor must already be allocated with the padded shape; the pad
+ *       values themselves are read from @p pad_size when the function runs.
+ */
+class CLPadLayer : public ICLSimpleFunction
+{
+public:
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]           input     Input tensor. Data types supported:
+   *                                U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out]          output    Output tensor. Data types supported:
+   *                                U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]           pad_size  Tensor for Padding values in NHWC format shape [n, 2],
+   *                                where n is the rank of tensor . Data types supported: S32
+   */
+  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYER_H__ */
index 844a3f1..ab7962a 100644 (file)
@@ -226,6 +226,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
     {"NV21_to_YUV444_bt709", "color_convert.cl"},
     {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+    {"pad", "pad.cl"},
     {"permute_201", "permute.cl"},
     {"permute_120", "permute.cl"},
     {"permute_3201", "permute.cl"},
@@ -385,6 +386,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/notequal.clembed"
     },
     {
+        "pad.cl",
+#include "./cl_kernels/pad.clembed"
+    },
+    {
         "pixelwise_div_float.cl",
 #include "./cl_kernels/pixelwise_div_float.clembed"
     },
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl
new file mode 100644 (file)
index 0000000..ecf4696
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+/** Perform padding of a 4-D tensor with a constant value
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ *
+ * @param[in]  pad_values                           Padding values for each of the dimensions. Only pad values for Up(for
+ *                                                  batch), Top(for height), Left(for width) and Front(for depth) are
+ *                                                  required. Supported data type: S32
+ */
+
+__kernel void pad(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output),
+     const int4 pad_values)
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    // Output coordinate of this work-item, decoded from the 3-D NDRange.
+    int index[4]={0};
+
+    index[0] = get_global_id(0);//W
+    index[1] = get_global_id(1);//H
+    index[2] = get_global_id(2) % DEPTH_OUT;//C
+    index[3] = get_global_id(2) / DEPTH_OUT;//N
+
+    // pad_values holds the "before" paddings as (x=left/W, y=top/H, z=front/C, w=up/N),
+    // matching the order packed by CLPadLayerKernel::run().
+    // Coordinates outside [pad, pad + input_extent) lie in the border: write ZERO_VALUE.
+    if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
+        index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
+        index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
+        index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        // Interior: copy the corresponding input element (shift back by the padding).
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)
+                                              tensor4D_offset(&in, index[0] - pad_values.x,
+                                                              index[1] - pad_values.y,
+                                                              index[2] - pad_values.z,
+                                                              index[3] - pad_values.w));
+    }
+ }
+
+#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernel.cpp
new file mode 100644 (file)
index 0000000..27d9301
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+                          const ITensorInfo *pad_size_info)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() == 4,
+                                  "Pad kernel supports only 4-D input tensor");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input_info->num_dimensions() == output_info->num_dimensions(),
+      "output tensor should have same number of dimensions as input tensor");
+
+  if (input_info->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+                                        output_info->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+  _input = input;
+  _output = output;
+  _pad_size = pad_size;
+
+  // Set kernel build options: input extents (IW/IH/ID/IB) and output depth are baked
+  // into the program so pad.cl can test the border condition without extra arguments.
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
+  build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
+  build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
+  build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    // For quantized tensors the border must encode real 0, i.e. the zero-point offset.
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+  // Configure kernel window: one work-item per output element, no vectorization steps.
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  // The whole output is written (border or copied input), so the full shape is valid.
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+  // Pad sizes live on the device; map them to the host to read the scalar values.
+  _pad_size->map(queue);
+
+  // Padding values only for up, top, left and front are required.
+  // Rows of pad_size are assumed to follow NHWC order (N, H, W, C) with the
+  // "before" padding in column 0 -- see CLPadLayer.h; TODO confirm against callers.
+  auto pad_batch_up = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0}));
+  auto pad_height_top = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 1}));
+  auto pad_width_left = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 2}));
+  auto pad_depth_front = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 3}));
+
+  _pad_size->unmap(queue);
+
+  // Pack the values as (x=left/W, y=top/H, z=front/C, w=up/N) -- the order pad.cl expects.
+  const cl_int4 paddingValues = {
+      {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+       static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice: all dimensions zeroed so only the tensor's base pointer and
+  // strides are passed; the kernel computes the input offset itself.
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    _kernel.setArg<cl_int4>(idx++, paddingValues);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayer.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayer.cpp
new file mode 100644 (file)
index 0000000..0c0b1ef
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+using namespace arm_compute;
+
+// Creates and configures the underlying CLPadLayerKernel; ICLSimpleFunction runs it.
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+  auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>();
+  k->configure(input, output, pad_size);
+  _kernel = std::move(k);
+}
index 579f569..885b803 100644 (file)
@@ -32,6 +32,7 @@
 #include <arm_compute/runtime/CL/CLSubTensor.h>
 #include <arm_compute/runtime/CL/functions/CLArithmeticAddition.h>
 #include <arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayer.h>
 #include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h>
 #include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
 #include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>
@@ -3949,21 +3950,23 @@ void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
   const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
   const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
 
-  const auto paddings_shape = _ctx.at(paddings_index).shape().asTensor();
-
+  // TODO: Support input having rank < 4
+  assert(_ctx.at(ifm_index).shape().rank() == 4);
   assert(_ctx.at(paddings_index).hasData() == true);
 
   // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(ifm_index,
+                          asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape(), false),
+                                       _ctx.at(ifm_index).type(), _ctx.at(ifm_index).scale(),
+                                       _ctx.at(ifm_index).zeroPoint()));
+  _builder.addShapeConstr(ofm_index,
+                          asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape(), false),
+                                       _ctx.at(ofm_index).type(), _ctx.at(ofm_index).scale(),
+                                       _ctx.at(ofm_index).zeroPoint()));
   _builder.addShapeConstr(
-      ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
-                              _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
-  _builder.addShapeConstr(
-      ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), _ctx.at(ofm_index).type(),
-                              _ctx.at(ofm_index).scale(), _ctx.at(ofm_index).zeroPoint()));
-  _builder.addShapeConstr(
-      paddings_index,
-      asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
-                   _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
+      paddings_index, asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape(), false),
+                                   _ctx.at(paddings_index).type(), _ctx.at(paddings_index).scale(),
+                                   _ctx.at(paddings_index).zeroPoint()));
 
   // initializer for padding
   {
@@ -4004,15 +4007,32 @@ void Planner::visit(const ::internal::tflite::op::Pad::Node &node)
     auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
     auto pad_alloc = ctx.at(::internal::tflite::operand::Index{param.padding_index});
 
-    auto fn = nnfw::make_unique<SimplePadLayer>();
+    if (from_env<bool>(std::getenv("USE_SIMPLE_PAD")))
+    {
+      // USE CPU VERSION OF PADLAYER
+      auto rank = 4;
+      auto fn = nnfw::make_unique<SimplePadLayer>();
 
-    // only 4d Tensors are supported
-    int rank = 4;
+      fn->configure(ifm_alloc, ofm_alloc, pad_alloc, getARMComputeAxises(rank));
 
-    fn->configure(ifm_alloc, ofm_alloc, pad_alloc, getARMComputeAxises(rank));
+      builder.append("PAD", std::move(fn));
+    }
+    else
+    {
+      if (::internal::arm_compute::isGpuMode()) // GPU
+      {
+        auto fn = nnfw::make_unique<::arm_compute::CLPadLayer>();
 
-    builder.append("Pad", std::move(fn));
+        fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), CAST_CL(pad_alloc));
 
+        builder.append("PAD", std::move(fn));
+      }
+      else // NEON
+      {
+        // TODO Enable NEON Support
+        throw std::runtime_error("Not supported, yet");
+      }
+    }
   };
 
   _builder.addStage(stage);