--- /dev/null
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLPADLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform PAD operation */
+class CLPadLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel(const CLPadLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel(CLPadLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernel() = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in]  input    Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in]  pad_size Padding size tensor; the padding amounts are read from it at run time.
+ *                      Data types supported: S32
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ ICLTensor *_pad_size; /**< Padding Size tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYERKERNEL_H__ */
--- /dev/null
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef __ARM_COMPUTE_CLPADLAYER_H__
+#define __ARM_COMPUTE_CLPADLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPadLayerKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The input is padded with a constant value: zero, or the quantization offset when the
+ *       data type is QASYMM8 (input and output must then share the same quantization info).
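+ *
+ * A minimal usage sketch (illustrative only; tensor allocation and data import are omitted):
+ * @code
+ * CLPadLayer pad;
+ * pad.configure(&input, &output, &pad_size); // pad_size: S32 tensor holding the paddings
+ * pad.run();
+ * @endcode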
+ */
+class CLPadLayer : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported:
+ * U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in]  pad_size Padding size tensor of shape [n, 2] (NHWC dimension order), where n is
+ *                      the rank of the input tensor. Data types supported: S32
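+ *                      For example (illustrative), padding a 4-D NHWC tensor by one element on
+ *                      each side of its H and W dimensions uses pad_size = [[0,0],[1,1],[1,1],[0,0]].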
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLPADLAYER_H__ */
{"NV21_to_RGBA8888_bt709", "color_convert.cl"},
{"NV21_to_YUV444_bt709", "color_convert.cl"},
{"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"pad", "pad.cl"},
{"permute_201", "permute.cl"},
{"permute_120", "permute.cl"},
{"permute_3201", "permute.cl"},
#include "./cl_kernels/notequal.clembed"
},
{
+ "pad.cl",
+#include "./cl_kernels/pad.clembed"
+ },
+ {
"pixelwise_div_float.cl",
#include "./cl_kernels/pixelwise_div_float.clembed"
},
--- /dev/null
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
+/** Pad the input tensor with a constant value
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input dimensions should be passed as preprocessor arguments using -DIW (width), -DIH (height), -DID (depth) and -DIB (batch), e.g. -DIW=4
+ * @attention The padding value must be passed using -DZERO_VALUE=value, e.g. -DZERO_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ *
+ * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for
+ * batch), Top(for height), Left(for width) and Front(for depth) are
+ * required. Supported data type: S32
+ */
+
+__kernel void pad(TENSOR4D_DECLARATION(input),
+                  TENSOR4D_DECLARATION(output),
+                  const int4 pad_values)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int index[4] = {0};
+
+    index[0] = get_global_id(0);             // W
+    index[1] = get_global_id(1);             // H
+    index[2] = get_global_id(2) % DEPTH_OUT; // C
+    index[3] = get_global_id(2) / DEPTH_OUT; // N
+
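+    // An output coordinate lies in the padded border when it falls before the leading pad or
+    // at/after (input extent + leading pad) in that dimension.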
+    if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) ||
+        index[1] < pad_values.y || index[1] >= (IH + pad_values.y) ||
+        index[2] < pad_values.z || index[2] >= (ID + pad_values.z) ||
+        index[3] < pad_values.w || index[3] >= (IB + pad_values.w))
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) =
+            *((__global DATA_TYPE *)tensor4D_offset(&in, index[0] - pad_values.x,
+                                                    index[1] - pad_values.y,
+                                                    index[2] - pad_values.z,
+                                                    index[3] - pad_values.w));
+    }
+}
+
+#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE)
--- /dev/null
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input_info, const ITensorInfo *output_info,
+ const ITensorInfo *pad_size_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_info, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_info, 1, DataType::U8, DataType::QASYMM8,
+ DataType::S16, DataType::S32, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(pad_size_info, 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->num_dimensions() != 4,
+                                 "Pad kernel supports only 4-D input tensor");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+     input_info->num_dimensions() != output_info->num_dimensions(),
+     "output tensor should have same number of dimensions as input tensor");
+
+ if (input_info->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_info->quantization_info() !=
+ output_info->quantization_info(),
+ "The input and output quantization info are different!");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _pad_size(nullptr) {}
+
+void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, pad_size);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pad_size->info()));
+
+ _input = input;
+ _output = output;
+ _pad_size = pad_size;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DIB=" + support::cpp11::to_string(input->info()->dimension(3)));
+ build_opts.emplace("-DIW=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.emplace("-DIH=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DID=" + support::cpp11::to_string(input->info()->dimension(2)));
+ if (input->info()->data_type() == DataType::QASYMM8)
+ {
+ build_opts.emplace("-DZERO_VALUE=" +
+ support::cpp11::to_string(input->info()->quantization_info().offset));
+ }
+ else
+ {
+ build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("pad", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
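+ // pad_size lives in device memory; map it so the host can read the padding amounts that are
+ // passed to the kernel as an argument, then unmap before enqueueing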
+ _pad_size->map(queue);
+
+ // Only the leading padding of each dimension is required: batch (up), height (top),
+ // width (left) and depth (front)
+ auto pad_batch_up = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 0}));
+ auto pad_height_top = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 1}));
+ auto pad_width_left = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 2}));
+ auto pad_depth_front = *reinterpret_cast<const int32_t *>(_pad_size->ptr_to_element({0, 3}));
+
+ _pad_size->unmap(queue);
+
+ // Pack the pad values in the kernel's argument order: (x, y, z, w) = (left, top, front, batch)
+ const cl_int4 paddingValues = {
+ {static_cast<cl_int>(pad_width_left), static_cast<cl_int>(pad_height_top),
+ static_cast<cl_int>(pad_depth_front), static_cast<cl_int>(pad_batch_up)}};
+
+ Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+ // Setup input slice
+ Window slice_in(slice_out);
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_in.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_out);
+ _kernel.setArg<cl_int4>(idx++, paddingValues);
+ enqueue(queue, *this, slice_out);
+ } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
--- /dev/null
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+using namespace arm_compute;
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>();
+ k->configure(input, output, pad_size);
+ _kernel = std::move(k);
+}
#include <arm_compute/runtime/CL/CLSubTensor.h>
#include <arm_compute/runtime/CL/functions/CLArithmeticAddition.h>
#include <arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayer.h>
#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h>
#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
#include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>
const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
const ::internal::tflite::operand::Index paddings_index{node.param().paddings_index};
- const auto paddings_shape = _ctx.at(paddings_index).shape().asTensor();
-
+ // TODO: Support input having rank < 4
+ assert(_ctx.at(ifm_index).shape().rank() == 4);
assert(_ctx.at(paddings_index).hasData() == true);
// Set Shape Constraints and TensorInfo
+ _builder.addShapeConstr(ifm_index,
+ asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape(), false),
+ _ctx.at(ifm_index).type(), _ctx.at(ifm_index).scale(),
+ _ctx.at(ifm_index).zeroPoint()));
+ _builder.addShapeConstr(ofm_index,
+ asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape(), false),
+ _ctx.at(ofm_index).type(), _ctx.at(ofm_index).scale(),
+ _ctx.at(ofm_index).zeroPoint()));
_builder.addShapeConstr(
- ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), _ctx.at(ifm_index).type(),
- _ctx.at(ifm_index).scale(), _ctx.at(ifm_index).zeroPoint()));
- _builder.addShapeConstr(
- ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), _ctx.at(ofm_index).type(),
- _ctx.at(ofm_index).scale(), _ctx.at(ofm_index).zeroPoint()));
- _builder.addShapeConstr(
- paddings_index,
- asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape()), _ctx.at(paddings_index).type(),
- _ctx.at(paddings_index).scale(), _ctx.at(paddings_index).zeroPoint()));
+ paddings_index, asTensorInfo(asTensorShape(_ctx.at(paddings_index).shape(), false),
+ _ctx.at(paddings_index).type(), _ctx.at(paddings_index).scale(),
+ _ctx.at(paddings_index).zeroPoint()));
// initializer for padding
{
auto ifm_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
auto pad_alloc = ctx.at(::internal::tflite::operand::Index{param.padding_index});
- auto fn = nnfw::make_unique<SimplePadLayer>();
+ if (from_env<bool>(std::getenv("USE_SIMPLE_PAD")))
+ {
+ // Use the CPU (SimplePadLayer) implementation; only 4-D tensors are supported
+ auto rank = 4;
+ auto fn = nnfw::make_unique<SimplePadLayer>();
- // only 4d Tensors are supported
- int rank = 4;
+ fn->configure(ifm_alloc, ofm_alloc, pad_alloc, getARMComputeAxises(rank));
- fn->configure(ifm_alloc, ofm_alloc, pad_alloc, getARMComputeAxises(rank));
+ builder.append("PAD", std::move(fn));
+ }
+ else
+ {
+ if (::internal::arm_compute::isGpuMode()) // GPU
+ {
+ auto fn = nnfw::make_unique<::arm_compute::CLPadLayer>();
- builder.append("Pad", std::move(fn));
+ fn->configure(CAST_CL(ifm_alloc), CAST_CL(ofm_alloc), CAST_CL(pad_alloc));
+ builder.append("PAD", std::move(fn));
+ }
+ else // NEON
+ {
+ // TODO Enable NEON Support
+ throw std::runtime_error("Not supported, yet");
+ }
+ }
};
_builder.addStage(stage);