This commit supports Layout::NHWC for CL Kernel of SpaceToBatchND.
Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
~CLSpaceToBatchNDKernel() = default;
/** Initialise the kernel's input and output.
*
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
* @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
* @param[in] block_size Block size tensor. Data types supported: S32.
* @param[in] padding_size Padding size tensor. Data types supported: S32.
* @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
*/
void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
ICLTensor *output);
public:
/** Initialise the kernel's input and output.
*
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
* @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
* @param[in] block_size Tensor of integer values specifying block sizes for spatial
* dimension.
* Data types supported: S32
 * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
 * dimension.
 * Data types supported: S32
* @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
*/
void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
ICLTensor *output);
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
{"space_to_batch_4d_nchw", "space_to_batch.cl"},
+ {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
{"space_to_depth", "space_to_depth.cl"},
};
}
}
#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+    // block_size holds two S32 values: element 1 (offset by one x-stride) is the
+    // width block size, element 0 is the height block size.
+    int block_size_x = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int block_size_y = *((__global int *)block_size_ptr);
+    // The Z work-item id spans HEIGHT_OUT * BATCH_IN * (block_size_x * block_size_y);
+    // stripping the height/batch part leaves the linear block offset, which is split
+    // into its x (width) and y (height) components here.
+    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+    // NHWC source coordinate: [0]=channel, [1]=width, [2]=height, [3]=batch.
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * VEC_SIZE;
+    // padding_size row 1 is read for width, row 0 for height; only the first element
+    // of each row is used — presumably the leading (left/top) pad, TODO confirm.
+    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr));
+    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+    // Coordinates that land in the padded border emit ZERO_VALUE; all other output
+    // elements copy VEC_SIZE channels at a time from the mapped input location.
+    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+    {
+        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+    }
+    else
+    {
+        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                         0, (__global DATA_TYPE *)out.ptr);
+    }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
namespace
{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
const ITensorInfo *padding_size, const ITensorInfo *output)
{
padding_size->dimension(1) != 2,
"Only 2-dimensional spatial block's size was wrong");
}
+ else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+ "Input Depth should be equal to Output Depth");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+ padding_size->dimension(1) != 2,
+ "Only 2-dimensional spatial block's size was wrong");
+ }
else
{
- ARM_COMPUTE_RETURN_ERROR_MSG(
- "CLSpaceToBatchNDKernel supports only 4-dimensional input and DataLayout::NCHW");
+ ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
// Set kernel build options
// TODO Support other cases
- std::string kernel_name = "space_to_batch_4d_nchw";
+ std::string kernel_name = "space_to_batch_4d";
std::set<std::string> build_opts;
+ Window win;
+
+ if (input->info()->data_layout() == DataLayout::NCHW)
+ {
+ kernel_name += "_nchw";
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+ win = calculate_max_window(*output->info(), Steps());
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ }
+ else if (input->info()->data_layout() == DataLayout::NHWC)
+ {
+ kernel_name += "_nhwc";
+ build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+
+ win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ input_access.set_valid_region(win, output->info()->valid_region());
+
+ if (window_changed)
+ {
+ ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported layout");
+ }
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
- build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
if (input->info()->data_type() == DataType::QASYMM8)
{
build_opts.emplace("-DZERO_VALUE=" +
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
// Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
ICLKernel::configure(win);
}