From 1ab35335135723717ddb064d897cc57b28457dc6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=9E=A5=EC=A7=80=EC=84=AD/On-Device=20Lab=28SR=29/Enginee?= =?utf8?q?r/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Mon, 11 Feb 2019 11:18:50 +0900 Subject: [PATCH] Support NHWC to SpaceToDepthKernel (#4369) This commit supports NHWC to SpaceToDepthKernel Signed-off-by: jiseob.jang --- libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 3 +- .../src/core/CL/cl_kernels/space_to_depth.cl | 66 ++++++++++++++++++++-- .../src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 31 ++++++---- 3 files changed, 83 insertions(+), 17 deletions(-) diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index c5ab86b..1bc08a2 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -76,7 +76,8 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"topkv2_quicksort", "topkv2_quicksort.cl"}, {"space_to_batch_4d_nchw", "space_to_batch.cl"}, {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, - {"space_to_depth", "space_to_depth.cl"}, + {"space_to_depth_nchw", "space_to_depth.cl"}, + {"space_to_depth_nhwc", "space_to_depth.cl"}, }; const std::map CLKernelLibraryEx::_program_source_map = { diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl index f697704..20eeb38 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -16,11 +16,12 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) /** Perform space to depth rearrangement of tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16 * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 * * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 @@ -44,11 +45,11 @@ bytes) * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ -__kernel void space_to_depth( +__kernel void space_to_depth_nchw( TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); int out_index[4]={0}; @@ -56,8 +57,8 @@ __kernel void space_to_depth( in_index[0] = get_global_id(0);//W in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % DEPTH_IN;//C - in_index[3] = get_global_id(2) / DEPTH_IN;//B + in_index[2] = get_global_id(2) % Z_IN;//C + in_index[3] = get_global_id(2) / Z_IN;//B out_index[0] = in_index[0]/BLOCK_SIZE; out_index[1] = in_index[1]/BLOCK_SIZE; @@ -66,4 +67,57 @@ __kernel void space_to_depth( *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) +#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) + +#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_depth_nhwc( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4]={0}; + int in_index[4]={0}; + + in_index[0] = get_global_id(0);//C + in_index[1] = get_global_id(1);//W + in_index[2] = get_global_id(2) % Z_IN;//H + in_index[3] = get_global_id(2) / Z_IN;//B + + out_index[0] = in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; + out_index[1] = in_index[1]/BLOCK_SIZE; + out_index[2] = in_index[2]/BLOCK_SIZE; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp index b803366..b085192 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -39,18 +39,25 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3), "Input batch should be equal to Output batch"); + auto layout_out = input->data_layout(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT); + auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input->dimension(2) * block_size * block_size != output->dimension(2), + input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth), "Output depth should be equal to (input depth * block size *block size)"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) || - (input->dimension(1) % block_size), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) || + (input->dimension(index_height) % block_size), "Input height and width should be divisible by block size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) || - (output->dimension(1) != (input->dimension(1) / block_size)), - "Output height and width should be equal to " - "input_height/blocksize and input_width/blocksize respectively"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (output->dimension(index_width) != (input->dimension(index_width) / block_size)) || + (output->dimension(index_height) != (input->dimension(index_height) / block_size)), + "Output height and width should be equal to " + "input_height/blocksize and input_width/blocksize respectively"); return Status{}; } @@ -70,14 +77,18 @@ void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output, _output = output; // Set kernel build options + auto layout_out = input->info()->data_layout(); std::set build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2))); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = input->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z())); // Create kernel - _kernel = - static_cast(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts)); + _kernel = static_cast(CLKernelLibraryEx::get().create_kernel( + "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts)); // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); -- 2.7.4