From: Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 Date: Mon, 10 Dec 2018 01:27:55 +0000 (+0530) Subject: Generic HashTableLookup kernel to handle tensors upto rank=4 (#3798) X-Git-Tag: 0.3~155 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=130373b5736deff2bb8e9220e38601805f112fca;p=platform%2Fcore%2Fml%2Fnnfw.git Generic HashTableLookup kernel to handle tensors upto rank=4 (#3798) This patch unifies the 3 different hashtable lookup kernel into one kernel so as to make a generic cl to support upto 4d tensors. Signed-off-by: shubham --- diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 85b80ca..73efe44 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -170,9 +170,7 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"harris_score_3x3", "harris_corners.cl"}, {"harris_score_5x5", "harris_corners.cl"}, {"harris_score_7x7", "harris_corners.cl"}, - {"hashtable_lookup_2d", "hashtable_lookup.cl"}, - {"hashtable_lookup_3d", "hashtable_lookup.cl"}, - {"hashtable_lookup_4d", "hashtable_lookup.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, {"hist_border_kernel", "histogram.cl"}, {"hist_border_kernel_fixed", "histogram.cl"}, {"hist_local_kernel", "histogram.cl"}, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index d1a8eee..b60a6cc 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -16,106 +16,13 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(VEC_SIZE) -/** Perform hashtable_lookup of input image - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void hashtable_lookup_2d(IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(1))); - - if (lup_id < 0) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); - return; - } - - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + lup_id * input_step_y; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} - -/** Perform hashtable_lookup of input tensor3D - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void hashtable_lookup_3d(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2))); - - if (lup_id < 0) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); - return; - } - - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y - + lup_id * input_step_z; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} - -#if defined(DEPTH_OUT) -/** Perform hashtable_lookup of input tensor4D +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) @@ -142,27 +49,36 @@ __kernel void hashtable_lookup_3d(TENSOR3D_DECLARATION(input), * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector */ -__kernel void hashtable_lookup_4d(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) { Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)); - if (lup_id < 0) + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS-1] < 0) { VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); return; } - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y - + (get_global_id(2) % DEPTH_OUT) * input_step_z + lup_id * input_step_w; + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)out.ptr); } -#endif // defined(DEPTH_OUT) -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index ee1809c..2f8f0dd 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -93,13 +93,12 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso // Set kernel build options std::stringstream kernel_name; std::set build_opts; - kernel_name << "hashtable_lookup_" << input->info()->num_dimensions() << "d"; - if (input->info()->num_dimensions() == 4) - { - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - } + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel _kernel = static_cast( @@ -166,25 +165,13 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) Window win_lookup; win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - unsigned int idx = 0; - switch (_input->info()->num_dimensions()) + do { - case 2: - add_2D_tensor_argument(idx, _input, win); - add_2D_tensor_argument(idx, _output, win); - break; - case 3: - add_3D_tensor_argument(idx, _input, win); - add_3D_tensor_argument(idx, _output, win); - break; - case 4: - add_4D_tensor_argument(idx, _input, win); - add_4D_tensor_argument(idx, _output, win); - break; - default: - ARM_COMPUTE_ERROR("Supported inputs with dimensions of 2 or more and 4 or less."); - break; - } - add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); - enqueue(queue, *this, win); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); }