From: Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 Date: Mon, 10 Dec 2018 01:28:33 +0000 (+0530) Subject: Generic EmbeddingLookup kernel to handle tensors upto rank=4 (#3794) X-Git-Tag: 0.3~153 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7b2fad2cf613584ba448d166c718ce18e164fd39;p=platform%2Fcore%2Fml%2Fnnfw.git Generic EmbeddingLookup kernel to handle tensors upto rank=4 (#3794) This patch unifies the 3 different embedding lookup kernel into one kernel so as to make a generic cl to support upto 4d tensors. Signed-off-by: shubham --- diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 73efe44..246ab27 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -119,9 +119,7 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"direct_convolution5x5", "direct_convolution5x5.cl"}, {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"}, {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, - {"embedding_lookup_2d", "embedding_lookup.cl"}, - {"embedding_lookup_3d", "embedding_lookup.cl"}, - {"embedding_lookup_4d", "embedding_lookup.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, {"equal", "equal.cl"}, {"equal_quantized", "equal_quantized.cl"}, {"erode", "erode.cl"}, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 2331ff5..9105e16 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -16,94 +16,13 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(VEC_SIZE) -/** Perform embedding_lookup of input image - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void embedding_lookup_2d(IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Image out = CONVERT_TO_IMAGE_STRUCT(output); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(1))); - - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + lup_id * input_step_y; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} - -/** Perform embedding_lookup of input tensor3D - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector - */ -__kernel void embedding_lookup_3d(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) -{ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2))); - - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y - + lup_id * input_step_z; - - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); -} - -#if defined(DEPTH_OUT) -/** Perform embedding_lookup of input tensor4D +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) @@ -130,21 +49,32 @@ __kernel void embedding_lookup_3d(TENSOR3D_DECLARATION(input), * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector */ -__kernel void embedding_lookup_4d(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - VECTOR_DECLARATION(lookups)) + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) { Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y - + (get_global_id(2) % DEPTH_OUT) * input_step_z + lup_id * input_step_w; + //lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) + :get_global_id(0); + lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) + :get_global_id(1); + lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) + :get_global_id(2)%DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + :get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + + lup_id[2] * input_step_z + lup_id[3] * input_step_w; VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)out.ptr); } -#endif // defined(DEPTH_OUT) -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 348e168..ba05e2d 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -75,13 +75,12 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu // Set kernel build options std::stringstream kernel_name; std::set build_opts; - kernel_name << "embedding_lookup_" << input->info()->num_dimensions() << "d"; - if (input->info()->num_dimensions() == 4) - { - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); - } + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel _kernel = static_cast( @@ -98,44 +97,18 @@ void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); -#if defined(DEBUG) - const_cast(_lookups)->map(queue); - - for (uint32_t n = 0; n < _lookups->info()->tensor_shape().total_size(); ++n) - { - int32_t lookup_id = reinterpret_cast(_lookups->buffer())[n]; - ARM_COMPUTE_ERROR_ON(lookup_id < 0 || - lookup_id >= - _input->info()->dimension(_input->info()->num_dimensions() - 1)); - } - - const_cast(_lookups)->unmap(queue); -#endif /* defined(DEBUG) */ - - Window win = window.collapse(ICLKernel::window(), 2, 4); + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); Window win_lookup; win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); - unsigned int idx = 0; - switch (_input->info()->num_dimensions()) + do { - case 2: - add_2D_tensor_argument(idx, _input, win); - add_2D_tensor_argument(idx, _output, win); - break; - case 3: - add_3D_tensor_argument(idx, _input, win); - add_3D_tensor_argument(idx, _output, win); - break; - case 4: - add_4D_tensor_argument(idx, _input, win); - add_4D_tensor_argument(idx, _output, win); - break; - default: - ARM_COMPUTE_ERROR("Supported inputs with dimensions of 2 or more and 4 or less."); - break; - } - add_1D_tensor_argument(idx, _lookups, win_lookup); - enqueue(queue, *this, win); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); }