From: Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 <shub98.gupta@samsung.com>
Date: Mon, 10 Dec 2018 01:28:33 +0000 (+0530)
Subject: Generic EmbeddingLookup kernel to handle tensors upto rank=4 (#3794)
X-Git-Tag: 0.3~153
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7b2fad2cf613584ba448d166c718ce18e164fd39;p=platform%2Fcore%2Fml%2Fnnfw.git

Generic EmbeddingLookup kernel to handle tensors upto rank=4 (#3794)

This patch unifies the 3 different embedding lookup kernel into
one kernel so as to make a generic cl to support upto 4d tensors.

Signed-off-by: shubham <shub98.gupta@samsung.com>
---

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 73efe44..246ab27 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -119,9 +119,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"direct_convolution5x5", "direct_convolution5x5.cl"},
     {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
     {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
-    {"embedding_lookup_2d", "embedding_lookup.cl"},
-    {"embedding_lookup_3d", "embedding_lookup.cl"},
-    {"embedding_lookup_4d", "embedding_lookup.cl"},
+    {"embedding_lookup", "embedding_lookup.cl"},
     {"equal", "equal.cl"},
     {"equal_quantized", "equal_quantized.cl"},
     {"erode", "erode.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
index 2331ff5..9105e16 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -16,94 +16,13 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
-/** Perform embedding_lookup of input image
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                             Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void embedding_lookup_2d(IMAGE_DECLARATION(input),
-                                  IMAGE_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
-{
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(1)));
-
-    Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + lup_id * input_step_y;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Perform embedding_lookup of input tensor3D
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void embedding_lookup_3d(TENSOR3D_DECLARATION(input),
-                                  TENSOR3D_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2)));
-
-    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
-              + lup_id * input_step_z;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Perform embedding_lookup of input tensor4D
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
  *
  * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
  * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
@@ -130,21 +49,32 @@ __kernel void embedding_lookup_3d(TENSOR3D_DECLARATION(input),
  * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
  */
-__kernel void embedding_lookup_4d(TENSOR4D_DECLARATION(input),
-                                  TENSOR4D_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
 {
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
 
     Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT));
 
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
-              + (get_global_id(2) % DEPTH_OUT) * input_step_z + lup_id * input_step_w;
+    //lookup ids for based on the tensor dimensions
+    int lup_id[4] = {0};
+
+    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+	                       :get_global_id(0);
+    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+	                       :get_global_id(1);
+    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+	                       :get_global_id(2)%DEPTH_OUT;
+    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                               :get_global_id(2) / DEPTH_OUT;
+
+    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
 
     VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
                      0, (__global DATA_TYPE *)out.ptr);
 }
-#endif // defined(DEPTH_OUT)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 348e168..ba05e2d 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -75,13 +75,12 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu
   // Set kernel build options
   std::stringstream kernel_name;
   std::set<std::string> build_opts;
-  kernel_name << "embedding_lookup_" << input->info()->num_dimensions() << "d";
-  if (input->info()->num_dimensions() == 4)
-  {
-    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  }
+  kernel_name << "embedding_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
   build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
@@ -98,44 +97,18 @@ void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-#if defined(DEBUG)
-  const_cast<ICLTensor *>(_lookups)->map(queue);
-
-  for (uint32_t n = 0; n < _lookups->info()->tensor_shape().total_size(); ++n)
-  {
-    int32_t lookup_id = reinterpret_cast<int32_t *>(_lookups->buffer())[n];
-    ARM_COMPUTE_ERROR_ON(lookup_id < 0 ||
-                         lookup_id >=
-                             _input->info()->dimension(_input->info()->num_dimensions() - 1));
-  }
-
-  const_cast<ICLTensor *>(_lookups)->unmap(queue);
-#endif /* defined(DEBUG) */
-
-  Window win = window.collapse(ICLKernel::window(), 2, 4);
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
 
   Window win_lookup;
   win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-  unsigned int idx = 0;
-  switch (_input->info()->num_dimensions())
+  do
   {
-    case 2:
-      add_2D_tensor_argument(idx, _input, win);
-      add_2D_tensor_argument(idx, _output, win);
-      break;
-    case 3:
-      add_3D_tensor_argument(idx, _input, win);
-      add_3D_tensor_argument(idx, _output, win);
-      break;
-    case 4:
-      add_4D_tensor_argument(idx, _input, win);
-      add_4D_tensor_argument(idx, _output, win);
-      break;
-    default:
-      ARM_COMPUTE_ERROR("Supported inputs with dimensions of 2 or more and 4 or less.");
-      break;
-  }
-  add_1D_tensor_argument(idx, _lookups, win_lookup);
-  enqueue(queue, *this, win);
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_in);
+    add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
 }