From: Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 <shub98.gupta@samsung.com>
Date: Mon, 10 Dec 2018 01:27:55 +0000 (+0530)
Subject: Generic HashTableLookup kernel to handle tensors upto rank=4 (#3798)
X-Git-Tag: 0.3~155
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=130373b5736deff2bb8e9220e38601805f112fca;p=platform%2Fcore%2Fml%2Fnnfw.git

Generic HashTableLookup kernel to handle tensors upto rank=4 (#3798)

This patch unifies the 3 different hashtable lookup kernel into
one kernel so as to make a generic cl to support upto 4d tensors.

Signed-off-by: shubham <shub98.gupta@samsung.com>
---

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 85b80ca..73efe44 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -170,9 +170,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"harris_score_3x3", "harris_corners.cl"},
     {"harris_score_5x5", "harris_corners.cl"},
     {"harris_score_7x7", "harris_corners.cl"},
-    {"hashtable_lookup_2d", "hashtable_lookup.cl"},
-    {"hashtable_lookup_3d", "hashtable_lookup.cl"},
-    {"hashtable_lookup_4d", "hashtable_lookup.cl"},
+    {"hashtable_lookup", "hashtable_lookup.cl"},
     {"hist_border_kernel", "histogram.cl"},
     {"hist_border_kernel_fixed", "histogram.cl"},
     {"hist_local_kernel", "histogram.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
index d1a8eee..b60a6cc 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -16,106 +16,13 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
-/** Perform hashtable_lookup of input image
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                             Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
- * @param[out] output_ptr                            Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void hashtable_lookup_2d(IMAGE_DECLARATION(input),
-                                  IMAGE_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
-{
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(1)));
-
-    if (lup_id < 0)
-    {
-      VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
-      return;
-    }
-
-    Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + lup_id * input_step_y;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Perform hashtable_lookup of input tensor3D
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  lookups_ptr                           Pointer to the lookups vector. Supported data types: S32
- * @param[in]  lookups_stride_x                      Stride of the lookups vector in X dimension (in bytes)
- * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void hashtable_lookup_3d(TENSOR3D_DECLARATION(input),
-                                  TENSOR3D_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2)));
-
-    if (lup_id < 0)
-    {
-      VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
-      return;
-    }
-
-    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
-              + lup_id * input_step_z;
-
-    VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
-                     0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Perform hashtable_lookup of input tensor4D
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform hashtable_lookup of input tensor
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
  *
  * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
  * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
@@ -142,27 +49,36 @@ __kernel void hashtable_lookup_3d(TENSOR3D_DECLARATION(input),
  * @param[in]  lookups_step_x                        lookups_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
  */
-__kernel void hashtable_lookup_4d(TENSOR4D_DECLARATION(input),
-                                  TENSOR4D_DECLARATION(output),
-                                  VECTOR_DECLARATION(lookups))
+__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input),
+                               TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
 {
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
 
     Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
-    int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT));
 
-    if (lup_id < 0)
+    int lup_id[4] = {0};
+
+    lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+	                       :get_global_id(0);
+    lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+	                       :get_global_id(1);
+    lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+	                       :get_global_id(2)%DEPTH_OUT;
+    lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                               :get_global_id(2) / DEPTH_OUT;
+
+    if (lup_id[NUM_DIMS-1] < 0)
     {
       VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr);
       return;
     }
 
-    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
-    in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
-              + (get_global_id(2) % DEPTH_OUT) * input_step_z + lup_id * input_step_w;
+    in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+              + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
 
     VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
                      0, (__global DATA_TYPE *)out.ptr);
 }
-#endif // defined(DEPTH_OUT)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index ee1809c..2f8f0dd 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -93,13 +93,12 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
   // Set kernel build options
   std::stringstream kernel_name;
   std::set<std::string> build_opts;
-  kernel_name << "hashtable_lookup_" << input->info()->num_dimensions() << "d";
-  if (input->info()->num_dimensions() == 4)
-  {
-    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-  }
+  kernel_name << "hashtable_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
   build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
@@ -166,25 +165,13 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
   Window win_lookup;
   win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-  unsigned int idx = 0;
-  switch (_input->info()->num_dimensions())
+  do
   {
-    case 2:
-      add_2D_tensor_argument(idx, _input, win);
-      add_2D_tensor_argument(idx, _output, win);
-      break;
-    case 3:
-      add_3D_tensor_argument(idx, _input, win);
-      add_3D_tensor_argument(idx, _output, win);
-      break;
-    case 4:
-      add_4D_tensor_argument(idx, _input, win);
-      add_4D_tensor_argument(idx, _output, win);
-      break;
-    default:
-      ARM_COMPUTE_ERROR("Supported inputs with dimensions of 2 or more and 4 or less.");
-      break;
-  }
-  add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
-  enqueue(queue, *this, win);
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, win);
+    add_4D_tensor_argument(idx, _output, win);
+    add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup);
+
+    enqueue(queue, *this, win);
+  } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup));
 }