*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(VEC_SIZE)
-/** Perform embedding_lookup of input image
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
- * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
- * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void embedding_lookup_2d(IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output),
- VECTOR_DECLARATION(lookups))
-{
- Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
- Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
- int lup_id = *((__global int *)vector_offset(&lups, get_global_id(1)));
-
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input);
- in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + lup_id * input_step_y;
-
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
- 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Perform embedding_lookup of input tensor3D
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32
- * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes)
- * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
- */
-__kernel void embedding_lookup_3d(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- VECTOR_DECLARATION(lookups))
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
- int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2)));
-
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
- in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
- + lup_id * input_step_z;
-
- VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
- 0, (__global DATA_TYPE *)out.ptr);
-}
-
-#if defined(DEPTH_OUT)
-/** Perform embedding_lookup of input tensor4D
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
* @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector
*/
-__kernel void embedding_lookup_4d(TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- VECTOR_DECLARATION(lookups))
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output),
+ VECTOR_DECLARATION(lookups))
{
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
- int lup_id = *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT));
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
- in.ptr += input_offset_first_element_in_bytes + get_global_id(0) * input_step_x + get_global_id(1) * input_step_y
- + (get_global_id(2) % DEPTH_OUT) * input_step_z + lup_id * input_step_w;
+ //lookup ids for based on the tensor dimensions
+ int lup_id[4] = {0};
+
+ lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0)))
+ :get_global_id(0);
+ lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1)))
+ :get_global_id(1);
+ lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2)))
+ :get_global_id(2)%DEPTH_OUT;
+ lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ :get_global_id(2) / DEPTH_OUT;
+
+ in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y
+ + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
0, (__global DATA_TYPE *)out.ptr);
}
-#endif // defined(DEPTH_OUT)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
// Set kernel build options
std::stringstream kernel_name;
std::set<std::string> build_opts;
- kernel_name << "embedding_lookup_" << input->info()->num_dimensions() << "d";
- if (input->info()->num_dimensions() == 4)
- {
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
- }
+ kernel_name << "embedding_lookup";
+
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
// Create kernel
_kernel = static_cast<cl::Kernel>(
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-#if defined(DEBUG)
- const_cast<ICLTensor *>(_lookups)->map(queue);
-
- for (uint32_t n = 0; n < _lookups->info()->tensor_shape().total_size(); ++n)
- {
- int32_t lookup_id = reinterpret_cast<int32_t *>(_lookups->buffer())[n];
- ARM_COMPUTE_ERROR_ON(lookup_id < 0 ||
- lookup_id >=
- _input->info()->dimension(_input->info()->num_dimensions() - 1));
- }
-
- const_cast<ICLTensor *>(_lookups)->unmap(queue);
-#endif /* defined(DEBUG) */
-
- Window win = window.collapse(ICLKernel::window(), 2, 4);
+ Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
Window win_lookup;
win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
- unsigned int idx = 0;
- switch (_input->info()->num_dimensions())
+ do
{
- case 2:
- add_2D_tensor_argument(idx, _input, win);
- add_2D_tensor_argument(idx, _output, win);
- break;
- case 3:
- add_3D_tensor_argument(idx, _input, win);
- add_3D_tensor_argument(idx, _output, win);
- break;
- case 4:
- add_4D_tensor_argument(idx, _input, win);
- add_4D_tensor_argument(idx, _output, win);
- break;
- default:
- ARM_COMPUTE_ERROR("Supported inputs with dimensions of 2 or more and 4 or less.");
- break;
- }
- add_1D_tensor_argument(idx, _lookups, win_lookup);
- enqueue(queue, *this, win);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice_in);
+ add_4D_tensor_argument(idx, _output, slice_in);
+ add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+ enqueue(queue, *this, slice_in);
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
}