// type of elements of lookups is always integer
const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer());
- const auto values_buf = _values->buffer();
- auto output_buf = _output->buffer();
const auto lookups_info = _lookups->info();
const auto values_info = _values->info();
// NOTE The first dimension's position is always at the end of dimensions.
const auto first_dim_pos = values_info->num_dimensions() - 1;
- ::arm_compute::Coordinates offset_coord{};
- for (size_t i = 0; i < first_dim_pos; ++i)
- {
- offset_coord.set(i, 0);
- }
const size_t first_dim = values_info->dimension(first_dim_pos);
- const size_t copy_bytes = values_info->total_size() / first_dim;
for (size_t i = 0; i < lookups_info->dimension(0); ++i)
{
if (lookups_buf[i] < 0 || lookups_buf[i] >= first_dim)
throw std::runtime_error("Embedding Lookup: index out of bounds.");
+ }
+
+ // Returns true when `values` and `output` have identical byte strides for every dimension
+ // below the first (outermost) one. If any of those strides differ, the two tensors have
+ // different padding applied, so the data for one lookup index cannot be copied with a
+ // single memcpy.
+ auto can_copy_at_once = [&]() -> bool {
+ const auto &values_strides = values_info->strides_in_bytes();
+ const auto &output_strides = output_info->strides_in_bytes();
+
+ for (size_t i = 0; i < first_dim_pos; ++i)
+ {
+ // Compare the source stride against the destination stride for this dimension.
+ if (values_strides[i] != output_strides[i])
+ return false;
+ }
- size_t idx = lookups_buf[i];
- offset_coord.set(first_dim_pos, idx);
- size_t values_offset = values_info->offset_element_in_bytes(offset_coord);
- offset_coord.set(first_dim_pos, i);
- size_t output_offset = output_info->offset_element_in_bytes(offset_coord);
+ return true;
+ };
- unsigned char *sink_addr = output_buf + output_offset;
- unsigned char *source_addr = values_buf + values_offset;
- memcpy(sink_addr, source_addr, copy_bytes);
+ using ::arm_compute::Window;
+ using ::arm_compute::Iterator;
+
+ size_t copy_bytes;
+ Window window;
+ if (can_copy_at_once())
+ {
+ copy_bytes = values_info->total_size() / first_dim;
+ window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos);
}
+ else
+ {
+ copy_bytes = values_info->dimension(0) * values_info->element_size();
+ window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY);
+ }
+
+ Iterator it(_output, window);
+ execute_window_loop(window,
+ [&](const ::arm_compute::Coordinates &id) {
+ ::arm_compute::Coordinates values_id = id;
+ const int idx = id[first_dim_pos];
+ values_id.set(first_dim_pos, lookups_buf[idx]);
+ memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes);
+ },
+ it);
if (::internal::arm_compute::isGpuMode())
{