inline cldnn::data_types DataTypeFromPrecision(InferenceEngine::Precision p) {
switch (p) {
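+ // clDNN has no native 16-bit integer data type, so both I16 and U16 fall back to f32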
case Precision::I16:
+ case Precision::U16:
case Precision::FP32:
return cldnn::data_types::f32;
case Precision::FP16:
auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
for (auto ii : _networkInputs) {
auto input_precision = ii.second->getTensorDesc().getPrecision();
- if (input_precision != InferenceEngine::Precision::FP16 && input_precision != InferenceEngine::Precision::I16
- && input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8
- && input_precision != InferenceEngine::Precision::I32 && input_precision != InferenceEngine::Precision::I64
- && input_precision != InferenceEngine::Precision::I8 && input_precision != InferenceEngine::Precision::BOOL) {
+ if (input_precision != InferenceEngine::Precision::FP16 &&
+ input_precision != InferenceEngine::Precision::FP32 &&
+ input_precision != InferenceEngine::Precision::U8 &&
+ input_precision != InferenceEngine::Precision::I8 &&
+ input_precision != InferenceEngine::Precision::I16 &&
+ input_precision != InferenceEngine::Precision::U16 &&
+ input_precision != InferenceEngine::Precision::I32 &&
+ input_precision != InferenceEngine::Precision::I64 &&
+ input_precision != InferenceEngine::Precision::BOOL) {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
<< "Input image format " << input_precision << " is not supported yet...";
}
case Precision::I16:
if (mem_ptr != nullptr)
return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
else
return make_shared_blob<int16_t>(desc);
+ case Precision::U16:
+ if (mem_ptr != nullptr)
+ return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
+ else
+ return make_shared_blob<uint16_t>(desc);
case Precision::I32:
if (mem_ptr != nullptr)
return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
cldnn::pointer<uint8_t> mem_ptr = inputsMemory.at(name).pointer<uint8_t>();
_inputs[name] = createInputBlob(desc, mem_ptr.data());
- if (desc.getPrecision() == Precision::I16) {
+ if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
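+ // 16-bit integer inputs also get an fp32 staging allocation (name + fp32_suffix),
+ // which is filled with converted data at inference time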
cldnn::layout layout_fp32 = layout;
layout_fp32.data_type = cldnn::data_types::f32;
input_alloc(name + fp32_suffix, layout_fp32);
}
Blob::Ptr inputBlob = createInputBlob(desc);
- if (desc.getPrecision() == Precision::I16) {
+ if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
desc.setPrecision(Precision::FP32);
auto fp32inputBlob = InferenceEngine::make_shared_blob<float>(desc);
fp32inputBlob->allocate();
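+ // this fp32 blob is presumably what the name + fp32_suffix lookup at inference time resolves to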
if (inputBlob.is<gpu::ClBlob>()) {
// no need to check for reuse
_nw_ptr->set_input_data(internalName, memory);
- } else if (prec == Precision::I16) {
+ } else if (prec == Precision::I16 || prec == Precision::U16) {
- // clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision
+ // clDNN doesn't support I16/U16 input precisions, so we always have to convert such input data to fp32 precision
const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix);
cldnn::pointer<float> ptr = fp32_mem.pointer<float>();
- copyToFloat<int16_t>(ptr.data(), &inputBlob);
+ if (prec == Precision::I16) {
+ copyToFloat<int16_t>(ptr.data(), &inputBlob);
+ } else {
+ copyToFloat<uint16_t>(ptr.data(), &inputBlob);
+ }
+
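+ // copyToFloat<T> is the plugin's widening-copy helper; roughly (a sketch, not the
+ // exact implementation): read the blob's buffer as const T* and assign each element
+ // to the float destination via static_cast<float>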
_nw_ptr->set_input_data(internalName, fp32_mem);
} else if (is_same_buffer(inputBlob, memory)) {
// If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data.
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16,
+ InferenceEngine::Precision::U8,
+ InferenceEngine::Precision::I8,
+ InferenceEngine::Precision::U16,
+ InferenceEngine::Precision::I32
};
const std::vector<int64_t> batchAxisIndices = { 0L };
// Expected behavior
R"(.*EltwiseLayerTest.*eltwiseOpType=Pow.*netPRC=I64.*)",
R"(.*EltwiseLayerTest.*IS=\(.*\..*\..*\..*\..*\).*eltwiseOpType=Pow.*secondaryInputType=CONSTANT.*)",
- // TODO: Issue: 40736
- R"(.*ReverseSequenceLayerTest.*)",
- // TODO: Issue: 40741
- R"(.*GatherTreeLayerTest.*)",
};
}
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "kernel_selector_utils.h"
namespace kernel_selector {
- JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
- JitConstants jit = MakeBaseParamsJitConstants(params);
- return jit;
- }
+JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
+ JitConstants jit = MakeBaseParamsJitConstants(params);
+ return jit;
+}
- GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
- std::vector<size_t> global{
- params.output.Y().v, // beam
- params.output.Feature().v, // batch
- 1
- };
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
- /*
- b -> time
- f -> batch
- y -> beam
- */
- DispatchData data;
- data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
- data.gws0 = global[0];
- data.gws1 = global[1];
- data.gws2 = global[2];
- data.lws0 = local[0];
- data.lws1 = local[1];
- data.lws2 = local[2];
- return data;
- }
+GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
+ std::vector<size_t> global{
+ params.output.Y().v, // beam
+ params.output.Feature().v, // batch
+ 1
+ };
+ const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ /*
+ b -> time
+ f -> batch
+ y -> beam
+ */
+ DispatchData data;
+ data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ data.gws0 = global[0];
+ data.gws1 = global[1];
+ data.gws2 = global[2];
+ data.lws0 = local[0];
+ data.lws1 = local[1];
+ data.lws2 = local[2];
+ return data;
+}
- KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
- const optional_params& options,
- float estimated_time) const {
- assert(params.GetType() == KernelType::GATHER_TREE);
- const auto& gt_params = static_cast<const gather_tree_params&>(params);
+KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
+ const optional_params& options,
+ float estimated_time) const {
+ assert(params.GetType() == KernelType::GATHER_TREE);
+ const auto& gt_params = static_cast<const gather_tree_params&>(params);
- auto run_info = SetDefault(gt_params);
- auto kernel_data = KernelData::Default<gather_tree_params>(params);
- auto cldnn_jit = GetJitConstants(gt_params);
- auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
- auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
- FillCLKernelData(kernel_data.kernels[0], run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 4);
- kernel_data.estimatedTime = estimated_time;
- return { kernel_data };
- }
+ auto run_info = SetDefault(gt_params);
+ auto kernel_data = KernelData::Default<gather_tree_params>(params);
+ auto cldnn_jit = GetJitConstants(gt_params);
+ auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
+ auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
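+ // derive the kernel argument count from the number of registered input tensors,
+ // instead of the previously hardcoded 4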
+ FillCLKernelData(kernel_data.kernels[0],
+ run_info,
+ params.engineInfo,
+ kernelName,
+ jit,
+ entry_point,
+ DEFAULT,
+ false,
+ false,
+ static_cast<int>(gt_params.inputs.size()));
+ kernel_data.estimatedTime = estimated_time;
+ return { kernel_data };
+}
} // namespace kernel_selector
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
namespace kernel_selector {
ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const {
ParamsKey k;
+ k.EnableInputDataType(Datatype::UINT8);
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableAllInputLayout();
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "include/include_all.cl"
-KERNEL(gather_tree_gpu_ref.cl)(
- const __global UNIT_TYPE* step_input,
- const __global UNIT_TYPE* parent_input,
- const __global UNIT_TYPE* max_seq_len_input,
- const __global UNIT_TYPE* end_token,
- __global UNIT_TYPE* output)
+KERNEL(gather_tree_gpu_ref)(
+ const __global INPUT0_TYPE* step_input,
+ const __global INPUT1_TYPE* parent_input,
+ const __global INPUT2_TYPE* max_seq_len_input,
+ const __global INPUT3_TYPE* end_token,
+ __global OUTPUT_TYPE* output)
{
- const uint beam = get_global_id(0);
- const uint batch = get_global_id(1);
+ const int beam = get_global_id(0);
+ const int batch = get_global_id(1);
/*
b -> time
f -> batch
y -> beam
*/
- uint parent = beam;
- for(int time = INPUT0_BATCH_NUM - 1; time >= 0; time--) {
- while (time >= (uint)max_seq_len_input[batch]) {
- output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = end_token[0];
- time--;
- }
- output[OUTPUT_GET_INDEX(time, batch, beam, 0)] =
- step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
- parent = (uint)parent_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
+ const int max_sequence_in_beam = min(INPUT0_BATCH_NUM, (int)max_seq_len_input[batch]);
+ int time;
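+ // first pass: time steps beyond this beam's sequence length are filled with end_token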
+ for (time = INPUT0_BATCH_NUM - 1; time >= max_sequence_in_beam; time--) {
+ output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
}
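+ // second pass: walk the parent pointers backwards through time, gathering the selected tokens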
+ for (int parent = beam; time >= 0; time--) {
+ output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
+ parent = parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)];
+ }
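+ // final pass: once an end_token has been emitted, pad the rest of the sequence with end_token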
+ bool finished = false;
+ for (int time = 0; time < max_sequence_in_beam; time++) {
+ if (finished) {
+ output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
+ } else if (output[OUTPUT_GET_INDEX(time, batch, beam, 0)] == TO_OUTPUT_TYPE(end_token[0])) {
+ finished = true;
+ }
+ }
}
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "include/include_all.cl"
-KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global UNIT_TYPE* output)
+KERNEL(reverse_sequence_ref)(const __global INPUT0_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global OUTPUT_TYPE* output)
{
const uint batch = get_global_id(0);
const uint feature = get_global_id(1);
const uint y = (uint)get_global_id(2) / INPUT0_SIZE_X;
const uint x = (uint)get_global_id(2) % INPUT0_SIZE_X;
uint dimensions[] = { batch, feature, y, x };
- const uint input_index = INPUT0_OFFSET +
- batch * INPUT0_BATCH_PITCH +
- feature * INPUT0_FEATURE_PITCH +
- y * INPUT0_Y_PITCH +
- x * INPUT0_X_PITCH;
+ const uint input_index = INPUT0_GET_INDEX(batch, feature, y, x);
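+ // INPUT0_GET_INDEX folds the layout-specific offset and pitch arithmetic that was previously spelled out by hand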
const uint length = (uint)seq_lengths[dimensions[BATCH_AXIS]];
if (dimensions[SEQ_AXIS] < length)
dimensions[SEQ_AXIS] = length - dimensions[SEQ_AXIS] - 1;
- const uint output_index = OUTPUT_OFFSET +
- dimensions[0] * OUTPUT_BATCH_PITCH +
- dimensions[1] * OUTPUT_FEATURE_PITCH +
- dimensions[2] * OUTPUT_Y_PITCH +
- dimensions[3] * OUTPUT_X_PITCH;
-
+ const uint output_index = OUTPUT_GET_INDEX(dimensions[0], dimensions[1], dimensions[2], dimensions[3]);
output[output_index] = ACTIVATION(input[input_index], ACTIVATION_PARAMS);
}
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
static primitive_impl* create(const gather_tree_node& arg) {
auto b_params = get_default_params<kernel_selector::gather_tree_params>(arg, 1);
- auto b_optional_params =
- get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());
+ auto b_optional_params = get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());
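+ // get_default_params registers only the first input, so append the remaining
+ // gather_tree operands (parent, max_seq_len, end_token) explicitly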
+ for (size_t i = 1; i < arg.get_dependencies().size(); i++) {
+ b_params.inputs.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout(), 1));
+ }
auto desc = arg.get_primitive();
auto& kernel_selector = kernel_selector::gather_tree_kernel_selector::Instance();
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
attach_reverse_sequence_gpu::attach_reverse_sequence_gpu() {
auto val_fw = reverse_sequence_gpu::create;
- implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx),
- val_fw);
- implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx),
- val_fw);
+ implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+ implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+ implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
+ implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
+ implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
}
} // namespace detail