[IE CLDNN] Fixes for GatherTree and ReverseSequence (#2660)
author Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Thu, 15 Oct 2020 07:28:00 +0000 (10:28 +0300)
committer GitHub <noreply@github.com>
Thu, 15 Oct 2020 07:28:00 +0000 (10:28 +0300)
12 files changed:
inference-engine/src/cldnn_engine/cldnn_common_utils.h
inference-engine/src/cldnn_engine/cldnn_engine.cpp
inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reverse_sequence.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_tree_gpu_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl
inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp

index 6423163..384d157 100644 (file)
@@ -41,6 +41,7 @@ const auto CldnnTensorFromIEDims = [](const InferenceEngine::SizeVector& dims, i
 inline cldnn::data_types DataTypeFromPrecision(InferenceEngine::Precision p) {
     switch (p) {
     case Precision::I16:
+    case Precision::U16:
     case Precision::FP32:
         return cldnn::data_types::f32;
     case Precision::FP16:
index 5b2818b..db16779 100644 (file)
@@ -196,10 +196,15 @@ clDNNEngine::clDNNEngine() : m_defaultContext(nullptr) {
 auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
     for (auto ii : _networkInputs) {
         auto input_precision = ii.second->getTensorDesc().getPrecision();
-        if (input_precision != InferenceEngine::Precision::FP16 && input_precision != InferenceEngine::Precision::I16
-            && input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8
-            && input_precision != InferenceEngine::Precision::I32 && input_precision != InferenceEngine::Precision::I64
-            && input_precision != InferenceEngine::Precision::I8 && input_precision != InferenceEngine::Precision::BOOL) {
+        if (input_precision != InferenceEngine::Precision::FP16 &&
+            input_precision != InferenceEngine::Precision::FP32 &&
+            input_precision != InferenceEngine::Precision::U8 &&
+            input_precision != InferenceEngine::Precision::I8 &&
+            input_precision != InferenceEngine::Precision::I16 &&
+            input_precision != InferenceEngine::Precision::U16 &&
+            input_precision != InferenceEngine::Precision::I32 &&
+            input_precision != InferenceEngine::Precision::I64 &&
+            input_precision != InferenceEngine::Precision::BOOL) {
             THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
                 << "Input image format " << input_precision << " is not supported yet...";
         }
index bf591b6..931083a 100644 (file)
@@ -41,6 +41,11 @@ Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* me
             return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
         else
             return make_shared_blob<int16_t>(desc);
+    case Precision::U16:
+        if (mem_ptr != nullptr)
+            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
+        else
+            return make_shared_blob<uint16_t>(desc);
     case Precision::I32:
         if (mem_ptr != nullptr)
             return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
@@ -586,7 +591,7 @@ void CLDNNInferRequest::AllocateInputs() {
             cldnn::pointer<uint8_t> mem_ptr = inputsMemory.at(name).pointer<uint8_t>();
             _inputs[name] = createInputBlob(desc, mem_ptr.data());
 
-            if (desc.getPrecision() == Precision::I16) {
+            if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
                 cldnn::layout layout_fp32 = layout;
                 layout_fp32.data_type = cldnn::data_types::f32;
                 input_alloc(name + fp32_suffix, layout_fp32);
@@ -609,7 +614,7 @@ void CLDNNInferRequest::AllocateInputsDyn() {
         }
 
         Blob::Ptr inputBlob = createInputBlob(desc);
-        if (desc.getPrecision() == Precision::I16) {
+        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
             desc.setPrecision(Precision::FP32);
             auto fp32inputBlob = InferenceEngine::make_shared_blob<float>(desc);
             fp32inputBlob->allocate();
@@ -910,11 +915,16 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const
     if (inputBlob.is<gpu::ClBlob>()) {
         // no need to check for reuse
         _nw_ptr->set_input_data(internalName, memory);
-    } else if (prec == Precision::I16) {
+    } else if (prec == Precision::I16 || prec == Precision::U16) {
         // clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision
         const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix);
         cldnn::pointer<float> ptr = fp32_mem.pointer<float>();
-        copyToFloat<int16_t>(ptr.data(), &inputBlob);
+        if (prec == Precision::I16) {
+            copyToFloat<int16_t>(ptr.data(), &inputBlob);
+        } else {
+            copyToFloat<uint16_t>(ptr.data(), &inputBlob);
+        }
+
         _nw_ptr->set_input_data(internalName, fp32_mem);
     } else if (is_same_buffer(inputBlob, memory)) {
         // If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data.
index 2787fed..44d8f3f 100644 (file)
@@ -14,6 +14,10 @@ namespace {
 const std::vector<InferenceEngine::Precision> netPrecisions = {
         InferenceEngine::Precision::FP32,
         InferenceEngine::Precision::FP16,
+        InferenceEngine::Precision::U8,
+        InferenceEngine::Precision::I8,
+        InferenceEngine::Precision::U16,
+        InferenceEngine::Precision::I32
 };
 
 const std::vector<int64_t> batchAxisIndices = { 0L };
index 7335d73..c305942 100644 (file)
@@ -22,9 +22,5 @@ std::vector<std::string> disabledTestPatterns() {
             // Expected behavior
             R"(.*EltwiseLayerTest.*eltwiseOpType=Pow.*netPRC=I64.*)",
             R"(.*EltwiseLayerTest.*IS=\(.*\..*\..*\..*\..*\).*eltwiseOpType=Pow.*secondaryInputType=CONSTANT.*)",
-            // TODO: Issue: 40736
-            R"(.*ReverseSequenceLayerTest.*)",
-            // TODO: Issue: 40741
-            R"(.*GatherTreeLayerTest.*)",
     };
 }
index 1759916..1042910 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 #include "kernel_selector_utils.h"
 
 namespace kernel_selector {
-    JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
-        JitConstants jit = MakeBaseParamsJitConstants(params);
-        return jit;
-    }
+JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+    return jit;
+}
 
-    GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
-        std::vector<size_t> global{
-                                    params.output.Y().v,  // beam
-                                    params.output.Feature().v,  // batch
-                                    1
-                                  };
-        const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-        /*
-            b -> time
-            f -> batch
-            y -> beam
-        */
-        DispatchData data;
-        data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-        data.gws0 = global[0];
-        data.gws1 = global[1];
-        data.gws2 = global[2];
-        data.lws0 = local[0];
-        data.lws1 = local[1];
-        data.lws2 = local[2];
-        return data;
-    }
+GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
+    std::vector<size_t> global{
+                                params.output.Y().v,  // beam
+                                params.output.Feature().v,  // batch
+                                1
+                              };
+    const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+    /*
+        b -> time
+        f -> batch
+        y -> beam
+    */
+    DispatchData data;
+    data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+    data.gws0 = global[0];
+    data.gws1 = global[1];
+    data.gws2 = global[2];
+    data.lws0 = local[0];
+    data.lws1 = local[1];
+    data.lws2 = local[2];
+    return data;
+}
 
-    KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
-                                                            const optional_params& options,
-                                                            float estimated_time) const {
-        assert(params.GetType() == KernelType::GATHER_TREE);
-        const auto& gt_params = static_cast<const gather_tree_params&>(params);
+KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
+                                                        const optional_params& options,
+                                                        float estimated_time) const {
+    assert(params.GetType() == KernelType::GATHER_TREE);
+    const auto& gt_params = static_cast<const gather_tree_params&>(params);
 
-        auto run_info = SetDefault(gt_params);
-        auto kernel_data = KernelData::Default<gather_tree_params>(params);
-        auto cldnn_jit = GetJitConstants(gt_params);
-        auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
-        auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-        FillCLKernelData(kernel_data.kernels[0], run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 4);
-        kernel_data.estimatedTime = estimated_time;
-        return { kernel_data };
-    }
+    auto run_info = SetDefault(gt_params);
+    auto kernel_data = KernelData::Default<gather_tree_params>(params);
+    auto cldnn_jit = GetJitConstants(gt_params);
+    auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+    FillCLKernelData(kernel_data.kernels[0],
+                        run_info,
+                        params.engineInfo,
+                        kernelName,
+                        jit,
+                        entry_point,
+                        DEFAULT,
+                        false,
+                        false,
+                        static_cast<int>(gt_params.inputs.size()));
+    kernel_data.estimatedTime = estimated_time;
+    return { kernel_data };
+}
 }  // namespace kernel_selector
index eb3e029..f7d7bf7 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
index 392bdfa..f3926a7 100644 (file)
 namespace kernel_selector {
 ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const {
     ParamsKey k;
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::INT32);
     k.EnableInputDataType(Datatype::F16);
     k.EnableInputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
     k.EnableAllInputLayout();
index 3f3bee3..73dba74 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #include "include/include_all.cl"
 
-KERNEL(gather_tree_gpu_ref.cl)(
-    const __global UNIT_TYPE* step_input,
-    const __global UNIT_TYPE* parent_input,
-    const __global UNIT_TYPE* max_seq_len_input,
-    const __global UNIT_TYPE* end_token,
-    __global UNIT_TYPE* output)
+KERNEL(gather_tree_gpu_ref)(
+    const __global INPUT0_TYPE* step_input,
+    const __global INPUT1_TYPE* parent_input,
+    const __global INPUT2_TYPE* max_seq_len_input,
+    const __global INPUT3_TYPE* end_token,
+    __global OUTPUT_TYPE* output)
 {
-    const uint beam = get_global_id(0);
-    const uint batch = get_global_id(1);
+    const int beam = get_global_id(0);
+    const int batch = get_global_id(1);
     /*
          b -> time
          f -> batch
          y -> beam
     */
-    uint parent = beam;
-    for(int time = INPUT0_BATCH_NUM - 1; time >= 0; time--) {
 
-        while (time >= (uint)max_seq_len_input[batch]) {
-            output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = end_token[0];
-            time--;
-        }
-        output[OUTPUT_GET_INDEX(time, batch, beam, 0)] =
-            step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
-        parent = (uint)parent_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
+    const int max_sequence_in_beam = min(INPUT0_BATCH_NUM, (int)max_seq_len_input[batch]);
+    int time;
+    for (time = INPUT0_BATCH_NUM - 1; time >= max_sequence_in_beam; time--) {
+        output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
     }
 
+    for (int parent = beam; time >= 0; time--) {
+        output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
+        parent = parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)];
+    }
+    bool finished = false;
+    for (int time = 0; time < max_sequence_in_beam; time++) {
+        if (finished) {
+            output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
+        } else if (output[OUTPUT_GET_INDEX(time, batch, beam, 0)] == TO_OUTPUT_TYPE(end_token[0])) {
+            finished = true;
+        }
+    }
 }
index 061079c..7060a20 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 
 #include "include/include_all.cl"
 
-KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global UNIT_TYPE* output)
+KERNEL(reverse_sequence_ref)(const __global INPUT0_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global OUTPUT_TYPE* output)
 {
     const uint batch = get_global_id(0);
     const uint feature = get_global_id(1);
@@ -23,21 +23,12 @@ KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INP
     const uint x = (uint)get_global_id(2) % INPUT0_SIZE_X;
     uint dimensions[] = { batch, feature, y, x };
 
-    const uint input_index = INPUT0_OFFSET +
-                             batch * INPUT0_BATCH_PITCH +
-                             feature * INPUT0_FEATURE_PITCH +
-                             y * INPUT0_Y_PITCH +
-                             x * INPUT0_X_PITCH;
+    const uint input_index = INPUT0_GET_INDEX(batch, feature, y, x);
 
     const uint length = (uint)seq_lengths[dimensions[BATCH_AXIS]];
     if (dimensions[SEQ_AXIS] < length)
         dimensions[SEQ_AXIS] = length - dimensions[SEQ_AXIS] - 1;
 
-    const uint output_index = OUTPUT_OFFSET +
-                              dimensions[0] * OUTPUT_BATCH_PITCH +
-                              dimensions[1] * OUTPUT_FEATURE_PITCH +
-                              dimensions[2] * OUTPUT_Y_PITCH +
-                              dimensions[3] * OUTPUT_X_PITCH;
-
+    const uint output_index = OUTPUT_GET_INDEX(dimensions[0], dimensions[1], dimensions[2], dimensions[3]);
     output[output_index] = ACTIVATION(input[input_index], ACTIVATION_PARAMS);
 }
index 604d28d..9eeff6e 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,9 +29,11 @@ struct gather_tree_gpu : typed_primitive_gpu_impl<gather_tree> {
 
     static primitive_impl* create(const gather_tree_node& arg) {
         auto b_params = get_default_params<kernel_selector::gather_tree_params>(arg, 1);
-        auto b_optional_params =
-            get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());
+        auto b_optional_params = get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());
 
+        for (size_t i = 1; i < arg.get_dependencies().size(); i++) {
+            b_params.inputs.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout(), 1));
+        }
         auto desc = arg.get_primitive();
 
         auto& kernel_selector = kernel_selector::gather_tree_kernel_selector::Instance();
index ec3a89a..1b4b848 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -59,10 +59,11 @@ namespace detail {
 
 attach_reverse_sequence_gpu::attach_reverse_sequence_gpu() {
     auto val_fw = reverse_sequence_gpu::create;
-    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx),
-                                              val_fw);
-    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx),
-                                              val_fw);
+    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
+    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
+    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
+    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
+    implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
 }
 
 }  // namespace detail