[IE CLDNN] Reduce b_fs_yx_fsv16 optimized kernel (#1690)
author Mikołaj Życzyński <mikolaj.zyczynski@intel.com>
Wed, 2 Sep 2020 06:35:30 +0000 (08:35 +0200)
committer GitHub <noreply@github.com>
Wed, 2 Sep 2020 06:35:30 +0000 (09:35 +0300)
22 files changed:
inference-engine/src/cldnn_engine/cldnn_engine.cpp
inference-engine/src/cldnn_engine/cldnn_program.cpp
inference-engine/src/transformations/include/transformations/convert_reduce_to_pooling.hpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp [new file with mode: 0644]
inference-engine/tests_deprecated/functional/cldnn/shared_tests_instance/single_layer_tests/reduce_ftests.cpp [deleted file]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reduce_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
inference-engine/thirdparty/clDNN/src/gpu/activation_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/reduce_gpu.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
inference-engine/thirdparty/clDNN/src/program.cpp
inference-engine/thirdparty/clDNN/src/reduce.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/reduce_gpu_test.cpp

index bf23da1..0aad0a1 100644 (file)
@@ -87,6 +87,23 @@ InferenceEngine::ICNNNetwork::Ptr clDNNEngine::CloneAndTransformNetwork(const In
                 return stdOp->input_value(0).get_shape().size() <= 5lu && stdOp->input_value(0).get_shape().size() == stdOp->get_output_shape(0).size();
             }
 
+            // Reduce node implementation with reduce along features performs better with Reshape->Pooling->Reshape pattern
+            if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceMean>(node)) {
+                auto reduction_axes = redOp->get_reduction_axes().to_vector();
+                bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+                return !reduce_along_f;
+            }
+            if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceMax>(node)) {
+                auto reduction_axes = redOp->get_reduction_axes().to_vector();
+                bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+                return !reduce_along_f;
+            }
+            if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceSum>(node)) {
+                auto reduction_axes = redOp->get_reduction_axes().to_vector();
+                bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+                return !reduce_along_f;
+            }
+
             return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
                    std::dynamic_pointer_cast<const ::ngraph::opset3::ShuffleChannels>(node) ||
                    std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
index 55a4c70..791f4bc 100644 (file)
@@ -4367,7 +4367,24 @@ void Program::CreateReducePrimitive(cldnn::topology& topology, InferenceEngine::
             static_cast<int32_t>(reduce->keep_dims));
 
     topology.add(reducePrim);
-    AddPrimitiveToProfiler(reduceLayerName, layer);
+
+    auto reorderLayerName = reduceLayerName + "_reorder";
+    cldnn::format out_format = cldnn::format::any;
+    auto out_dt = DataTypeFromPrecision(reduce->outData[0]->getTensorDesc().getPrecision());
+    if (!reduce->keep_dims && reduceDimNumber > 4) {
+        if (reduceDimNumber - rawAxes.size() == 6)
+            out_format = cldnn::format::bfwzyx;
+        else if (reduceDimNumber - rawAxes.size() == 5)
+            out_format = cldnn::format::bfzyx;
+        else if (reduceDimNumber - rawAxes.size() <= 4)
+            out_format = cldnn::format::bfyx;
+
+        auto reorder_prim = cldnn::reorder(reorderLayerName, reduceLayerName, out_format, out_dt);
+        topology.add(reorder_prim);
+        AddPrimitiveToProfiler(reduceLayerName, layer, reorderLayerName);
+    } else {
+        AddPrimitiveToProfiler(reduceLayerName, layer);
+    }
 }
 
 void Program::CreateOneHotPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer) {
index d52484d..94386d0 100644 (file)
@@ -29,6 +29,12 @@ class TRANSFORMATIONS_API ConvertReduceSumToPooling;
 }  // namespace pass
 }  // namespace ngraph
 
+class ConvertReduceBase : public ngraph::pass::MatcherPass {
+public:
+    template <class T>
+    ngraph::matcher_pass_callback convert_reduce_to_pooling();
+};
+
 class ngraph::pass::ConvertReduceToPooling: public ngraph::pass::GraphRewrite {
 public:
     ConvertReduceToPooling() {
@@ -38,10 +44,7 @@ public:
     }
 };
 
-template <class T>
-ngraph::matcher_pass_callback convert_reduce_to_pooling();
-
-class ngraph::pass::ConvertReduceMeanToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceMeanToPooling: public ConvertReduceBase {
 public:
     ConvertReduceMeanToPooling() {
         auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceMean>(), "ConvertReduceMean");
@@ -49,7 +52,7 @@ public:
     }
 };
 
-class ngraph::pass::ConvertReduceMaxToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceMaxToPooling: public ConvertReduceBase {
 public:
     ConvertReduceMaxToPooling() {
         auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceMax>(), "ConvertReduceMax");
@@ -57,7 +60,7 @@ public:
     }
 };
 
-class ngraph::pass::ConvertReduceSumToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceSumToPooling: public ConvertReduceBase {
 public:
     ConvertReduceSumToPooling() {
         auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceSum>(), "ConvertReduceSum");
@@ -66,10 +69,11 @@ public:
 };
 
 template <class T>
-ngraph::matcher_pass_callback convert_reduce_to_pooling() {
-    return [](ngraph::pattern::Matcher& m) {
+ngraph::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() {
+    return [&](ngraph::pattern::Matcher& m) {
         auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());
-        if (!reduce) {
+
+        if (!reduce || m_transformation_callback(reduce)) {
             return false;
         }
 
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reduce_ops.cpp
new file mode 100644 (file)
index 0000000..a100eaa
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/reduce_ops.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+    const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+    };
+
+    const std::vector<std::vector<size_t>> inputShapes = {
+        std::vector<size_t>{1, 2, 4, 4},
+        std::vector<size_t>{3, 2, 5, 6},
+    };
+
+    const std::vector<std::vector<int>> axes = {
+        {0, 2},
+        {1, 3}
+    };
+
+    std::vector<CommonTestUtils::OpType> opTypes = {
+        CommonTestUtils::OpType::SCALAR,
+        CommonTestUtils::OpType::VECTOR,
+    };
+
+    const std::vector<ngraph::helpers::ReductionType> reductionTypes = {
+        ngraph::helpers::ReductionType::Mean,
+        ngraph::helpers::ReductionType::Min,
+        ngraph::helpers::ReductionType::Max,
+        ngraph::helpers::ReductionType::Sum,
+        ngraph::helpers::ReductionType::Prod,
+    };
+
+    const auto paramsOneAxis = testing::Combine(
+        testing::Values(std::vector<int>{0}),
+        testing::ValuesIn(opTypes),
+        testing::Values(true, false),
+        testing::ValuesIn(reductionTypes),
+        testing::ValuesIn(netPrecisions),
+        testing::ValuesIn(inputShapes),
+        testing::Values(CommonTestUtils::DEVICE_GPU));
+
+    INSTANTIATE_TEST_CASE_P(
+        ReduceOneAxis,
+        ReduceOpsLayerTest,
+        paramsOneAxis,
+        ReduceOpsLayerTest::getTestCaseName);
+
+    const auto params = testing::Combine(
+        testing::ValuesIn(axes),
+        testing::Values(opTypes[1]),
+        testing::Values(true, false),
+        testing::ValuesIn(reductionTypes),
+        testing::ValuesIn(netPrecisions),
+        testing::ValuesIn(inputShapes),
+        testing::Values(CommonTestUtils::DEVICE_GPU));
+
+    INSTANTIATE_TEST_CASE_P(
+        Reduce,
+        ReduceOpsLayerTest,
+        params,
+        ReduceOpsLayerTest::getTestCaseName);
+
+}  // namespace
diff --git a/inference-engine/tests_deprecated/functional/cldnn/shared_tests_instance/single_layer_tests/reduce_ftests.cpp b/inference-engine/tests_deprecated/functional/cldnn/shared_tests_instance/single_layer_tests/reduce_ftests.cpp
deleted file mode 100644 (file)
index 29009c9..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "reduce_tests.hpp"
-
-INSTANTIATE_TEST_CASE_P(
-        smoke_GPU_TestsReduceSum, ReduceTestsShared,
-        ::testing::Values(
-        // Params: library_name, reduce_type, keep_dims, in_shape, input_tensor, axes_for_reduction, out_shape, reference
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0 },{ 1, 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ -3 },{ 1, 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2 },{ 2, 3, 1 },{ 10, 26, 42, 58, 74, 90 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4, 1, 1 },{},{ 2 },{ 2, 3, 1, 1, 1 },{ 10, 26, 42, 58, 74, 90 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ -1 },{ 2, 3, 1 },{ 10, 26, 42, 58, 74, 90 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, 2 },{ 1, 3, 1 },{ 68, 100, 132 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 1, 2 },{ 2, 1, 1 },{ 78, 222 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2, 1 },{ 2, 1, 1 },{ 78, 222 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, 1, 2 },{},{ 300 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, -2, 2 },{},{ 300 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2, 2, 0, 2, 0 },{ 1, 3, 1 },{ 68, 100, 132 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0 },{ 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ -3 },{ 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2 },{ 2, 3 },{ 10, 26, 42, 58, 74, 90 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ -1 },{ 2, 3 },{ 10, 26, 42, 58, 74, 90 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, 2 },{ 3 },{ 68, 100, 132 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 1, 2 },{ 2 },{ 78, 222 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2, 1 },{ 2 },{ 78, 222 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, 1, 2 },{},{ 300 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, -2, 2 },{},{ 300 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2, 2, 0, 2, 0 },{ 3 },{ 68, 100, 132 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", true,{ 1, 2, 3, 4, 1 },{},{ 1 },{ 1, 1, 3, 4, 1 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
-        reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 1, 2, 3, 4, 1 },{},{ 1 },{ 1, 3, 4, 1 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } }
-));
-
-INSTANTIATE_TEST_CASE_P(
-        smoke_GPU_TestsReduce, ReduceTestsShared,
-        ::testing::Values(
-        // Params: library_name, reduce_type, keep_dims, in_shape, input_tensor, axes_for_reduction, out_shape, reference
-        reduce_test_params{ "GPU", "FP32", "ReduceAnd", true,{ 2, 2, 2 },{1, 0, 1, 1, 0, 1, 1, 0},{ 2 },{ 2, 2, 1 },{ 0, 1, 0, 0} },
-        reduce_test_params{ "GPU", "FP32", "ReduceAnd", false, { 2, 2, 2 },{1, 0, 1, 1, 0, 1, 1, 0},{ 0, 1, 2 },{ },{ 0 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL1", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{ } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL1", true, { 3, 2, 2 },{},{ 2 },{ 3, 2, 1 },{ 3, 7, 11, 15, 19, 23 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL1", false, { 3, 2, 2 },{},{ 2 },{ 3, 2 },{ 3, 7, 11, 15, 19, 23 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL1", false, { 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 78 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL2", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceL2", true,{ 3, 2, 2 },{},{ 2 },{ 3, 2, 1 },{ 2.23606798f, 5.f, 7.81024968f, 10.63014581f, 13.45362405f, 16.2788206f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL2", false,{ 3, 2, 2 },{},{ 2 },{ 3, 2 },{ 2.23606798f, 5.f, 7.81024968f, 10.63014581f, 13.45362405f, 16.2788206f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceL2", false,{ 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 25.49509757f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSum", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSum", true,{ 3, 2, 2 },{ },{ 1 },{ 3, 1, 2 },{ } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSum", false,{ 3, 2, 2 },{ },{ 1 },{ 3, 2 },{ } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSum", false,{ 3, 2, 2 },{ },{ 0, 1, 2 },{},{ } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", true,{ 5, 5, 2 },{},{ 2 },{ 5, 5, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 20.f, 2.31326175f, 40.00004578f, 2.31326175f, 60.00671387f, 2.31326175f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 20.f, 2.31326175f, 40.00004578f, 2.31326175f, 60.00671387f, 2.31326175f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 60.00671387f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMax", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceMax", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 20, 2, 40, 2, 60, 2 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMax", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 20, 2, 40, 2, 60, 2 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMax", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 60 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMean", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceMean", true, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 12.5f, 1.5f, 35.f, 1.5f, 57.5f, 1.5f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMean", false, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 12.5f, 1.5f, 35.f, 1.5f, 57.5f, 1.5f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMean", false, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{ },{ 18.25f } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMin", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceMin", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 5, 1, 30, 1, 55, 1 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMin", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 5, 1, 30, 1, 55, 1 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceMin", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 1 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceOr", true,{ 2, 2, 2 },{1, 0, 1, 1, 0, 0, 1, 0},{ 2 },{ 2, 2, 1 },{1, 1, 0, 1 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceOr", false, { 2, 2, 2 },{},{ 0, 1, 2 },{ },{ 1 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceProd", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceProd", true,{ 3, 2, 2 },{},{ 1 },{ 3, 1, 2 },{ 3, 8, 35, 48, 99, 120 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceProd", false,{ 3, 2, 2 },{},{ 1 },{ 3, 2 },{ 3, 8, 35, 48, 99, 120 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceProd", false,{ 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 4.790016e+08 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
-        reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", true, { 3, 2, 2 },{},{ 1 },{ 3, 1, 2 },{ 10, 20, 74, 100, 202, 244 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", false, { 3, 2, 2 },{},{ 1 },{ 3, 2 },{ 10, 20, 74, 100, 202, 244 } },
-        reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", false, { 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 650 } }
-));
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.cpp
new file mode 100644 (file)
index 0000000..56cf279
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "reduce_kernel_b_fs_yx_fsv16.h"
+#include "kernel_selector_utils.h"
+#include <vector>
+#include <string>
+#include "common_tools.h"
+
+namespace kernel_selector {
+
+static const size_t SIMD = 16;
+using NDims = std::vector<kernel_selector::Tensor::Dim>;
+
+static size_t calc_read_offset(const reduce_params& params) {
+    auto read_offset = 1;
+    if (BytesPerElement(params.inputs[0].GetDType()) == 4)
+        read_offset = 4;
+    else if (BytesPerElement(params.inputs[0].GetDType()) == 2)
+        read_offset = 8;
+    else if (BytesPerElement(params.inputs[0].GetDType()) == 1)
+        read_offset = 16;
+    return read_offset;
+}
+
+static NDims calc_in_dims(const reduce_params& params) {
+    auto input = params.inputs[0];
+    auto in_dims = input.GetDims();
+    auto reduce_axes = params.reduceAxes;
+
+    std::vector<size_t> ordered_axes = {0, 1, 3, 2};
+    std::reverse(in_dims.begin(), in_dims.end());
+    for (size_t a = 0; a < params.reduceAxes.size(); a++) {
+        in_dims[ordered_axes[params.reduceAxes[a]]].v = 1;
+    }
+
+    return in_dims;
+}
+
+ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::INT32);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDifferentTypes();
+    return k;
+}
+
+CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& params, const optional_params&) const {
+    CommonDispatchData runInfo;
+
+    auto in_dims = calc_in_dims(params);
+    std::vector<size_t> global = {16,
+                                  CeilDiv(in_dims[3].v, calc_read_offset(params)) * in_dims[2].v,  // X, Y
+                                  CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v};                     // F, B
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = SIMD;
+    runInfo.lws1 = 1;
+    runInfo.lws2 = 1;
+
+    return runInfo;
+}
+
+JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& params) const {
+    auto jit = ReduceKernelBase::GetJitConstants(params);
+    auto in_dims = calc_in_dims(params);
+    auto read_offset = calc_read_offset(params);
+
+    // Universal output sizes for keep dims = true/false cases
+    jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_X", in_dims[3].v));
+    jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_Y", in_dims[2].v));
+    jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_FEATURE_NUM", in_dims[1].v));
+    jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_BATCH_NUM", in_dims[0].v));
+    jit.AddConstant(MakeJitConstant("READ_OFFSET", read_offset));
+    jit.AddConstant(MakeJitConstant("BLOCK_READ(ptr,offset)", "DT_INPUT_BLOCK_READ" + std::to_string(read_offset) + "(ptr,offset)"));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+    jit.Merge(MakeTypeJitConstants(GetFinalAccumulatorType(params), "FINAL_ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        std::vector<std::string> idx_order = {"b", "f", "y", "x"};
+        std::string var_name = "reduce_result";
+
+        bool cant_handle_vec16 = read_offset > 8 ? true : false;
+        size_t vec_size = cant_handle_vec16 ? 8 : read_offset;
+
+        FusedOpsConfiguration conf_scalar = {"_SCALAR",
+                                             idx_order,
+                                             var_name,
+                                             input_dt,
+                                             1,
+                                             LoadType::LT_ALIGNED_READ,
+                                             BoundaryCheck::DISABLED,
+                                             IndexType::TENSOR_COORD,
+                                             Tensor::DataChannelName::X};
+
+        if (cant_handle_vec16) {
+            FusedOpsConfiguration conf_vector_1 = {"_VECTOR_1",
+                                                   idx_order,
+                                                   var_name+".lo",
+                                                   input_dt,
+                                                   vec_size,
+                                                   LoadType::LT_ALIGNED_READ,
+                                                   BoundaryCheck::DISABLED,
+                                                   IndexType::TENSOR_COORD,
+                                                   Tensor::DataChannelName::X};
+
+            std::vector<std::string> idx_order_vec_2 = {"b", "f", "y", "x + 8"};
+            FusedOpsConfiguration conf_vector_2 = {"_VECTOR_2",
+                                                   idx_order_vec_2,
+                                                   var_name+".hi",
+                                                   input_dt,
+                                                   vec_size,
+                                                   LoadType::LT_ALIGNED_READ,
+                                                   BoundaryCheck::DISABLED,
+                                                   IndexType::TENSOR_COORD,
+                                                   Tensor::DataChannelName::X};
+
+            jit.AddConstant(MakeJitConstant("FUSED_OPS_VECTOR", "{FUSED_OPS_VECTOR_1;final_result.lo=FUSED_OPS_RESULT_VECTOR_1;} {FUSED_OPS_VECTOR_2;final_result.hi=FUSED_OPS_RESULT_VECTOR_2;}"));
+            jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT_VECTOR", "final_result"));
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vector_1, conf_vector_2}));
+        } else {
+            FusedOpsConfiguration conf_vector = {"_VECTOR",
+                                                 idx_order,
+                                                 var_name,
+                                                 input_dt,
+                                                 vec_size,
+                                                 LoadType::LT_ALIGNED_READ,
+                                                 BoundaryCheck::DISABLED,
+                                                 IndexType::TENSOR_COORD,
+                                                 Tensor::DataChannelName::X};
+
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf_vector, conf_scalar}));
+        }
+    }
+
+    return jit;
+}
+
+KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetCommonKernelsData(params, options, FORCE_PRIORITY_6);
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_b_fs_yx_fsv16.h
new file mode 100644 (file)
index 0000000..edc68af
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "reduce_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+class ReduceKernel_b_fs_yx_fsv16 : public ReduceKernelBase {
+public:
+    ReduceKernel_b_fs_yx_fsv16() : ReduceKernelBase("reduce_gpu_b_fs_yx_fsv16") {}
+    virtual ~ReduceKernel_b_fs_yx_fsv16() {}
+    virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const;
+    JitConstants GetJitConstants(const reduce_params& params) const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ELTWISE,
+                 FusedOpType::ACTIVATION };
+    }
+};
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.cpp
new file mode 100644 (file)
index 0000000..526080e
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
#include "reduce_kernel_base.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"

#include <algorithm>
#include <string>
#include <vector>
+
+namespace kernel_selector {
+
+bool ReduceKernelBase::Validate(const Params& p, const optional_params&) const {
+    auto& params = dynamic_cast<const reduce_params&>(p);
+
+    if (params.GetType() != KernelType::REDUCE) {
+        return false;
+    }
+
+    for (auto& fused_op : params.fused_ops) {
+        if (!IsFusedPrimitiveSupported(fused_op))
+            return false;
+    }
+
+    return true;
+}
+
// Builds the JIT constants shared by all Reduce kernels: the mode flag,
// per-axis REDUCE_* flags, the DIVIDER product of reduced dimension sizes,
// and the *_IDX_COMP(index) macros that decompose a linear output index
// into coordinates of the kept (non-reduced) dimensions.
JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) const {
    JitConstants jit = MakeBaseParamsJitConstants(params);

    // Total number of output elements; kernels use it as a bounds check.
    jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.output.LogicalSize()));
    // Expands to e.g. REDUCE_MEAN_MODE so the .cl code can #if on the mode.
    jit.AddConstant(MakeJitConstant("REDUCE_" + toString(params.reduceMode) + "_MODE", 1));
    jit.AddConstant(MakeJitConstant("KEEP_DIMS", params.keepDims));

    // NOTE(review): assumes LogicalDims() is x-first, so after the reverse
    // inputDims follows b, f, [w, z,] y, x order -- confirm against DataTensor.
    auto inputDims = params.inputs[0].LogicalDims();
    std::reverse(inputDims.begin(), inputDims.end());

    // Maps clDNN axis ids (0=b, 1=f, 2=x, 3=y, 4=z, 5=w) to positional
    // indices in the rank-dependent b, f, [w, z,] y, x order used above.
    auto convertAxesToIE = [&]() -> std::vector<int32_t> {
        std::vector<int32_t> res;
        auto sz = inputDims.size();

        for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
            switch (params.reduceAxes[i]) {
                case 0:
                    res.push_back(0);
                    break;
                case 1:
                    res.push_back(1);
                    break;
                case 2:
                    // x is always the last positional dim
                    res.push_back(sz == 6 ? 5 : sz == 5 ? 4 : 3);
                    break;
                case 3:
                    // y directly precedes x
                    res.push_back(sz == 6 ? 4 : sz == 5 ? 3 : 2);
                    break;
                case 4:
                    // z exists only for 5D/6D inputs
                    res.push_back(sz == 6 ? 3 : 2);
                    break;
                case 5:
                    // w exists only for 6D inputs
                    res.push_back(2);
                    break;
            }
        }
        return res;
    };

    // Returns the INPUT0_* size-macro suffix for a positional dim index,
    // depending on the input rank.
    auto getDimSizeNameByNum = [&](size_t dim) -> std::string {
        if (params.inputs[0].Dimentions() == 6) {
            switch (dim) {
                case 0:
                    return "BATCH_NUM";
                case 1:
                    return "FEATURE_NUM";
                case 2:
                    return "SIZE_W";
                case 3:
                    return "SIZE_Z";
                case 4:
                    return "SIZE_Y";
                case 5:
                    return "SIZE_X";
            }
        } else if (params.inputs[0].Dimentions() == 5) {
            switch (dim) {
                case 0:
                    return "BATCH_NUM";
                case 1:
                    return "FEATURE_NUM";
                case 2:
                    return "SIZE_Z";
                case 3:
                    return "SIZE_Y";
                case 4:
                    return "SIZE_X";
            }
        } else if (params.inputs[0].Dimentions() == 4) {
            switch (dim) {
                case 0:
                    return "BATCH_NUM";
                case 1:
                    return "FEATURE_NUM";
                case 2:
                    return "SIZE_Y";
                case 3:
                    return "SIZE_X";
            }
        }
        return "";
    };

    auto convertedAxes = convertAxesToIE();

    // DIVIDER = "SIZE_A*SIZE_B*..." product of the reduced dimensions'
    // sizes; the kernel divides by it in the MEAN final step.
    std::string divider;
    for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
        divider += "INPUT0_" + getDimSizeNameByNum(convertedAxes[i]);
        size_t range_check = i;
        // '*' only between factors, not after the last one
        if (++range_check < params.reduceAxes.size())
            divider += "*";
    }
    jit.AddConstant(MakeJitConstant("DIVIDER", divider));

    const size_t kept_dims = inputDims.size() - params.reduceAxes.size();
    if (kept_dims == 1) {
        // Single kept dim: the linear index is that dim's coordinate directly.
        for (size_t i = 0; i < inputDims.size(); ++i)
            if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end())
                jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "index"));
    } else {
        // Several kept dims: outermost coord = index / (product of inner kept
        // sizes), innermost = index % its size, middle = divide then modulo.
        size_t kept_cnt = 0;
        for (size_t i = 0; i < inputDims.size(); ++i) {
            if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end()) {
                if (kept_cnt == 0) {
                    std::string str = "(index ";
                    for (size_t j = i + 1; j < inputDims.size(); ++j) {
                        if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
                            str += "/ INPUT0_" + getDimSizeNameByNum(j);
                        }
                    }
                    str += ")";
                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
                } else if (kept_cnt == kept_dims - 1) {
                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)",
                                                    "(index % INPUT0_" + getDimSizeNameByNum(i) + ")"));
                } else {
                    std::string str = "(index ";
                    for (size_t j = i + 1; j < inputDims.size(); ++j) {
                        if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
                            str += "/ INPUT0_" + getDimSizeNameByNum(j);
                        }
                    }
                    str += " % INPUT0_" + getDimSizeNameByNum(i) + ")";
                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
                }
                kept_cnt += 1;
            }
        }
    }

    // One boolean flag per reduced clDNN axis, consumed by the .cl code.
    for (size_t a = 0; a < params.reduceAxes.size(); a++) {
        switch (params.reduceAxes[a]) {
            case 0:
                jit.AddConstant(MakeJitConstant("REDUCE_BATCH", 1));
                break;
            case 1:
                jit.AddConstant(MakeJitConstant("REDUCE_FEATURE", 1));
                break;
            case 2:
                jit.AddConstant(MakeJitConstant("REDUCE_X", 1));
                break;
            case 3:
                jit.AddConstant(MakeJitConstant("REDUCE_Y", 1));
                break;
            case 4:
                jit.AddConstant(MakeJitConstant("REDUCE_Z", 1));
                break;
            case 5:
                jit.AddConstant(MakeJitConstant("REDUCE_W", 1));
                break;
        }
    }

    return jit;
}
+
+Datatype ReduceKernelBase::GetAccumulatorType(const reduce_params& params) const {
+    const auto& input_dt = params.inputs[0].GetDType();
+    const auto& reduce_mode = params.reduceMode;
+
+    if (reduce_mode == ReduceMode::MAX || reduce_mode == ReduceMode::MIN) {
+        return input_dt;
+    } else {
+        switch (input_dt) {
+            case Datatype::F32: return Datatype::F32;
+            case Datatype::F16: return Datatype::F32;
+            case Datatype::INT8: return Datatype::INT32;
+            case Datatype::UINT8: return Datatype::INT32;
+            default: return Datatype::F32;
+        }
+    }
+}
+
+Datatype ReduceKernelBase::GetFinalAccumulatorType(const reduce_params& params) const {
+    const auto& reduce_mode = params.reduceMode;
+
+    if (reduce_mode == ReduceMode::MEAN || reduce_mode  == ReduceMode::LOG_SUM_EXP ||
+        reduce_mode == ReduceMode::LOG_SUM || reduce_mode == ReduceMode::L2 || reduce_mode == ReduceMode::L1) {
+            return Datatype::F32;
+    } else
+        return GetAccumulatorType(params);
+}
+
+Datatype ReduceKernelBase::GetActivationType(const reduce_params& params) const {
+    if (params.output.GetDType() == Datatype::F16)
+        return Datatype::F16;
+    else
+        return Datatype::F32;
+}
+
// Shared KernelsData assembly for all Reduce kernels: validate the params,
// let the derived class choose dispatch sizes, build the JIT constants and
// fill a single CL kernel entry.  Returns {} when the params are rejected.
KernelsData ReduceKernelBase::GetCommonKernelsData(const Params& p,
                                                   const optional_params& options,
                                                   float estimatedTime) const {
    if (!Validate(p, options)) {
        return {};
    }

    const reduce_params& params = static_cast<const reduce_params&>(p);
    // Derived class supplies global/local work-group sizes.
    DispatchData runInfo = SetDefault(params, options);

    KernelData kd = KernelData::Default<reduce_params>(params);

    auto cldnn_jit = GetJitConstants(params);
    auto entry_point = GetEntryPoint(kernelName, params.layerID, options);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

    auto& kernel = kd.kernels[0];
    // One tensor input plus any extra inputs the fused primitives require;
    // the two 'false' flags appear to disable weights/bias arguments --
    // TODO(review): confirm against the FillCLKernelData signature.
    FillCLKernelData(kernel,
                     runInfo,
                     params.engineInfo,
                     kernelName,
                     jit,
                     entry_point,
                     DEFAULT,
                     false,
                     false,
                     1,
                     GetFusedPrimitiveInputsCount(params));
    kd.estimatedTime = estimatedTime;

    return {kd};
}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_base.h
new file mode 100644 (file)
index 0000000..143d6af
--- /dev/null
@@ -0,0 +1,60 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "common_kernel_base.h"
+#include "kernel_selector_params.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// reduce_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Parameters of a Reduce primitive handed to the kernel selector.
struct reduce_params : public base_params {
    reduce_params() : base_params(KernelType::REDUCE), reduceMode(ReduceMode::MAX), keepDims(0) {}

    ReduceMode reduceMode;             // reduction operation (MAX, MEAN, SUM, ...)
    std::vector<uint16_t> reduceAxes;  // clDNN axis ids to reduce over
    int32_t keepDims;                  // non-zero: reduced dims stay in the output (presumably as size 1) -- confirm

    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// reduce_optional_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Optional (tuning) parameters for Reduce; no extra fields beyond the base.
struct reduce_optional_params : optional_params {
    reduce_optional_params() : optional_params(KernelType::REDUCE) {}
};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// ReduceKernelBase
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common base of all Reduce kernel implementations; owns parameter
// validation, shared JIT-constant generation and KernelsData assembly.
class ReduceKernelBase : public common_kernel_base {
public:
    using common_kernel_base::common_kernel_base;
    using DispatchData = CommonDispatchData;

    virtual ~ReduceKernelBase() {}

protected:
    bool Validate(const Params&, const optional_params&) const override;
    // JIT constants common to every reduce kernel (mode, axes, DIVIDER, ...).
    virtual JitConstants GetJitConstants(const reduce_params& params) const;
    // Each derived kernel must choose its own dispatch (GWS/LWS) layout.
    virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const = 0;
    // Type used while folding elements (widened for accumulating modes).
    Datatype GetAccumulatorType(const reduce_params& p) const;
    // Type used for the final divide/sqrt/log step.
    Datatype GetFinalAccumulatorType(const reduce_params& p) const;
    // Precision for fused-op/activation math.
    Datatype GetActivationType(const reduce_params& params) const;
    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
};
+}  // namespace kernel_selector
index cdb53d8..ca26a37 100644 (file)
@@ -18,6 +18,7 @@
 #include "kernel_selector_utils.h"
 #include <vector>
 #include <string>
+#include "common_tools.h"
 
 namespace kernel_selector {
 ParamsKey ReduceKernelRef::GetSupportedKey() const {
@@ -27,19 +28,13 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::INT32);
     k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::UINT8);
-
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::UINT8);
-
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::bfzyx);
-    k.EnableOutputLayout(DataLayout::bfzyx);
-    k.EnableInputLayout(DataLayout::bfwzyx);
-    k.EnableOutputLayout(DataLayout::bfwzyx);
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
@@ -50,7 +45,9 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
 CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
     CommonDispatchData runInfo;
 
-    std::vector<size_t> global = {params.output.LogicalSize(), 1, 1};
+    std::vector<size_t> global = {params.output.X().v * params.output.Y().v,
+                                  params.output.Z().v * params.output.W().v,
+                                  params.output.Batch().v * params.output.Feature().v};
 
     auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
 
@@ -66,129 +63,40 @@ CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, cons
 }
 
 JitConstants ReduceKernelRef::GetJitConstants(const reduce_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.output.LogicalSize()));
-    jit.AddConstant(MakeJitConstant("REDUCE_" + toString(params.reduceMode) + "_MODE", 1));
-    jit.AddConstant(MakeJitConstant("KEEP_DIMS", params.keepDims));
-
-    auto inputDims = params.inputs[0].LogicalDims();
-    std::reverse(inputDims.begin(), inputDims.end());
-
-    auto convertAxesToIE = [&]() -> std::vector<int32_t> {
-        std::vector<int32_t> res;
-        auto sz = inputDims.size();
-
-        for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
-            switch (params.reduceAxes[i]) {
-                case 0: res.push_back(0); break;
-                case 1: res.push_back(1); break;
-                case 2: res.push_back(sz == 6 ? 5 : sz == 5 ? 4 : 3); break;
-                case 3: res.push_back(sz == 6 ? 4 : sz == 5 ? 3 : 2); break;
-                case 4: res.push_back(sz == 6 ? 3 : 2); break;
-                case 5: res.push_back(2); break;
-            }
-        }
-        return res;
-    };
-
-    auto getDimSizeNameByNum = [&](size_t dim) -> std::string {
-        if (params.inputs[0].GetLayout() == DataLayout::bfwzyx) {
-            switch (dim) {
-                case 0: return "BATCH_NUM";
-                case 1: return "FEATURE_NUM";
-                case 2: return "SIZE_W";
-                case 3: return "SIZE_Z";
-                case 4: return "SIZE_Y";
-                case 5: return "SIZE_X";
-            }
-        } else if (params.inputs[0].GetLayout() == DataLayout::bfzyx) {
-            switch (dim) {
-                case 0: return "BATCH_NUM";
-                case 1: return "FEATURE_NUM";
-                case 2: return "SIZE_Z";
-                case 3: return "SIZE_Y";
-                case 4: return "SIZE_X";
-            }
-        } else if (params.inputs[0].GetLayout() == DataLayout::bfyx) {
-            switch (dim) {
-                case 0: return "BATCH_NUM";
-                case 1: return "FEATURE_NUM";
-                case 2: return "SIZE_Y";
-                case 3: return "SIZE_X";
-            }
-        }
-        return "";
-    };
-
-    auto convertedAxes = convertAxesToIE();
-
-    const size_t kept_dims = inputDims.size() - params.reduceAxes.size();
-    if (kept_dims == 1) {
-        for (size_t i = 0; i < inputDims.size(); ++i)
-            if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end())
-                jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "index"));
-    } else {
-        size_t kept_cnt = 0;
-        for (size_t i = 0; i < inputDims.size(); ++i) {
-            if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end()) {
-                if (kept_cnt == 0) {
-                    std::string str = "(index ";
-                    for (size_t j = i + 1; j < inputDims.size(); ++j) {
-                        if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
-                            str += "/ INPUT0_" + getDimSizeNameByNum(j);
-                        }
-                    }
-                    str += ")";
-                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
-                } else if (kept_cnt == kept_dims - 1) {
-                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "(index % INPUT0_" + getDimSizeNameByNum(i) + ")"));
-                } else {
-                    std::string str = "(index ";
-                    for (size_t j = i + 1; j < inputDims.size(); ++j) {
-                        if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
-                            str += "/ INPUT0_" + getDimSizeNameByNum(j);
-                        }
-                    }
-                    str += " % INPUT0_" + getDimSizeNameByNum(i) + ")";
-                    jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
-                }
-                kept_cnt += 1;
-            }
-        }
-    }
+    auto jit = ReduceKernelBase::GetJitConstants(params);
+
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+    jit.Merge(MakeTypeJitConstants(GetFinalAccumulatorType(params), "FINAL_ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
 
-    for (size_t a = 0; a < params.reduceAxes.size(); a++) {
-        switch (params.reduceAxes[a]) {
-            case 0: jit.AddConstant(MakeJitConstant("REDUCE_BATCH", 1)); break;
-            case 1: jit.AddConstant(MakeJitConstant("REDUCE_FEATURE", 1)); break;
-            case 2: jit.AddConstant(MakeJitConstant("REDUCE_X", 1)); break;
-            case 3: jit.AddConstant(MakeJitConstant("REDUCE_Y", 1)); break;
-            case 4: jit.AddConstant(MakeJitConstant("REDUCE_Z", 1)); break;
-            case 5: jit.AddConstant(MakeJitConstant("REDUCE_W", 1)); break;
+        std::vector<std::string> idx_order;
+        switch (DataTensor::ChannelsCount(params.inputs[0].GetLayout())) {
+            case 6: idx_order = {"b", "f", "w", "z", "y", "x" }; break;
+            case 5: idx_order = {"b", "f", "z", "y", "x" }; break;
+            default: idx_order = {"b", "f", "y", "x" }; break;
         }
+
+        FusedOpsConfiguration conf = {"",
+                                      idx_order,
+                                      "reduce_result",
+                                      input_dt,
+                                      1,
+                                      LoadType::LT_UNALIGNED,
+                                      BoundaryCheck::DISABLED,
+                                      IndexType::TENSOR_COORD,
+                                      Tensor::DataChannelName::X};
+
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
     }
 
     return jit;
 }
 
 KernelsData ReduceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelData kd = KernelData::Default<reduce_params>(params);
-    reduce_params& newParams = *static_cast<reduce_params*>(kd.params.get());
-
-    assert(params.GetType() == KernelType::REDUCE);
-
-    auto runInfo = SetDefault(newParams, options);
-    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
-    auto cldnn_jit = GetJitConstants(newParams);
-    std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-
-    kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    return {kd};
+    return GetCommonKernelsData(params, options, DONT_USE_IF_HAVE_SOMETHING_ELSE);
 }
+
 }  // namespace kernel_selector
index 955b42b..f54af53 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #pragma once
 
-#include "common_kernel_base.h"
+#include "reduce_kernel_base.h"
 #include <vector>
 
 namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// reduce_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct reduce_params : public base_params {
-    reduce_params() : base_params(KernelType::REDUCE), reduceMode(ReduceMode::MAX), keepDims(0) {}
-
-    ReduceMode reduceMode;
-    std::vector<uint16_t> reduceAxes;
-    int32_t keepDims;
-
-    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// reduce_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct reduce_optional_params : optional_params {
-    reduce_optional_params() : optional_params(KernelType::REDUCE) {}
-};
-
-class ReduceKernelRef : public common_kernel_base {
+class ReduceKernelRef : public ReduceKernelBase {
 public:
-    ReduceKernelRef() : common_kernel_base("reduce_ref") {}
+    ReduceKernelRef() : ReduceKernelBase("reduce_ref") {}
     virtual ~ReduceKernelRef() {}
-    virtual JitConstants GetJitConstants(const reduce_params& params) const;
     virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const;
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
+    JitConstants GetJitConstants(const reduce_params& params) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ELTWISE,
+                 FusedOpType::ACTIVATION };
+    }
 };
 }  // namespace kernel_selector
index a856d9a..763506f 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #include "reduce_kernel_selector.h"
 #include "reduce_kernel_ref.h"
+#include "reduce_kernel_b_fs_yx_fsv16.h"
 
 namespace kernel_selector {
 
-reduce_kernel_selector::reduce_kernel_selector() { Attach<ReduceKernelRef>(); }
// Registers all available Reduce kernel implementations with the selector.
reduce_kernel_selector::reduce_kernel_selector() {
    Attach<ReduceKernelRef>();
    Attach<ReduceKernel_b_fs_yx_fsv16>();
}
 
 KernelsData reduce_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
     return GetNaiveBestKernel(params, options, KernelType::REDUCE);
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl
new file mode 100644 (file)
index 0000000..1d5f859
--- /dev/null
@@ -0,0 +1,413 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
#include "include/include_all.cl"

// Sub-group width and the feature-slice size of the b_fs_yx_fsv16 layout.
#define SIMD 16
#define FSV 16
// Loop unrolled by the per-work-item read width.
#define unroll_for  __attribute__((opencl_unroll_hint(READ_OFFSET))) for

#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))

// Default every per-axis flag to 0 so they can be combined with &&/|| in
// plain #if expressions below.
#if !defined REDUCE_BATCH
    #define REDUCE_BATCH 0
#endif
#if !defined REDUCE_FEATURE
    #define REDUCE_FEATURE 0
#endif
#if !defined REDUCE_Y
    #define REDUCE_Y 0
#endif
#if !defined REDUCE_X
    #define REDUCE_X 0
#endif

// Vector types sized by READ_OFFSET for input, accumulation and output.
#define INPUT_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_OFFSET)

#define ACCUMULATOR_VEC MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, READ_OFFSET)
#define TO_ACCUMULATOR_VEC CAT(convert_, ACCUMULATOR_VEC)
#define FINAL_ACCUMULATOR_VEC MAKE_VECTOR_TYPE(FINAL_ACCUMULATOR_TYPE, READ_OFFSET)

#define ACTIVATION_VEC MAKE_VECTOR_TYPE(ACTIVATION_TYPE, READ_OFFSET)
#define TO_ACTIVATION_VEC CAT(convert_, ACTIVATION_VEC)

#define OUTPUT_VEC MAKE_VECTOR_TYPE(OUTPUT_TYPE, READ_OFFSET)
#define TO_OUTPUT_VEC CAT(convert_, OUTPUT_VEC)

// True for every axis combination that reduces Y but not X.
#define REDUCE_BFY_BY_FY_Y          REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y && !REDUCE_X || REDUCE_BATCH && REDUCE_Y && !REDUCE_FEATURE && !REDUCE_X || \
                                    REDUCE_FEATURE && REDUCE_Y && !REDUCE_BATCH && !REDUCE_X|| REDUCE_Y && !REDUCE_BATCH && !REDUCE_FEATURE && !REDUCE_X

// True when only the feature axis is reduced.
#define REDUCE_F                    REDUCE_FEATURE && !REDUCE_BATCH && !REDUCE_Y && !REDUCE_X

// Feature reduction spans the sub-group lanes, so a cross-lane step is needed.
#define NEED_SUB_GROUP_REDUCE       REDUCE_FEATURE

// Neutral element of each reduce mode, in accumulator and input precision.
#if REDUCE_MAX_MODE
    #define INIT_VAL ACCUMULATOR_VAL_MIN
    #define INPUT_INIT_VAL INPUT0_VAL_MIN
#elif REDUCE_MIN_MODE
    #define INIT_VAL ACCUMULATOR_VAL_MAX
    #define INPUT_INIT_VAL INPUT0_VAL_MAX
#elif REDUCE_PROD_MODE || REDUCE_AND_MODE
    #define INIT_VAL ACCUMULATOR_VAL_ONE
    #define INPUT_INIT_VAL INPUT0_VAL_ONE
#else
    #define INIT_VAL ACCUMULATOR_VAL_ZERO
    #define INPUT_INIT_VAL INPUT0_VAL_ZERO
#endif
+
// Folds one input element into the running accumulator according to the
// compile-time selected reduce mode.
inline ACCUMULATOR_TYPE FUNC(apply_reduce)(ACCUMULATOR_TYPE acc, ACCUMULATOR_TYPE input) {
    #if REDUCE_SUM_MODE || REDUCE_MEAN_MODE || REDUCE_LOG_SUM_MODE
        acc += input;
    #elif REDUCE_MAX_MODE
        acc = ACCUMULATOR_MAX_FUNC(acc, input);
    #elif REDUCE_MIN_MODE
        acc = ACCUMULATOR_MIN_FUNC(acc, input);
    #elif REDUCE_PROD_MODE
        acc *= input;
    #elif REDUCE_AND_MODE
        acc = acc && input;
    #elif REDUCE_OR_MODE
        acc = acc || input;
    #elif REDUCE_SUM_SQUARE_MODE || REDUCE_L2_MODE
        acc += input * input;
    #elif REDUCE_L1_MODE
        // Integer accumulators: take |x| in floating point, then convert back.
        #if !INPUT0_IS_FP
            acc += TO_ACCUMULATOR_TYPE(fabs(TO_FINAL_ACCUMULATOR_TYPE(input)));
        #else
            acc += fabs(input);
        #endif
    #elif REDUCE_LOG_SUM_EXP_MODE
        // Integer accumulators: take exp(x) in floating point, then convert back.
        #if !INPUT0_IS_FP
            acc += TO_ACCUMULATOR_TYPE(exp(TO_FINAL_ACCUMULATOR_TYPE(input)));
        #else
            acc += exp(input);
        #endif
    #endif

    return acc;
}
+
// Combines the per-lane accumulators of a sub-group into a single value.
// Compiled in only when the feature axis is reduced (NEED_SUB_GROUP_REDUCE);
// otherwise each lane keeps its own accumulator untouched.
inline ACCUMULATOR_TYPE FUNC(sub_group_reduce)(ACCUMULATOR_TYPE acc) {
    #if NEED_SUB_GROUP_REDUCE
        #if REDUCE_SUM_MODE || REDUCE_MEAN_MODE || REDUCE_LOG_SUM_MODE
            acc = sub_group_reduce_add(acc);
        #elif REDUCE_MAX_MODE
            acc = sub_group_reduce_max(acc);
        #elif REDUCE_MIN_MODE
            acc = sub_group_reduce_min(acc);
        #elif REDUCE_PROD_MODE
            // No built-in product reduction: shuffle-down tree over the 16
            // lanes (8/4/2/1), then broadcast lane 0's result to all lanes.
            ACCUMULATOR_TYPE next = ACCUMULATOR_VAL_ONE;
            acc *= intel_sub_group_shuffle_down(acc, next, 8);
            acc *= intel_sub_group_shuffle_down(acc, next, 4);
            acc *= intel_sub_group_shuffle_down(acc, next, 2);
            acc *= intel_sub_group_shuffle_down(acc, next, 1);
            acc  = intel_sub_group_shuffle(acc, 0);
        #elif REDUCE_AND_MODE
            acc = sub_group_all(acc);
        #elif REDUCE_OR_MODE
            acc = sub_group_any(acc);
        #elif REDUCE_SUM_SQUARE_MODE || REDUCE_L2_MODE
            acc = sub_group_reduce_add(acc);
        #elif REDUCE_L1_MODE
            acc = sub_group_reduce_add(acc);
        #elif REDUCE_LOG_SUM_EXP_MODE
            acc = sub_group_reduce_add(acc);
        #endif
    #endif

    return acc;
}
+
// Final post-processing of the accumulated value:
//   MEAN             -> divide by DIVIDER (product of reduced dim sizes)
//   L2               -> sqrt of the sum of squares
//   LOG_SUM(_EXP)    -> log of the accumulated sum
// All other modes pass the accumulator through unchanged.
inline FINAL_ACCUMULATOR_TYPE FUNC(final_reduce)(FINAL_ACCUMULATOR_TYPE acc) {
    #if REDUCE_MEAN_MODE
        acc /= DIVIDER;
    #elif REDUCE_L2_MODE
        acc = sqrt(acc);
    #elif REDUCE_LOG_SUM_MODE || REDUCE_LOG_SUM_EXP_MODE
        acc = log(acc);
    #endif

    return acc;
}
+
+inline uint FUNC(calc_linear_offset)(uint b, uint f, uint y, uint x) {
+    uint index = b * COMMON_OUTPUT_SIZE_X * COMMON_OUTPUT_SIZE_Y * COMMON_OUTPUT_FEATURE_NUM +
+                 f * COMMON_OUTPUT_SIZE_X * COMMON_OUTPUT_SIZE_Y +
+                 y * COMMON_OUTPUT_SIZE_X +
+                 x;
+
+    return index;
+}
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+KERNEL(reduce_fsv16)(
+    const __global INPUT0_TYPE* data,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
+{
+    const uint xy   = (uint)get_global_id(1) * READ_OFFSET;
+    const uint x    = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
+    const uint y    = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
+    const uint bf   = (uint)get_global_id(2) * SIMD;
+    const uint b    = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
+    const uint f    = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
+
+#if KEEP_DIMS
+    const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+#else
+    #if REDUCE_BATCH && REDUCE_FEATURE && REDUCE_X                                      // BFX
+        const uint out_idx = OUTPUT_GET_INDEX(y, x, b, f);
+    #elif REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y                                    // BFY
+        const uint out_idx = OUTPUT_GET_INDEX(x, b, f, y);
+    #elif REDUCE_FEATURE && REDUCE_X                                                    // FX
+        const uint out_idx = OUTPUT_GET_INDEX(b, y, f, x);
+    #elif REDUCE_BATCH && REDUCE_X                                                      // BX
+        const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), y, b, x);
+    #elif REDUCE_BATCH && REDUCE_Y                                                      // BY
+        const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), x, b, y);
+    #elif REDUCE_FEATURE && REDUCE_Y                                                    // FY
+        const uint out_idx = OUTPUT_GET_INDEX(b, x, f, y);
+    #elif REDUCE_BATCH && REDUCE_FEATURE                                                // BF
+        const uint out_idx = OUTPUT_GET_INDEX(y, x, b, f);
+    #elif REDUCE_FEATURE                                                                // F
+        const uint out_idx = OUTPUT_GET_INDEX(b + get_sub_group_local_id(), y, x, f);
+    #elif REDUCE_BATCH                                                                  // B
+        const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), y, x, b);
+    #elif REDUCE_Y                                                                      // Y
+        const uint out_idx = OUTPUT_GET_INDEX(b, f, x, y);
+    #else
+        const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+    #endif
+#endif
+
+    const uint linear_idx = FUNC_CALL(calc_linear_offset)(b, f, y, x);
+    if (linear_idx >= COMPUTATIONAL_OPERATIONS_NUMBER)
+        return;
+
+    const uint input_x_pitch = FSV;
+    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
+    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
+    const uint input_batch_pitch = input_fs_pitch * ((INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM + FSV - 1) / FSV);
+
+    const uint output_x_pitch = FSV;
+    const uint output_y_pitch = FSV * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
+
+#if REDUCE_BATCH
+    const uint batch_out = 0;
+    const uint batch_max_val = INPUT0_BATCH_NUM;
+#else
+    const uint batch_out = BATCH_NUM_IDX_COMP(linear_idx);
+    const uint batch_max_val = batch_out + 1;
+#endif
+
+#if REDUCE_FEATURE
+    const uint feature_out = 0;
+    const uint feature_max_val = INPUT0_FEATURE_NUM;
+#else
+    const uint feature_out = FEATURE_NUM_IDX_COMP(linear_idx);
+    const uint feature_max_val = feature_out + 1;
+#endif
+
+#if REDUCE_Y
+    const uint y_out = 0;
+    const uint y_max_val = INPUT0_SIZE_Y;
+#else
+    const uint y_out = SIZE_Y_IDX_COMP(linear_idx);
+    const uint y_max_val = y_out + 1;
+#endif
+
+#if REDUCE_X
+    const uint x_out = 0;
+    const uint x_max_val = INPUT0_SIZE_X / READ_OFFSET;
+    const uint x_leftover_start = x_max_val * READ_OFFSET;
+    const uint x_leftover_end = INPUT0_SIZE_X;
+#else
+    const uint x_out = SIZE_X_IDX_COMP(linear_idx);
+    const uint x_max_val = x_out + 1;
+    const uint x_leftover_start = x_out;
+    const uint x_leftover_end = x_max_val;
+#endif
+
+    uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) * input_fs_pitch + y_out * input_y_pitch + x_out * input_x_pitch;
+
+#if REDUCE_X
+    ACCUMULATOR_TYPE acc = INIT_VAL;
+    for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+        for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
+            for (uint yi = y_out; yi < y_max_val; ++yi) {
+                for (uint xi = x_out; xi < x_max_val; ++xi) {
+                    INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
+                    #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                        if (fi + FSV <= INPUT0_FEATURE_NUM)
+                            input = BLOCK_READ(data, offset);
+                        else
+                            if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+                                for (int i = 0; i < READ_OFFSET; ++i)
+                                    input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
+                    #else
+                        input = BLOCK_READ(data, offset);
+                    #endif
+                    unroll_for (int i = 0; i < READ_OFFSET; ++i)
+                        acc = FUNC_CALL(apply_reduce)(acc, input[i]);
+                    offset += input_x_pitch * READ_OFFSET;
+                }
+                #if INPUT0_SIZE_X % READ_OFFSET != 0
+                    for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
+                        INPUT0_TYPE leftovers = INIT_VAL;
+                        #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                            if (fi + FSV <= INPUT0_FEATURE_NUM)
+                                leftovers = DT_INPUT_BLOCK_READ(data, offset);
+                            else
+                                if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+                                    leftovers = data[offset + get_sub_group_local_id()];
+                        #else
+                            leftovers = DT_INPUT_BLOCK_READ(data, offset);
+                        #endif
+                        acc = FUNC_CALL(apply_reduce)(acc, leftovers);
+                        offset += input_x_pitch;
+                    }
+                #endif
+                offset += input_y_pitch - INPUT0_SIZE_X * input_x_pitch;
+            }
+            offset += input_fs_pitch - ((y_max_val - y_out) * input_y_pitch);
+        }
+        offset += input_batch_pitch - ((((feature_max_val - feature_out) + FSV - 1) / FSV) * input_fs_pitch);
+    }
+
+    FINAL_ACCUMULATOR_TYPE final_acc;
+    acc = FUNC_CALL(sub_group_reduce)(acc);
+    final_acc = FUNC_CALL(final_reduce)(TO_FINAL_ACCUMULATOR_TYPE(acc));
+
+    OUTPUT_TYPE final_result;
+    ACTIVATION_TYPE reduce_result = TO_ACTIVATION_TYPE(final_acc);
+    #if HAS_FUSED_OPS
+        FUSED_OPS_SCALAR;
+        final_result = FUSED_OPS_RESULT_SCALAR;
+    #else
+        final_result = TO_OUTPUT_TYPE(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+    #endif
+
+    #if (REDUCE_FEATURE && REDUCE_X || REDUCE_BATCH && REDUCE_X) && !KEEP_DIMS
+        output[out_idx] = final_result;
+    #elif REDUCE_BATCH && REDUCE_Y && REDUCE_X || REDUCE_BATCH && REDUCE_X || REDUCE_Y && REDUCE_X || REDUCE_X && !REDUCE_FEATURE
+        DT_OUTPUT_BLOCK_WRITE(output + out_idx, 0, final_result);
+    #else
+        if (get_sub_group_local_id() == 0)
+            output[out_idx] = final_result;
+    #endif
+#else
+    ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
+    for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+        for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
+            for (uint yi = y_out; yi < y_max_val; ++yi) {
+                for (uint xi = x_out; xi < x_max_val; ++xi) {
+                    INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
+                    #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                        if (fi + FSV <= INPUT0_FEATURE_NUM)
+                            input = BLOCK_READ(data, offset);
+                        else
+                            if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+                                for (int i = 0; i < READ_OFFSET; ++i)
+                                    input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
+                    #else
+                        input = BLOCK_READ(data, offset);
+                    #endif
+                    unroll_for (int i = 0; i < READ_OFFSET; ++i)
+                        acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
+                    offset += input_x_pitch;
+                }
+                offset += input_y_pitch - (x_max_val - x_out) * input_x_pitch;
+            }
+            offset += input_fs_pitch - ((y_max_val - y_out) * input_y_pitch);
+        }
+        offset += input_batch_pitch - ((((feature_max_val - feature_out) + FSV - 1) / FSV) * input_fs_pitch);
+    }
+
+    FINAL_ACCUMULATOR_VEC final_acc;
+    unroll_for (uint i = 0; i < READ_OFFSET; ++i) {
+        acc[i] = FUNC_CALL(sub_group_reduce)(acc[i]);
+        final_acc[i] = FUNC_CALL(final_reduce)(TO_FINAL_ACCUMULATOR_TYPE(acc[i]));
+    }
+
+    OUTPUT_VEC final_result;
+    ACTIVATION_VEC reduce_result = TO_ACTIVATION_VEC(final_acc);
+
+#if HAS_FUSED_OPS
+    FUSED_OPS_VECTOR;
+    final_result = (OUTPUT_VEC)(FUSED_OPS_RESULT_VECTOR);
+#else
+    final_result = TO_OUTPUT_VEC(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+#endif
+
+    unroll_for (uint i = 0; i < READ_OFFSET; ++i) {
+        if(COMMON_OUTPUT_SIZE_X % READ_OFFSET == 0 || x + i < COMMON_OUTPUT_SIZE_X) {
+            #if REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y && !REDUCE_X && !KEEP_DIMS
+                output[out_idx + output_x_pitch * i] = final_result[i];
+            #elif REDUCE_FEATURE && REDUCE_Y && !KEEP_DIMS
+                if (get_sub_group_local_id() == 0)
+                    output[out_idx + i] = final_result[i];
+            #elif REDUCE_BATCH && REDUCE_Y && !KEEP_DIMS
+                    output[out_idx + i] = final_result[i];
+            #elif REDUCE_BATCH && REDUCE_Y && REDUCE_X && !KEEP_DIMS
+                    output[out_idx + get_sub_group_local_id() + output_y_pitch * i] = final_result[i];
+            #elif REDUCE_BFY_BY_FY_Y
+                    output[out_idx + get_sub_group_local_id() + output_x_pitch * i] = final_result[i];
+            #elif REDUCE_BATCH && REDUCE_FEATURE && !KEEP_DIMS
+                if (get_sub_group_local_id() == 0)
+                    output[out_idx + i] = final_result[i];
+            #elif REDUCE_BATCH && !KEEP_DIMS
+                    output[out_idx + output_y_pitch * i] = final_result[i];
+            #elif REDUCE_BATCH && !REDUCE_FEATURE
+                    DT_OUTPUT_BLOCK_WRITE(output + out_idx + output_x_pitch * i, 0, final_result[i]);
+            #elif REDUCE_BATCH && REDUCE_FEATURE
+                    if (get_sub_group_local_id() == 0)
+                        output[out_idx + output_x_pitch * i] = final_result[i];
+            #elif REDUCE_F && !KEEP_DIMS
+                    if (get_sub_group_local_id() == 0)
+                        output[out_idx + output_y_pitch * i] = final_result[i];
+            #elif REDUCE_F
+                    if (get_sub_group_local_id() == 0)
+                        output[out_idx + output_x_pitch * i] = final_result[i];
+            #endif
+        }
+    }
+#endif
+}
+
+#undef SIMD
+#undef FSV
+#undef unroll_for
+#undef BLOCK_READ
+#undef READ_OFFSET
+#undef INPUT_VEC
+#undef ACCUMULATOR_VEC
+#undef TO_ACCUMULATOR_VEC
+#undef FINAL_ACCUMULATOR_VEC
+#undef ACTIVATION_VEC
+#undef TO_ACTIVATION_VEC
+#undef OUTPUT_VEC
+#undef TO_OUTPUT_VEC
+#undef REDUCE_BFY_BY_FY_Y
+#undef REDUCE_F
+#undef NEED_SUB_GROUP_REDUCE
+#undef INIT_VAL
+#undef INPUT_INIT_VAL
+#undef REDUCE_BATCH
+#undef REDUCE_FEATURE
+#undef REDUCE_Y
+#undef REDUCE_X
index f07dcec..569e3ab 100644 (file)
 
 #include "include/include_all.cl"
 
-KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* output) {
-    const uint out_idx = get_global_id(0);
+inline uint FUNC(calc_linear_offset)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+    uint index = b * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W * OUTPUT_FEATURE_NUM +
+                 f * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W +
+                 w * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z +
+                 z * OUTPUT_SIZE_X * OUTPUT_SIZE_Y +
+                 y * OUTPUT_SIZE_X +
+                 x;
 
-    if (out_idx >= COMPUTATIONAL_OPERATIONS_NUMBER) return;
+    return index;
+}
+
+KERNEL(reduce_ref)(
+    const __global INPUT0_TYPE* data,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
+{
+    const uint xy   = (uint)get_global_id(0);
+    const uint wz   = (uint)get_global_id(1);
+    const uint bf   = (uint)get_global_id(2);
+    const uint x    = xy % OUTPUT_SIZE_X;
+    const uint y    = xy / OUTPUT_SIZE_X;
+
+    const uint b    = bf / OUTPUT_FEATURE_NUM;
+    const uint f    = bf % OUTPUT_FEATURE_NUM;
+#if INPUT0_DIMS == 4
+    const uint w    = 0;
+    const uint z    = 0;
+    const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+#elif INPUT0_DIMS == 5
+    const uint z    = wz % OUTPUT_SIZE_Z;
+    const uint w    = 0;
+    const uint out_idx = OUTPUT_GET_INDEX(b, f, z, y, x);
+#elif INPUT0_DIMS == 6
+    const uint z    = wz % OUTPUT_SIZE_Z;
+    const uint w    = wz / OUTPUT_SIZE_Z;
+    const uint out_idx = OUTPUT_GET_INDEX(b, f, w, z, y, x);
+#endif
+
+    const uint linear_idx = FUNC_CALL(calc_linear_offset)(b, f, w, z, y, x);
+    if (linear_idx >= COMPUTATIONAL_OPERATIONS_NUMBER)
+        return;
 
 #ifdef REDUCE_BATCH
     const uint batch_out = 0;
     const uint batch_max_val = INPUT0_BATCH_NUM;
 #else
-    const uint batch_out = BATCH_NUM_IDX_COMP(out_idx);
+    const uint batch_out = BATCH_NUM_IDX_COMP(linear_idx);
     const uint batch_max_val = batch_out + 1;
 #endif
 
@@ -31,16 +72,16 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
     const uint feature_out = 0;
     const uint feature_max_val = INPUT0_FEATURE_NUM;
 #else
-    const uint feature_out = FEATURE_NUM_IDX_COMP(out_idx);
+    const uint feature_out = FEATURE_NUM_IDX_COMP(linear_idx);
     const uint feature_max_val = feature_out + 1;
 #endif
 
-#if INPUT0_LAYOUT_BFWZYX
+#if INPUT0_DIMS == 6
 #ifdef REDUCE_W
     const uint w_out = 0;
     const uint w_max_val = INPUT0_SIZE_W;
 #else
-    const uint w_out = SIZE_W_IDX_COMP(out_idx);
+    const uint w_out = SIZE_W_IDX_COMP(linear_idx);
     const uint w_max_val = w_out + 1;
 #endif
 #else
@@ -48,12 +89,12 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
     const uint w_max_val = 1;
 #endif
 
-#if INPUT0_LAYOUT_BFWZYX || INPUT0_LAYOUT_BFZYX
+#if INPUT0_DIMS == 6 || INPUT0_DIMS == 5
 #ifdef REDUCE_Z
     const uint z_out = 0;
     const uint z_max_val = INPUT0_SIZE_Z;
 #else
-    const uint z_out = SIZE_Z_IDX_COMP(out_idx);
+    const uint z_out = SIZE_Z_IDX_COMP(linear_idx);
     const uint z_max_val = z_out + 1;
 #endif
 #else
@@ -65,7 +106,7 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
     const uint y_out = 0;
     const uint y_max_val = INPUT0_SIZE_Y;
 #else
-    const uint y_out = SIZE_Y_IDX_COMP(out_idx);
+    const uint y_out = SIZE_Y_IDX_COMP(linear_idx);
     const uint y_max_val = y_out + 1;
 #endif
 
@@ -73,23 +114,24 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
     const uint x_out = 0;
     const uint x_max_val = INPUT0_SIZE_X;
 #else
-    const uint x_out = SIZE_X_IDX_COMP(out_idx);
+    const uint x_out = SIZE_X_IDX_COMP(linear_idx);
     const uint x_max_val = x_out + 1;
 #endif
-    OUTPUT_TYPE acc = OUTPUT_VAL_ZERO;
+    ACCUMULATOR_TYPE acc = ACCUMULATOR_VAL_ZERO;
     uint counter = 0;
-    for (uint b = batch_out; b < batch_max_val; ++b) {
-        for (uint f = feature_out; f < feature_max_val; ++f) {
-            for (uint w = w_out; w < w_max_val; ++w) {
-                for (uint z = z_out; z < z_max_val; ++z) {
-                    for (uint y = y_out; y < y_max_val; ++y) {
-                        for (uint x = x_out; x < x_max_val; ++x) {
-#ifdef INPUT0_LAYOUT_BFWZYX
-                            const uint input_idx = GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x);
-#elif INPUT0_LAYOUT_BFZYX
-                            const uint input_idx = GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x);
+    for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+        for (uint fi = feature_out; fi < feature_max_val; ++fi) {
+            for (uint wi = w_out; wi < w_max_val; ++wi) {
+                for (uint zi = z_out; zi < z_max_val; ++zi) {
+                    for (uint yi = y_out; yi < y_max_val; ++yi) {
+                        for (uint xi = x_out; xi < x_max_val; ++xi) {
+#if INPUT0_DIMS == 6
+                            const uint input_idx = INPUT0_GET_INDEX(bi, fi, wi, zi, yi, xi);
+#elif INPUT0_DIMS == 5
+                            const uint input_idx = INPUT0_GET_INDEX(bi, fi, zi, yi, xi);
 #else
-                            const uint input_idx = GET_DATA_INDEX(INPUT0, b, f, y, x);
+                            const uint input_idx = INPUT0_GET_INDEX(bi, fi, yi, xi);
+
 #endif
 #ifdef REDUCE_SUM_MODE
                             acc += data[input_idx];
@@ -123,13 +165,21 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
 #elif REDUCE_SUM_SQUARE_MODE
                             acc += data[input_idx] * data[input_idx];
 #elif REDUCE_L1_MODE
+                        #if !INPUT0_IS_FP
+                            acc += TO_ACCUMULATOR_TYPE(fabs(TO_FINAL_ACCUMULATOR_TYPE(data[input_idx])));
+                        #else
                             acc += fabs(data[input_idx]);
+                        #endif
 #elif REDUCE_L2_MODE
                             acc += data[input_idx] * data[input_idx];
 #elif REDUCE_LOG_SUM_MODE
                             acc += data[input_idx];
 #elif REDUCE_LOG_SUM_EXP_MODE
+                        #if !INPUT0_IS_FP
+                            acc += TO_ACCUMULATOR_TYPE(exp(TO_FINAL_ACCUMULATOR_TYPE(data[input_idx])));
+                        #else
                             acc += exp(data[input_idx]);
+                        #endif
 #endif
                             counter++;
                         }
@@ -138,15 +188,25 @@ KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* outpu
             }
         }
     }
+
+    FINAL_ACCUMULATOR_TYPE final_acc = TO_FINAL_ACCUMULATOR_TYPE(acc);
 #if REDUCE_MEAN_MODE
-    if (counter != 0) acc /= counter;
+    if (counter != 0) final_acc /= counter;
 #endif
 #if REDUCE_L2_MODE
-    acc = sqrt(acc);
+    final_acc = sqrt(final_acc);
 #endif
 #if REDUCE_LOG_SUM_MODE || REDUCE_LOG_SUM_EXP_MODE
-    acc = log(acc);
+    final_acc = log(final_acc);
 #endif
 
-    output[out_idx] = acc;
+    OUTPUT_TYPE final_result;
+    ACTIVATION_TYPE reduce_result = TO_ACTIVATION_TYPE(final_acc);
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    final_result = FUSED_OPS_RESULT;
+#else
+    final_result = TO_OUTPUT_TYPE(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+#endif
+    output[out_idx] = final_result;
 }
index 08a81f7..5240434 100644 (file)
@@ -1777,6 +1777,7 @@ std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var) const
     std::replace(input_var.begin(), input_var.end(), '[', '_');
     std::replace(input_var.begin(), input_var.end(), ']', '_');
     std::replace(input_var.begin(), input_var.end(), ' ', '_');
+    std::replace(input_var.begin(), input_var.end(), '.', '_');
     return input_var + "_out";
 }
 
index c25168c..599c32c 100644 (file)
@@ -120,6 +120,12 @@ attach_activation_gpu::attach_activation_gpu() {
         { std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw },
         { std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw },
         { std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw },
+        // bfwzyx
+        {std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::i32, format::bfwzyx), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::i8, format::bfwzyx), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::u8, format::bfwzyx), val_fw},
         // fs_b_yx_fsv32
         {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
     });
index fc00e5f..1dacf79 100644 (file)
@@ -20,6 +20,7 @@
 #include "kernel_selector_helper.h"
 #include "reduce/reduce_kernel_selector.h"
 #include "reduce/reduce_kernel_ref.h"
+#include "reduce/reduce_kernel_b_fs_yx_fsv16.h"
 #include "error_handler.h"
 #include "data_inst.h"
 
@@ -91,12 +92,23 @@ attach_reduce_gpu::attach_reduce_gpu() {
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw);
     implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfwzyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfwzyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfwzyx), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
 }
 
 }  // namespace detail
index 5555d6a..53807eb 100644 (file)
@@ -52,6 +52,7 @@
 #include "cum_sum_inst.h"
 #include "embedding_bag_inst.h"
 #include "extract_image_patches_inst.h"
+#include "reduce_inst.h"
 #include <vector>
 #include <list>
 #include <memory>
@@ -368,6 +369,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             return true;
         };
 
+        auto reduce_supports_fusings = [](reduce_node& node) -> bool {
+            auto keep_dims = node.as<reduce>().get_primitive()->keep_dims;
+
+            if (keep_dims)
+                return true;
+
+            return false;
+        };
+
         auto fuse_activation_f = [&](activation_node& activation_node) {
             auto& input_data = activation_node.get_dependency(0);
             if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
@@ -411,6 +421,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<space_to_batch>();
 
+            should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
+
             if (!should_fuse)
                 return;
 
@@ -464,6 +476,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<space_to_batch>();
 
+            should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
+
             if (!should_fuse)
                 return;
 
@@ -550,6 +564,10 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<eltwise>() && quantize_node.get_scale_shift_opt();
 
+            should_fuse |= input_data.is_type<reduce>() &&
+                           reduce_supports_fusings(input_data.as<reduce>()) &&
+                           quantize_node.get_scale_shift_opt();
+
             if (!should_fuse)
                 return;
 
@@ -580,7 +598,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
                                     (parent1->is_type<space_to_depth>()) ||
                                     (parent1->is_type<gemm>() && gemm_supports_fusings(parent1->as<gemm>())) ||
                                     (parent1->is_type<batch_to_space>()) || (parent1->is_type<space_to_batch>()) ||
-                                    (parent1->is_type<depth_to_space>() && dts_supports_fusings(parent1->as<depth_to_space>()));
+                                    (parent1->is_type<depth_to_space>() && dts_supports_fusings(parent1->as<depth_to_space>())) ||
+                                    (parent1->is_type<reduce>() && reduce_supports_fusings(parent1->as<reduce>()));
 
             bool can_fuse_parent2 = (parent2->is_type<convolution>() && conv_supports_fusings(parent2->as<convolution>())) ||
                                     (parent2->is_type<mvn>() && mvn_supports_fusings(parent2->as<mvn>())) ||
@@ -588,7 +607,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
                                     (parent2->is_type<space_to_depth>()) ||
                                     (parent2->is_type<gemm>() && gemm_supports_fusings(parent2->as<gemm>())) ||
                                     (parent2->is_type<batch_to_space>()) || (parent2->is_type<space_to_batch>()) ||
-                                    (parent2->is_type<depth_to_space>() && dts_supports_fusings(parent2->as<depth_to_space>()));
+                                    (parent2->is_type<depth_to_space>() && dts_supports_fusings(parent2->as<depth_to_space>())) ||
+                                    (parent2->is_type<reduce>() && reduce_supports_fusings(parent2->as<reduce>()));
 
             std::vector<bool> can_fuse_parents = { can_fuse_parent1, can_fuse_parent2 };
 
index e913228..0a0e596 100644 (file)
@@ -60,6 +60,7 @@
 #include "reorder_inst.h"
 #include "split_inst.h"
 #include "mvn_inst.h"
+#include "reduce_inst.h"
 #include "to_string_utils.h"
 #include "gpu/memory_gpu.h"
 
@@ -1166,7 +1167,8 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
                  prim.as<mvn>().input().get_output_layout().data_type != data_types::i8)
              || prim.as<mvn>().get_primitive()->across_channels) &&
             prim.type() != cldnn::arg_max_min::type_id() &&
-            prim.type() != cldnn::mutable_data::type_id())
+            prim.type() != cldnn::mutable_data::type_id() &&
+            prim.type() != cldnn::reduce::type_id())
             can_use_fsv16 = false;
 
         if (prim.type() == cldnn::quantize::type_id() &&
index d650948..082de60 100644 (file)
@@ -34,28 +34,57 @@ layout reduce_inst::calc_output_layout(reduce_node const& node) {
 
     auto input_layout = node.input(0).get_output_layout();
     auto input_format = input_layout.format;
+    auto format_dim = input_format.dimension();
     auto output_type = input_layout.data_type;
     auto mode = desc->mode;
     auto reduce_axes = desc->axes;
-
     auto in_dims = input_layout.size.sizes();
+
     for (size_t a = 0; a < reduce_axes.size(); a++) {
         in_dims[reduce_axes[a]] = 1;
     }
 
+    std::vector<int32_t> updated_dims;
     if (!desc->keep_dims) {
-        for (size_t a = 0; a < reduce_axes.size(); a++) {
-            in_dims.erase(in_dims.begin() + reduce_axes[a]);
-            in_dims.push_back(1);
+        // Get unreduced from b-f and x-w range
+        for (size_t b_f_index = 0; b_f_index < 2; b_f_index++) {
+            bool index_to_remove = std::find(reduce_axes.begin(), reduce_axes.end(), b_f_index) != reduce_axes.end();
+            if (!index_to_remove)
+                updated_dims.push_back(in_dims[b_f_index]);
         }
+        for (size_t x_w_index = format_dim - 1; x_w_index >= 2; x_w_index--) {
+            bool index_to_remove = std::find(reduce_axes.begin(), reduce_axes.end(), x_w_index) != reduce_axes.end();
+            if (!index_to_remove)
+                updated_dims.push_back(in_dims[x_w_index]);
+        }
+
+        if (input_format.dimension() == 4 && reduce_axes.size() == 1)
+            updated_dims.push_back(1);
+        if (updated_dims.size() > 2)
+            std::reverse(updated_dims.begin() + 2, updated_dims.end());
+
+        // Fill updated dims to format_dim size
+        while (updated_dims.size() < format_dim)
+            updated_dims.push_back(1);
+
+        in_dims = std::move(updated_dims);
     }
 
     std::vector<reduce_mode> reduce_bool_modes = {reduce_mode::logical_and, reduce_mode::logical_or};
-    if (std::find(reduce_bool_modes.begin(), reduce_bool_modes.end(), mode) != reduce_bool_modes.end()) output_type = data_types::i8;
+    if (std::find(reduce_bool_modes.begin(), reduce_bool_modes.end(), mode) != reduce_bool_modes.end())
+        output_type = data_types::i8;
+    else if (output_type == data_types::i8 || output_type == data_types::u8)
+        output_type = data_types::f32;
+
+    if (desc->output_data_type)
+        output_type = *desc->output_data_type;
+
+    if (node.has_fused_primitives())
+        output_type = node.get_fused_output_layout().data_type;
 
-    if (input_layout.format == format::bfwzyx)
+    if (format_dim == 6)
         return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3], in_dims[4], in_dims[5]))};
-    else if (input_layout.format == format::bfzyx)
+    else if (format_dim == 5)
         return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3], in_dims[4]))};
     else
         return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3]))};
index 053460e..0a0d949 100644 (file)
@@ -39,6 +39,7 @@
 #include "api/space_to_depth.hpp"
 #include "api/batch_to_space.hpp"
 #include "api/space_to_batch.hpp"
+#include "api/reduce.hpp"
 
 
 #include "test_utils/test_utils.h"
@@ -210,10 +211,10 @@ public:
             VF<uint8_t> rnd_vec = generate_random_1d<uint8_t>(s.count(), min_random, max_random);
             set_values(prim, rnd_vec);
         } else if (l.data_type == data_types::f16) {
-            VF<uint16_t> rnd_vec = generate_random_1d<uint16_t>(s.count(), min_random, max_random);
+            VF<uint16_t> rnd_vec = generate_random_1d<uint16_t>(s.count(), -1, 1);
             set_values(prim, rnd_vec);
         } else {
-            VF<float> rnd_vec = generate_random_1d<float>(s.count(), min_random, max_random);
+            VF<float> rnd_vec = generate_random_1d<float>(s.count(), -1, 1);
             set_values(prim, rnd_vec);
         }
 
@@ -2500,11 +2501,11 @@ TEST_P(gemm_3in_quantize_i8, basic) {
 
 INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_3in_quantize_i8,
     ::testing::ValuesIn(std::vector<gemm_test_params>{
-                        gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 },
                         gemm_test_params{ CASE_GEMM_3IN_FP16_1, 4, 5 },
                         gemm_test_params{ CASE_GEMM_3IN_S8S8_1, 4, 5 },
                         gemm_test_params{ CASE_GEMM_3IN_S8S8_2, 4, 5 },
                         gemm_test_params{ CASE_GEMM_3IN_S8S8_3, 4, 5 },
+                      //gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 },
 }), );
 
 class gemm_2in_quantize_u8 : public GemmFusingTest {};
@@ -2527,11 +2528,11 @@ TEST_P(gemm_2in_quantize_u8, basic) {
 
 INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_quantize_u8,
     ::testing::ValuesIn(std::vector<gemm_test_params>{
-                        gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
                         gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 },
                         gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 },
                         gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 },
                         gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 },
+                      //gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
 }), );
 
 class gemm_2in_scale : public GemmFusingTest {};
@@ -2563,7 +2564,6 @@ TEST_P(gemm_2in_scale, fp16_scale_out) {
     execute(p);
 }
 
-
 INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_scale,
     ::testing::ValuesIn(std::vector<gemm_test_params>{
                         gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
@@ -6252,3 +6252,265 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
                             eltwise_test_params{CASE_ELTWISE_U8_FP16_2},
                             eltwise_test_params{CASE_ELTWISE_U8_FP16_3},
                         }), );
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* ---------------------------------------- Reduce cases ----------------------------------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
// Parameter pack for the reduce fusing tests below. Field order matters:
// the CASE_REDUCE_* macros supply the first six fields via aggregate initialization.
struct reduce_test_params {
    cldnn::tensor in_shape;                // input tensor dimensions
    cldnn::tensor out_shape;               // output dims; reduced axes are shrunk to 1 by update_out_shape()
    cldnn::data_types data_type;           // input element type
    cldnn::format input_format;            // input layout (b_fs_yx_fsv16 exercises the optimized kernel)
    data_types default_type;               // element type for fused primitives' auxiliary data
    cldnn::format default_format;          // layout for auxiliary data and the final reorder
    size_t expected_fused_primitives;      // expected primitive count when fusing succeeds
    size_t expected_not_fused_primitives;  // expected primitive count without fusing
    cldnn::reduce_mode reduce_mode;        // reduction operation (sum, mean, max, ...)
    std::vector<uint16_t> reduce_axes;     // axes to reduce (reduce::along_* constants)
    bool keep_dims;                        // whether reduced axes are kept as size-1 dims
    std::string kernel_name;               // kernel expected to be selected for the reduce primitive
};
+
// Each CASE_REDUCE_* macro expands to the first six fields of reduce_test_params:
// in_shape, out_shape, data_type, input_format, default_type, default_format.
// out_shape starts equal to in_shape; tests shrink the reduced axes via update_out_shape().
#define CASE_REDUCE_F32_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_1 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_3 {16, 16, 16, 8, 8, 8}, {16, 16, 16, 8, 8, 8}, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx

#define CASE_REDUCE_F16_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx

// NOTE(review): CASE_REDUCE_I32_4 duplicates CASE_REDUCE_I32_3; by analogy with the
// other *_4 macros it was presumably meant to be a b_fs_yx_fsv16 case - confirm before use.
#define CASE_REDUCE_I32_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_4 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx

#define CASE_REDUCE_I8_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx

#define CASE_REDUCE_U8_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+
+class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
+public:
+    void execute(reduce_test_params& p) {
+        auto input_prim = get_mem(get_input_layout(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
+
+        network_fused.set_input_data("input", input_prim);
+        network_not_fused.set_input_data("input", input_prim);
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    void update_out_shape(reduce_test_params& p) {
+       for (auto& axis : p.reduce_axes) {
+            switch (axis) {
+                case 0:  // batch
+                    p.out_shape.batch[0] = 1;
+                    break;
+                case 1:  // feature
+                    p.out_shape.feature[0] = 1;
+                    break;
+                case 2:  // x
+                    p.out_shape.spatial[0] = 1;
+                    break;
+                case 3:  // y
+                    p.out_shape.spatial[1] = 1;
+                    break;
+                case 4:  // z
+                    p.out_shape.spatial[2] = 1;
+                    break;
+                case 5:  // w
+                    p.out_shape.spatial[3] = 1;
+                    break;
+            }
+        }
+    }
+
+    layout get_input_layout(reduce_test_params& p) { return layout{p.data_type, p.input_format, p.in_shape}; }
+    layout get_per_channel_layout(reduce_test_params& p) {
+        return layout{p.default_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1}};
+    }
+};
+
class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
// Fuses eltwise(sum) + relu + quantize after a reduce primitive; quantize
// thresholds are single values. The fused network must match the unfused
// reference within `tolerance`.
TEST_P(reduce_eltwise_activation_quantize, basic) {
    auto p = GetParam();
    // Shrink reduced axes of out_shape so eltwise_data matches the reduce output.
    update_out_shape(p);
    create_topologies(input_layout("input", get_input_layout(p)),
                      data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
                      data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
                      data("out_lo", get_mem(get_single_element_layout(p), -128)),
                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
                      data("eltwise_data", get_mem(get_output_layout(p))),
                      reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
                      eltwise("eltwise", {"reduce", "eltwise_data"}, eltwise_mode::sum, p.default_type),
                      activation("activation", "eltwise", activation_func::relu),
                      quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
                      reorder("output_reorder", "quantize", p.default_format, data_types::f32));

    tolerance = 1.f;
    execute(p);
}
+
// Same pipeline as `basic`, but the quantize input thresholds (in_lo/in_hi)
// are per-feature-channel rather than single values.
// NOTE(review): in_lo/in_hi are sized by the *input* feature count even for
// cases that reduce along_f - relies on quantize broadcasting; confirm.
TEST_P(reduce_eltwise_activation_quantize, per_channel) {
    auto p = GetParam();
    // Shrink reduced axes of out_shape so eltwise_data matches the reduce output.
    update_out_shape(p);
    create_topologies(input_layout("input", get_input_layout(p)),
                      data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
                      data("out_lo", get_mem(get_single_element_layout(p), -128)),
                      data("out_hi", get_mem(get_single_element_layout(p), 127)),
                      data("eltwise_data", get_mem(get_output_layout(p))),
                      reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
                      eltwise("eltwise", {"reduce", "eltwise_data"}, eltwise_mode::sum, p.default_type),
                      activation("activation", "eltwise", activation_func::relu),
                      quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
                      reorder("output_reorder", "quantize", p.default_format, data_types::f32));

    tolerance = 1.f;
    execute(p);
}
+
// Each entry: CASE macro (shapes/layouts), expected fused primitive count,
// expected not-fused count, reduce mode, axes, keep_dims and the kernel the
// selector is expected to pick (fsv16 layouts -> reduce_gpu_b_fs_yx_fsv16,
// planar layouts -> reduce_ref).
INSTANTIATE_TEST_CASE_P(fusings_gpu,
                        reduce_eltwise_activation_quantize,
                        ::testing::ValuesIn(std::vector<reduce_test_params>{
                            // f32
                            reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},

                            // f16
                            reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},

                            // i8
                            reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},

                            // u8
                            reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
                            reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"}
                        }), );
+
class reduce_scale_activation : public ReduceFusingTest {};
// Fuses scale + cos activation after reduce; the scale factor is a single value.
// out_shape is not updated here: scale_data uses the single-element layout,
// which is independent of the reduce output shape.
TEST_P(reduce_scale_activation, basic) {
    auto p = GetParam();
    create_topologies(input_layout("input", get_input_layout(p)),
                      data("scale_data", get_mem(get_single_element_layout(p), -0.125f)),
                      reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
                      scale("scale", "reduce", "scale_data"),
                      activation("activation", "scale", activation_func::cos),
                      reorder("output_reorder", "activation", p.default_format, data_types::f32));
    tolerance = 1e-02f;
    execute(p);
}
+
// Same pipeline as `basic`, but the scale factor is per-feature-channel.
// The instantiated cases for this class never reduce along_f, so the
// per-channel data (sized by the input feature count) stays compatible.
TEST_P(reduce_scale_activation, per_channel) {
    auto p = GetParam();
    create_topologies(input_layout("input", get_input_layout(p)),
                      data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)),
                      reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
                      scale("scale", "reduce", "scale_data"),
                      activation("activation", "scale", activation_func::cos),
                      reorder("output_reorder", "activation", p.default_format, data_types::f32));
    tolerance = 1e-02f;
    execute(p);
}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                        reduce_scale_activation,
+                        ::testing::ValuesIn(std::vector<reduce_test_params>{
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, {reduce::along_x, reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F32_2, 2, 4, reduce_mode::mean, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+
+                            reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F16_1, 2, 4, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+                            reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, {reduce::along_x, reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F16_2, 2, 4, reduce_mode::mean, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
+                            reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                            reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::sum, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+                        }), );
+
// Disabled set: per the in-list note, these formats/types have no layout
// support for the fused quantize/scale primitives yet; kept for future enabling.
INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
                        reduce_eltwise_activation_quantize,
                        ::testing::ValuesIn(std::vector<reduce_test_params>{
                            // No layout format available for quantize/scale
                            reduce_test_params{CASE_REDUCE_F32_3, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_F16_3, 2, 4, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I32_2, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I32_3, 2, 4, reduce_mode::sum, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_I8_3, 2, 4, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},
                            reduce_test_params{CASE_REDUCE_U8_3, 2, 4, reduce_mode::l2, {reduce::along_x}, true, "reduce_ref"}
                        }), );
index 0b65ef1..07d15be 100644 (file)
@@ -1,5 +1,5 @@
-/*
-// Copyright (c) 2019 Intel Corporation
+/*
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
 // limitations under the License.
 */
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <gtest/gtest.h>
 #include <api/input_layout.hpp>
 #include "api/reduce.hpp"
 #include "test_utils/test_utils.h"
 #include <api/data.hpp>
 #include "test_utils/float16.h"
+#include <cmath>
+#include <algorithm>
 
 using namespace cldnn;
 using namespace tests;
 
// Accumulation type used by the reference reduce implementation below:
// always float, regardless of the input element type.
template <typename InputT>
struct accumulator_type {
    using type = float;
};
+
// Output element type produced by the reference reduce implementation:
// always float in these tests.
template <typename InputT>
struct output_type {
    using type = float;
};
+
// Smallest representable value of AccT; used as the initial accumulator for
// max-reduction. std::numeric_limits<T>::lowest() is correct for both
// floating-point and integral types (for integers it equals min()), so the
// former explicit int specialization was redundant and has been removed.
template <typename AccT>
AccT get_min_value() {
    return std::numeric_limits<AccT>::lowest();
}
+
// Strict weak ordering of integer keys by their position in `order`.
// Keys not present in `order` rank after every listed key and compare
// equal to each other (both map to order.size()).
template <typename VecT>
struct Comparator {
    std::vector<VecT> order;
    bool operator()(const int lhs, const int rhs) const {
        const auto rank_of = [this](const int key) {
            const auto pos = std::find(order.begin(), order.end(), key);
            return std::distance(order.begin(), pos);
        };
        return rank_of(lhs) < rank_of(rhs);
    }
};
+
+template <typename InputT, typename AccT>
+struct reduce_accumulator {
+    AccT set_accumulator_value(cldnn::reduce_mode reduce_mode) {
+        AccT acc;
+        if (reduce_mode == cldnn::reduce_mode::max)
+            acc = get_min_value<AccT>();
+        else if (reduce_mode == cldnn::reduce_mode::min)
+            acc = std::numeric_limits<AccT>::max();
+        else if (reduce_mode == cldnn::reduce_mode::prod || reduce_mode == cldnn::reduce_mode::logical_and)
+            acc = 1;
+        else
+            acc = 0;
+
+        return acc;
+    };
+
+    AccT accumulate(AccT& acc, AccT& input_val, cldnn::reduce_mode reduce_mode) {
+        if (reduce_mode == cldnn::reduce_mode::sum || reduce_mode ==  cldnn::reduce_mode::mean ||
+            reduce_mode == cldnn::reduce_mode::log_sum)
+            acc += input_val;
+        else if (reduce_mode == cldnn::reduce_mode::max)
+            acc = input_val > acc ? input_val : acc;
+        else if (reduce_mode == cldnn::reduce_mode::min)
+            acc = input_val < acc ? input_val : acc;
+        else if (reduce_mode == cldnn::reduce_mode::prod)
+            acc = acc * input_val;
+        else if (reduce_mode == cldnn::reduce_mode::logical_and)
+            acc = acc && input_val;
+        else if (reduce_mode == cldnn::reduce_mode::logical_or)
+            acc = acc || input_val;
+        else if (reduce_mode == cldnn::reduce_mode::sum_square)
+            acc += input_val * input_val;
+        else if (reduce_mode == cldnn::reduce_mode::l1)
+            acc += abs(input_val);
+        else if (reduce_mode == cldnn::reduce_mode::l2)
+            acc += input_val * input_val;
+        else if (reduce_mode == cldnn::reduce_mode::log_sum_exp)
+            acc += exp(input_val);
+
+        return acc;
+    };
+
+    AccT get(AccT& acc, size_t counter, cldnn::reduce_mode reduce_mode) {
+        if (reduce_mode == cldnn::reduce_mode::mean)
+            acc /= counter;
+        else if (reduce_mode == cldnn::reduce_mode::l2)
+            acc = sqrt(acc);
+        else if (reduce_mode == cldnn::reduce_mode::log_sum || reduce_mode == cldnn::reduce_mode::log_sum_exp)
+            acc = log(acc);
+
+        return acc;
+    };
+
+    std::map<char, int, Comparator<char>> create_coords_map(std::vector<char>& coords) {
+        auto coord_cmp = Comparator<char>();
+        coord_cmp.order = coords;
+        std::map<char, int, Comparator<char>> coord_map({}, coord_cmp);
+        int index = 0;
+        for (auto& coord : coord_cmp.order) {
+            coord_map[coord] = index;
+            index++;
+        }
+
+        return coord_map;
+    }
+
+    void remap(std::vector<size_t> &out_dims,
+                    std::map<char, int, Comparator<char>>& remap_coords,
+                    std::vector<uint16_t>& axis_to_remove,
+                    std::map<uint16_t, int, Comparator<uint16_t>>& axes_map,
+                    std::vector<char>& coords,
+                    int dims) {
+
+        if (dims == 5) {
+            remap_coords.erase('w');
+        } else if (dims == 4) {
+            remap_coords.erase('w');
+            remap_coords.erase('z');
+        }
+
+        // Dimensions reshape
+        std::vector<size_t> updated_dims;
+        std::vector<char> updated_coords;
+
+        for (int index = 0; index < static_cast<int>(out_dims.size()); index++) {
+            if ((dims == 4 && (index == 2 || index == 3)) || (dims == 5 && index == 2))
+                continue;
+
+            auto index_to_remove = std::find(axis_to_remove.begin(), axis_to_remove.end(), axes_map.find(index)->second) !=
+                     axis_to_remove.end();
+            if ((out_dims[index] != 1) || (out_dims[index] == 1 && !index_to_remove)) {
+                updated_dims.push_back(out_dims[index]);
+                updated_coords.push_back(coords[index]);
+            }
+        }
+
+           if (updated_dims.size() > 2) {
+            if (dims == 4) {
+                updated_dims.insert(updated_dims.begin() + 2, 2, 1);
+            } else {
+                updated_dims.insert(updated_dims.begin() + 2, out_dims.size() - updated_dims.size(), 1);
+            }
+        }
+
+        while (updated_dims.size() < out_dims.size())
+            updated_dims.push_back(1);
+
+        out_dims = std::move(updated_dims);
+
+        // Coordinates remap
+        std::map<uint16_t, int, std::greater<uint16_t>> ordered_axes;
+
+        for (auto& axis : axis_to_remove)
+            ordered_axes[axes_map.find(axis)->second] = axis;
+
+        int i = 0;
+        for (auto& coord : coords) {
+            if ((dims == 4 && (coord == 'w' || coord == 'z')) || (dims == 5 && coord == 'w'))
+                continue;
+
+            if (ordered_axes.find(remap_coords[coord]) != ordered_axes.end()) {
+                if (dims != 4)
+                    updated_coords.insert(updated_coords.begin() + 2 + i, 1, coord);
+                else
+                    updated_coords.push_back(coord);
+                ++i;
+            }
+        }
+
+        int j = 0;
+        for (auto& coord : coords) {
+            if ((dims == 4 && (coord == 'w' || coord == 'z')) || (dims == 5 && coord == 'w'))
+                continue;
+
+            auto temp_coords = updated_coords.at(j);
+            remap_coords[coord] = static_cast<int>(std::distance(coords.begin(), std::find(coords.begin(), coords.end(), temp_coords)));
+            ++j;
+        }
+
+        if (dims == 4) {
+            remap_coords['w'] = 2;
+            remap_coords['z'] = 3;
+        } else if (dims == 5) {
+            remap_coords['w'] = 2;
+        }
+
+    }
+};
+
// Reference (host-side) implementation of the reduce primitive, used to validate
// the GPU reduce kernels in the tests below.
//
// input       - 6D data indexed as input[b][f][x][y][z][w]
// reduce_mode - reduction operation (sum, mean, min, max, l1, l2, log_sum, ...)
// reduce_axis - axes to reduce, given as cldnn::reduce::along_* constants
// batch, input_f, input_w, input_z, input_y, input_x
//             - logical dimension sizes; NOTE(review): sizes are actually taken
//               from `input` itself, these parameters are unused here — presumably
//               kept for call-site symmetry
// dims        - rank of the tensor format under test (4, 5 or 6)
// keepDims    - when true, reduced dimensions are kept with size 1; when false
//               they are squeezed out and surviving dimensions are remapped
//               (see reduce_accumulator::remap)
//
// Returns the reduced tensor indexed as result[b][f][w][z][y][x], cast to OutputT.
template <typename InputT, typename AccT = typename accumulator_type<InputT>::type, typename OutputT = typename output_type<InputT>::type>
VVVVVVF<OutputT> reference_reduce(VVVVVVF<InputT>& input,
                                  reduce_mode reduce_mode,
                                  std::vector<uint16_t> reduce_axis,
                                  const int batch,
                                  const int input_f,
                                  const int input_w,
                                  const int input_z,
                                  const int input_y,
                                  const int input_x,
                                  const int dims,
                                  bool keepDims = false) {

    // Helper providing the per-mode init/accumulate/finalize operations.
    auto reduce = reduce_accumulator<InputT, AccT>();

    // Axes are always processed against the fixed dimension order b, f, w, z, y, x;
    // axes_map translates an along_* constant into its index in that order.
    auto axis_cmp = Comparator<uint16_t>();
    axis_cmp.order = {reduce::along_b, reduce::along_f, reduce::along_w, reduce::along_z, reduce::along_y, reduce::along_x};
    std::map<uint16_t, int, Comparator<uint16_t>> axes_map({}, axis_cmp);

    int index = 0;
    for (auto& axis : axis_cmp.order) {
        axes_map[axis] = index;
        index++;
    }

    // Initial input order is b, f, x, y, z, w; collect sizes in b, f, w, z, y, x order.
    std::vector<size_t> input_dims = {
        input.size(),                   // b
        input[0].size(),                // f
        input[0][0][0][0][0].size(),    // w
        input[0][0][0][0].size(),       // z
        input[0][0][0].size(),          // y
        input[0][0].size(),             // x
    };

    // Working buffer in b, f, w, z, y, x order, held at accumulator precision.
    VVVVVVF<AccT> previous(input_dims[0],
                           VVVVVF<AccT>(input_dims[1],
                           VVVVF<AccT>(input_dims[2],
                           VVVF<AccT>(input_dims[3],
                           VVF<AccT>(input_dims[4],
                           VF<AccT>(input_dims[5], 0))))));

    // Transpose input[b][f][x][y][z][w] -> previous[b][f][w][z][y][x].
    for (size_t bi = 0; bi < input_dims[0]; ++bi)
        for (size_t fi = 0; fi < input_dims[1]; ++fi)
            for (size_t wi = 0; wi < input_dims[2]; ++wi)
                for (size_t zi = 0; zi < input_dims[3]; ++zi)
                    for (size_t yi = 0; yi < input_dims[4]; ++yi)
                        for (size_t xi = 0; xi < input_dims[5]; ++xi)
                            previous[bi][fi][wi][zi][yi][xi] = static_cast<AccT>(input[bi][fi][xi][yi][zi][wi]);

    std::vector<size_t> temp_dims = input_dims;
    // Running product of the reduced dimension sizes; used as the element count
    // for mean and passed to get() for the other finalized modes.
    size_t max_counter_value = 1;

    // Reduce one axis at a time; the output of each pass feeds the next one.
    // The reduced dimension is kept with size 1 until the optional squeeze below.
    for (auto& axis : reduce_axis) {
        auto out_dims = temp_dims;
        out_dims[axes_map.at(axis)] = 1;
        // Output buffer seeded with the mode's neutral/initial accumulator value.
        VVVVVVF<AccT> temp_output(out_dims[0],
                                  VVVVVF<AccT>(out_dims[1],
                                  VVVVF<AccT>(out_dims[2],
                                  VVVF<AccT>(out_dims[3],
                                  VVF<AccT>(out_dims[4],
                                  VF<AccT>(out_dims[5], reduce.set_accumulator_value(reduce_mode)))))));

        max_counter_value *= input_dims[axes_map.at(axis)];

        // Accumulate: the `% out_dims[...]` collapses the reduced axis (size 1)
        // to index 0 while leaving every other axis index untouched.
        for (size_t bi = 0; bi < temp_dims[0]; ++bi)
            for (size_t fi = 0; fi < temp_dims[1]; ++fi)
                for (size_t wi = 0; wi < temp_dims[2]; ++wi)
                    for (size_t zi = 0; zi < temp_dims[3]; ++zi)
                        for (size_t yi = 0; yi < temp_dims[4]; ++yi) {
                            for (size_t xi = 0; xi < temp_dims[5]; ++xi) {
                                auto input_val = static_cast<AccT>(previous[bi][fi][wi][zi][yi][xi]);

                                AccT acc = static_cast<AccT>(temp_output[bi % out_dims[0]][fi % out_dims[1]]
                                                                        [wi % out_dims[2]][zi % out_dims[3]]
                                                                        [yi % out_dims[4]][xi % out_dims[5]]);

                                temp_output[bi % out_dims[0]][fi % out_dims[1]]
                                           [wi % out_dims[2]][zi % out_dims[3]]
                                           [yi % out_dims[4]][xi % out_dims[5]] = reduce.accumulate(acc, input_val, reduce_mode);
                            }
                        }
        // Finalization (e.g. divide for mean, sqrt for l2, log for log_sum):
        // for mean it runs only after the LAST axis, using the full counter;
        // for l2/log_sum/log_sum_exp it runs after EVERY axis.
        // NOTE(review): per-axis finalization makes multi-axis log_sum a sum of
        // logs rather than a log of sums — assumed intentional to match the
        // kernel; the test cases below use log_sum with a single axis only.
        if (&axis == &reduce_axis.back() || reduce_mode != cldnn::reduce_mode::mean)
            if (reduce_mode == cldnn::reduce_mode::mean || reduce_mode == cldnn::reduce_mode::l2 ||
                reduce_mode == cldnn::reduce_mode::log_sum || reduce_mode == cldnn::reduce_mode::log_sum_exp) {
                for (size_t bi = 0; bi < temp_output.size(); ++bi)
                    for (size_t fi = 0; fi < temp_output[0].size(); ++fi)
                        for (size_t wi = 0; wi < temp_output[0][0].size(); ++wi)
                            for (size_t zi = 0; zi < temp_output[0][0][0].size(); ++zi)
                                for (size_t yi = 0; yi < temp_output[0][0][0][0].size(); ++yi) {
                                    for (size_t xi = 0; xi < temp_output[0][0][0][0][0].size(); ++xi) {
                                        auto current_acc_val = static_cast<AccT>(temp_output[bi % out_dims[0]][fi % out_dims[1]][wi % out_dims[2]]
                                                                                            [zi % out_dims[3]][yi % out_dims[4]][xi % out_dims[5]]);
                                            temp_output[bi % out_dims[0]][fi % out_dims[1]][wi % out_dims[2]]
                                                       [zi % out_dims[3]][yi % out_dims[4]][xi % out_dims[5]] = reduce.get(current_acc_val, max_counter_value, reduce_mode);
                                    }
                                }
            }

        previous = std::move(temp_output);
        temp_dims = {previous.size(),                 // b
                     previous[0].size(),              // f
                     previous[0][0].size(),           // w
                     previous[0][0][0].size(),        // z
                     previous[0][0][0][0].size(),     // y
                     previous[0][0][0][0][0].size(),  // x
        };
    }

    VVVVVVF<AccT> output;

    if (keepDims) {
        output = std::move(previous);
    } else {
        // Squeeze the reduced (size-1) dimensions: remap() computes where each
        // surviving coordinate moves to, then the data is copied accordingly.
        std::vector<size_t> actual_dims = temp_dims;
        std::vector<char> coords = {'b', 'f', 'w', 'z', 'y', 'x'};
        std::map<char, int, Comparator<char>> remap_coords = reduce.create_coords_map(coords);
        reduce.remap(actual_dims, remap_coords, reduce_axis, axes_map, coords, dims);

        VVVVVVF<AccT>actual_output(actual_dims[0],
                                   VVVVVF<AccT>(actual_dims[1],
                                   VVVVF<AccT>(actual_dims[2],
                                   VVVF<AccT>(actual_dims[3],
                                   VVF<AccT>(actual_dims[4],
                                   VF<AccT>(actual_dims[5], 0))))));

        for (size_t bi = 0; bi < previous.size(); ++bi)
            for (size_t fi = 0; fi < previous[0].size(); ++fi)
                for (size_t wi = 0; wi < previous[0][0].size(); ++wi)
                    for (size_t zi = 0; zi < previous[0][0][0].size(); ++zi)
                        for (size_t yi = 0; yi < previous[0][0][0][0].size(); ++yi)
                            for (size_t xi = 0; xi < previous[0][0][0][0][0].size(); ++xi) {
                                std::vector<size_t> coords = {bi, fi, wi, zi, yi, xi};
                                actual_output[coords.at(remap_coords['b'])][coords.at(remap_coords['f'])]
                                             [coords.at(remap_coords['w'])][coords.at(remap_coords['z'])]
                                             [coords.at(remap_coords['y'])][coords.at(remap_coords['x'])] = previous[bi][fi][wi][zi][yi][xi];
                            }

        output = std::move(actual_output);
    }

    // Cast the finished accumulator-precision result to the requested output type.
    VVVVVVF<OutputT> final_output(output.size(),
                                  VVVVVF<OutputT>(output[0].size(),
                                  VVVVF<OutputT>(output[0][0].size(),
                                  VVVF<OutputT>(output[0][0][0].size(),
                                  VVF<OutputT>(output[0][0][0][0].size(),
                                  VF<OutputT>(output[0][0][0][0][0].size(), 0))))));

    for (size_t bi = 0; bi < output.size(); ++bi)
        for (size_t fi = 0; fi < output[0].size(); ++fi)
            for (size_t wi = 0; wi < output[0][0].size(); ++wi)
                for (size_t zi = 0; zi < output[0][0][0].size(); ++zi)
                    for (size_t yi = 0; yi < output[0][0][0][0].size(); ++yi)
                        for (size_t xi = 0; xi < output[0][0][0][0][0].size(); ++xi)
                            final_output[bi][fi][wi][zi][yi][xi] = static_cast<OutputT>(output[bi][fi][wi][zi][yi][xi]);

    return final_output;
}
+
// Parameter pack for the value-parameterized reduce tests below.
using TestParamType_general_reduce_gpu = ::testing::tuple<int, int, int,          // 0, 1, 2  -  b, f, w input sizes
                                                          int, int, int,          // 3, 4, 5  -  z, y, x input sizes
                                                          format,                 // 6  -  input format (also forced impl format)
                                                          reduce_mode,            // 7  -  reduce mode
                                                          std::vector<uint16_t>,  // 8  -  reduce axes (reduce::along_*)
                                                          std::string,            // 9  -  kernel name to force
                                                          bool,                   // 10 -  keepDims
                                                          data_types,             // 11 -  input_dt
                                                          bool,                   // 12 -  force_output_dt (override primitive output type)
                                                          data_types>;            // 13 -  output_dt (used when force_output_dt is true)
+
+ struct general_reduce_gpu : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
+    static std::string PrintToStringParamName(testing::TestParamInfo<TestParamType_general_reduce_gpu> param_info) {
+        const std::vector<uint16_t> reduce_axes = testing::get<8>(param_info.param);
+        std::string string_axes;
+        for (auto& axis : reduce_axes) string_axes += std::to_string(axis) + "_";
+
+        // Readable name
+        return "in_b_" + std::to_string(testing::get<0>(param_info.param)) +
+               "_f_" + std::to_string(testing::get<1>(param_info.param)) +
+               "_w_" + std::to_string(testing::get<2>(param_info.param)) +
+               "_z_" + std::to_string(testing::get<3>(param_info.param)) +
+               "_y_" + std::to_string(testing::get<4>(param_info.param)) +
+               "_x_" + std::to_string(testing::get<5>(param_info.param)) +
+               "_format_" + std::to_string(testing::get<6>(param_info.param)) +
+               "_reduce_mode_" + std::to_string(static_cast<std::underlying_type<cldnn::reduce_mode>::type>(testing::get<7>(param_info.param))) +
+               "_axes_" + string_axes +
+               "_kernel_name_" + testing::get<9>(param_info.param) +
+               "_keepDims_" + std::to_string(testing::get<10>(param_info.param));
+    }
+};
+
// Maps a cldnn::data_types enum value to the host C++ type used to generate
// and store input elements in these tests. Default covers the float cases.
template <data_types InputT>
struct input_data_type {
    using type = float;
};

// i8 inputs are generated/stored as int8_t on the host.
template <>
struct input_data_type <data_types::i8> {
    using type = int8_t;
};

// u8 inputs are generated/stored as uint8_t on the host.
template <>
struct input_data_type <data_types::u8> {
    using type = uint8_t;
};
+
// Maps a cldnn::data_types enum value to the host C++ type used to read back
// and compare output elements. Default covers the float cases.
template <data_types OutputT>
struct output_data_type {
    using type = float;
};

// i8 outputs are read back as int8_t on the host.
template <>
struct output_data_type<data_types::i8> {
    using type = int8_t;
};

// u8 outputs are read back as uint8_t on the host.
template <>
struct output_data_type<data_types::u8> {
    using type = uint8_t;
};
+
// Parameterized end-to-end test for the reduce primitive: builds a network with
// a single forced-implementation reduce node, runs it on random input, and
// compares every output element against reference_reduce().
// InputT/OutputT select the cldnn data types (host types come from the
// input_data_type / output_data_type traits above).
template <data_types InputT, data_types OutputT>
class ReduceTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
protected:
    cldnn::engine engine = get_test_engine();
    int batch_num, input_f, input_w, input_z, input_y, input_x;   // input tensor sizes
    cldnn::format input_format = format::any;                     // format under test / forced impl format
    cldnn::reduce_mode reduce_mode;
    std::vector<uint16_t> reduce_axis;
    std::string kernel_name;       // kernel implementation to force
    bool keep_dims;
    cldnn::data_types input_dt;
    cldnn::data_types output_dt;   // applied only when force_output_dt is set
    bool force_output_dt;

    // Unpack the test-parameter tuple into the members above (indices match
    // the comments on TestParamType_general_reduce_gpu).
    ReduceTestBase() {
        this->batch_num = testing::get<0>(GetParam());
        this->input_f = testing::get<1>(GetParam());
        this->input_w = testing::get<2>(GetParam());
        this->input_z = testing::get<3>(GetParam());
        this->input_y = testing::get<4>(GetParam());
        this->input_x = testing::get<5>(GetParam());
        this->input_format = testing::get<6>(GetParam());
        this->reduce_mode = testing::get<7>(GetParam());
        this->reduce_axis = testing::get<8>(GetParam());
        this->kernel_name = testing::get<9>(GetParam());
        this->keep_dims = testing::get<10>(GetParam());
        this->input_dt = testing::get<11>(GetParam());
        this->force_output_dt = testing::get<12>(GetParam());
        this->output_dt = testing::get<13>(GetParam());
    }

public:
    // Runs the whole scenario: data generation, network build/run, and
    // element-wise comparison against the reference implementation.
    void execute() {
        int input_dim = static_cast<int>(input_format.dimension());
        cldnn::format layout_format = input_format;

        // The input memory itself is allocated in a plain rank-matching layout;
        // the tested format is applied via force_implementations below.
        if (input_dim == 4)
            layout_format = format::bfyx;
        else if (input_dim == 5)
            layout_format = format::bfzyx;
        else
            layout_format = format::bfwzyx;

        using input_t = typename input_data_type<InputT>::type;
        using output_t = typename output_data_type<OutputT>::type;

        // Random input in [1, 10], indexed input_data[b][f][x][y][z][w].
        auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z, input_w));
        auto input_data = generate_random_6d<input_t>(batch_num, input_f, input_x, input_y, input_z, input_w, 1, 10);
        auto input_lay = layout(input_dt, layout_format, input_size);
        auto input_mem = memory::allocate(engine, input_lay);

        // Scatter the generated values into device memory via the layout's
        // linear offsets (scoped so the pointer is released before execution).
        {
            auto input_ptr = input_mem.pointer<input_t>();
            for (int fi = 0; fi < input_f; fi++)
                for (int wi = 0; wi < input_w; wi++)
                    for (int zi = 0; zi < input_z; zi++)
                        for (int yi = 0; yi < input_y; yi++)
                            for (int xi = 0; xi < input_x; xi++) {
                                for (int bi = 0; bi < batch_num; bi++) {
                                    tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
                                    size_t offset = input_lay.get_linear_offset(coords);
                                    input_ptr[offset] = input_data[bi][fi][xi][yi][zi][wi];
                                }
                            }
        }

        // Expected result, indexed reference_result[b][f][w][z][y][x].
        auto reference_result = reference_reduce(input_data, reduce_mode, reduce_axis, batch_num,
                                                 input_f, input_w, input_z, input_y,
                                                 input_x, input_dim, keep_dims);
        topology topology;
        auto red = reduce("reduce", "input", reduce_mode, reduce_axis, keep_dims);
        if (force_output_dt) {
            red.output_data_type = output_dt;
        }
        topology.add(input_layout("input", input_mem.get_layout()));
        topology.add(red);
        build_options options;
        options.set_option(build_option::optimize_data(true));
        // Pin the reduce node to the format/kernel under test so the optimizer
        // cannot substitute another implementation.
        implementation_desc reduce_impl = {input_format, kernel_name};
        options.set_option(build_option::force_implementations({{"reduce", reduce_impl}}));
        network network(engine, topology, options);
        network.set_input_data("input", input_mem);

        network.execute();

        auto out_mem = network.get_output("reduce").get_memory();
        auto out_ptr = out_mem.pointer<output_t>();
        auto out_lay = out_mem.get_layout();

        // Output shape must match the reference in every dimension.
        ASSERT_EQ(out_lay.size.sizes()[0], reference_result.size());                 // b
        ASSERT_EQ(out_lay.size.sizes()[1], reference_result[0].size());              // f
        ASSERT_EQ(out_lay.size.spatial[3], reference_result[0][0].size());           // w
        ASSERT_EQ(out_lay.size.spatial[2], reference_result[0][0][0].size());        // z
        ASSERT_EQ(out_lay.size.spatial[1], reference_result[0][0][0][0].size());     // y
        ASSERT_EQ(out_lay.size.spatial[0], reference_result[0][0][0][0][0].size());  // x

        // Element-wise comparison with absolute tolerance 1e-1 (covers float
        // accumulation differences between the reference and the kernel).
        for (size_t bi = 0; bi < reference_result.size(); bi++)
            for (size_t fi = 0; fi < reference_result[0].size(); fi++)
                for (size_t wi = 0; wi < reference_result[0][0].size(); wi++)
                    for (size_t zi = 0; zi < reference_result[0][0][0].size(); zi++)
                        for (size_t yi = 0; yi < reference_result[0][0][0][0].size(); yi++) {
                            for (size_t xi = 0; xi < reference_result[0][0][0][0][0].size(); xi++) {
                                tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
                                size_t offset = out_lay.get_linear_offset(coords);
                                auto val = out_ptr[offset];
                                auto val_ref = static_cast<output_t>(reference_result[bi][fi][wi][zi][yi][xi]);
                                auto equal = are_equal(val_ref, val, 1e-1f);

                                // Diagnostic dump on mismatch (w/z coordinates
                                // are omitted from the message).
                                if (!equal)
                                    std::cout << "Reference value at batch: " << bi << " output_f: " << fi
                                              << " y: " << yi << " x: " << xi << " = " << val_ref << " Val = " << val
                                              << std::endl;
                                EXPECT_TRUE(equal);
                            }
                        }
    }
};
+
// Concrete fixtures, named <input data type>_<output data type>; each runs the
// shared ReduceTestBase::execute() scenario.
class general_reduce_gpu_i8_i8 : public ReduceTestBase<data_types::i8, data_types::i8> {};
TEST_P(general_reduce_gpu_i8_i8, base) { execute(); }

class general_reduce_gpu_i8_f32 : public ReduceTestBase<data_types::i8, data_types::f32> {};
TEST_P(general_reduce_gpu_i8_f32, base) { execute(); }

class general_reduce_gpu_f32_f32 : public ReduceTestBase<data_types::f32, data_types::f32> {};
TEST_P(general_reduce_gpu_f32_f32, base) { execute(); }
+
+
// i8 -> i8 cases for the b_fs_yx_fsv16 kernel: logical_and / logical_or with the
// output type forced to i8, mixing keepDims on and off.
INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_i8_i8,
                        general_reduce_gpu_i8_i8,
                        ::testing::Values(
                            TestParamType_general_reduce_gpu(2, 12, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
                            TestParamType_general_reduce_gpu(2, 3, 1, 1, 8, 5, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_b, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8),
                            TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 6, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
                            TestParamType_general_reduce_gpu(3, 5, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8),
                            TestParamType_general_reduce_gpu(3, 7, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
                            TestParamType_general_reduce_gpu(1, 3, 1, 1, 6, 12, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8)
                            ),
                            general_reduce_gpu::PrintToStringParamName);
+
// i8 -> f32 cases for the b_fs_yx_fsv16 kernel: arithmetic modes over various
// axis combinations; first group with keepDims=false, second with keepDims=true.
INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_i8_f32,
                        general_reduce_gpu_i8_f32,
                        ::testing::Values(
                            TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 3, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(3, 3, 1, 1, 2, 11, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 12, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(1, 2, 1, 1, 5, 5, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),

                            TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 9, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(2, 5, 1, 1, 3, 3, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 15, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 3, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32)
                            ),
                            general_reduce_gpu::PrintToStringParamName);
+
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_f32_f32,
+                        general_reduce_gpu_f32_f32,
+                        ::testing::Values(
+                            TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 12, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 2, 1, 1, 5, 5, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+
+                            TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 9, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 34, 1, 1, 13, 13, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 15, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32)
+                        ),
+                        general_reduce_gpu::PrintToStringParamName);
+
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_ref_f32_f32,
+                        general_reduce_gpu_f32_f32,
+                        ::testing::Values(
+                            TestParamType_general_reduce_gpu(2, 4, 4, 5, 8, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_f, reduce::along_y, reduce::along_w, reduce::along_z}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 16, 6, 3, 8, 15, format::bfwzyx, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 16, 3, 7, 12, 12, format::bfwzyx, reduce_mode::mean, {reduce::along_b, reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 9, 3, 7, 7, 17, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 6, 3, 7, 3, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(2, 3, 4, 5, 6, 7, format::bfwzyx, reduce_mode::mean, {reduce::along_f, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 5, 5, 4, 4, format::bfwzyx, reduce_mode::mean, {reduce::along_z}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_w}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 5, 5, 8, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 16, 4, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 8, 2, 4, 2, 5, format::bfwzyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 2, 1, 7, 8, 3, format::bfzyx, reduce_mode::log_sum_exp, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 11, 1, 7, 2, 2, format::bfzyx, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(8, 4, 1, 7, 2, 2, format::bfzyx, reduce_mode::l2, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(4, 5, 1, 7, 12, 4, format::bfzyx, reduce_mode::l1, {reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 7, 12, 4, format::bfzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 7, 4, 12, format::bfzyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 4, 1, 7, 12, 12, format::bfzyx, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 11, 1, 1, 7, 17, format::bfyx, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 7, 1, 1, 7, 17, format::bfyx, reduce_mode::sum, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 9, 1, 1, 7, 17, format::bfyx, reduce_mode::l2, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(5, 5, 1, 1, 17, 17, format::bfyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 12, 1, 1, 7, 17, format::bfyx, reduce_mode::log_sum, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+
+                            TestParamType_general_reduce_gpu(7, 3, 6, 6, 12, 12, format::bfwzyx, reduce_mode::log_sum_exp, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 4, 6, 6, 12, 12, format::bfwzyx, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 6, 6, 4, 7, format::bfwzyx, reduce_mode::l2, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 16, 6, 6, 7, 12, format::bfwzyx, reduce_mode::l1, {reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 6, 6, 7, 2, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(8, 16, 6, 6, 3, 7, format::bfwzyx, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(2, 16, 6, 6, 12, 3, format::bfwzyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 16, 6, 6, 8, 12, format::bfwzyx, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 11, 1, 6, 7, 17, format::bfzyx, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 7, 1, 6, 7, 3, format::bfzyx, reduce_mode::sum, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 9, 1, 6, 7, 7, format::bfzyx, reduce_mode::l2, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(5, 5, 1, 6, 8, 3, format::bfzyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 8, 1, 6, 7, 3, format::bfzyx, reduce_mode::log_sum, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 4, format::bfyx, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 4, 1, 1, 12, 12, format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(4, 3, 1, 1, 7, 12, format::bfyx, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(16, 16, 1, 1, 4, 11, format::bfyx, reduce_mode::mean, {reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(2, 16, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_f, reduce::along_y}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 4, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 32, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(17, 4, 1, 1, 12, 15, format::bfyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32)
+                        ), general_reduce_gpu::PrintToStringParamName);
+
+INSTANTIATE_TEST_CASE_P(DISABLED_reduce_gpu_ref_f32_f32,
+                        general_reduce_gpu_f32_f32,
+                        ::testing::Values(
+                            TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3,format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_f, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 7, 7, 6, 4, 3, format::bfwzyx, reduce_mode::l1, {reduce::along_b, reduce::along_x, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 5, 7, 6, 4, 3, format::bfwzyx, reduce_mode::l2, {reduce::along_x, reduce::along_w}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 2, 1, 2, 4, 3, format::bfzyx, reduce_mode::prod, {reduce::along_z, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(2, 7, 1, 1, 4, 3, format::fyxb, reduce_mode::sum, {reduce::along_b, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(12, 2, 1, 1, 4, 3, format::yxfb, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_ref" ,false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 7, 1, 1, 4, 3, format::b_fs_yx_fsv4, reduce_mode::l2, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(9, 7, 1, 1, 4, 3, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(9, 9, 1, 1, 4, 3, format::b_fs_yx_fsv32, reduce_mode::l1, {reduce::along_b, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(11, 10, 1, 1, 4, 3, format::bs_fs_yx_bsv16_fsv16, reduce_mode::max, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(12, 7, 1, 1, 4, 3, format::fs_b_yx_fsv32, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(13, 7, 1, 1, 4, 3, format::bs_fs_zyx_bsv16_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(14, 7, 1, 1, 4, 3, format::b_fs_zyx_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(18, 7, 1, 1, 4, 3,  format::b_fs_zyx_fsv32, reduce_mode::sum, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+
+                            TestParamType_general_reduce_gpu(5, 7, 1, 1, 4, 3, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_w}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 7, 1, 1, 4, 3, format::bfzyx, reduce_mode::max, {reduce::along_z, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 7, 1, 1, 4, 3, format::fyxb, reduce_mode::min, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(11, 7, 1, 1, 4, 3, format::byxf, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(12, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::l1, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(13, 7, 1, 1, 4, 3, format::yxfb, reduce_mode::sum, {reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(14, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(9, 7, 1, 1, 4, 3, format::b_fs_yx_fsv4, reduce_mode::log_sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(8, 7, 1, 1, 4, 3, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(7, 7, 1, 1, 4, 3, format::b_fs_yx_fsv32, reduce_mode::l1, {reduce::along_b, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(6, 7, 1, 1, 4, 3, format::bs_fs_yx_bsv16_fsv16, reduce_mode::max, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 12, 1, 1, 4, 3, format::fs_b_yx_fsv32, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(3, 7, 1, 1, 4, 3, format::bs_fs_zyx_bsv16_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 12, format::b_fs_zyx_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+                            TestParamType_general_reduce_gpu(1, 7, 1, 1, 8, 3, format::b_fs_zyx_fsv32, reduce_mode::sum, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32)
+                        ),
+                        general_reduce_gpu::PrintToStringParamName);
+
 TEST(reduce_gpu, common_bfyx) {
     const auto& engine = get_test_engine();
     auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 1, 1}});