return stdOp->input_value(0).get_shape().size() <= 5lu && stdOp->input_value(0).get_shape().size() == stdOp->get_output_shape(0).size();
}
+ // Reducing along the feature axis performs better with the Reshape->Pooling->Reshape pattern
+ if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceMean>(node)) {
+ auto reduction_axes = redOp->get_reduction_axes().to_vector();
+ bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+ return !reduce_along_f;
+ }
+ if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceMax>(node)) {
+ auto reduction_axes = redOp->get_reduction_axes().to_vector();
+ bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+ return !reduce_along_f;
+ }
+ if (auto redOp = std::dynamic_pointer_cast<const ::ngraph::opset1::ReduceSum>(node)) {
+ auto reduction_axes = redOp->get_reduction_axes().to_vector();
+ bool reduce_along_f = redOp->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0;
+ return !reduce_along_f;
+ }
+
return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
std::dynamic_pointer_cast<const ::ngraph::opset3::ShuffleChannels>(node) ||
std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
static_cast<int32_t>(reduce->keep_dims));
topology.add(reducePrim);
- AddPrimitiveToProfiler(reduceLayerName, layer);
+
+ auto reorderLayerName = reduceLayerName + "_reorder";
+ cldnn::format out_format = cldnn::format::any;
+ auto out_dt = DataTypeFromPrecision(reduce->outData[0]->getTensorDesc().getPrecision());
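+    // With keep_dims == false on inputs of rank > 4, the output rank shrinks, so append a reorder to the plain format that matches the remaining rank.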
+ if (!reduce->keep_dims && reduceDimNumber > 4) {
+ if (reduceDimNumber - rawAxes.size() == 6)
+ out_format = cldnn::format::bfwzyx;
+ else if (reduceDimNumber - rawAxes.size() == 5)
+ out_format = cldnn::format::bfzyx;
+ else if (reduceDimNumber - rawAxes.size() <= 4)
+ out_format = cldnn::format::bfyx;
+
+ auto reorder_prim = cldnn::reorder(reorderLayerName, reduceLayerName, out_format, out_dt);
+ topology.add(reorder_prim);
+ AddPrimitiveToProfiler(reduceLayerName, layer, reorderLayerName);
+ } else {
+ AddPrimitiveToProfiler(reduceLayerName, layer);
+ }
}
void Program::CreateOneHotPrimitive(cldnn::topology& topology, InferenceEngine::CNNLayerPtr &layer) {
} // namespace pass
} // namespace ngraph
+class ConvertReduceBase : public ngraph::pass::MatcherPass {
+public:
+ template <class T>
+ ngraph::matcher_pass_callback convert_reduce_to_pooling();
+};
+
class ngraph::pass::ConvertReduceToPooling: public ngraph::pass::GraphRewrite {
public:
ConvertReduceToPooling() {
}
};
-template <class T>
-ngraph::matcher_pass_callback convert_reduce_to_pooling();
-
-class ngraph::pass::ConvertReduceMeanToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceMeanToPooling: public ConvertReduceBase {
public:
ConvertReduceMeanToPooling() {
auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceMean>(), "ConvertReduceMean");
}
};
-class ngraph::pass::ConvertReduceMaxToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceMaxToPooling: public ConvertReduceBase {
public:
ConvertReduceMaxToPooling() {
auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceMax>(), "ConvertReduceMax");
}
};
-class ngraph::pass::ConvertReduceSumToPooling: public ngraph::pass::MatcherPass {
+class ngraph::pass::ConvertReduceSumToPooling: public ConvertReduceBase {
public:
ConvertReduceSumToPooling() {
auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<opset1::ReduceSum>(), "ConvertReduceSum");
};
template <class T>
-ngraph::matcher_pass_callback convert_reduce_to_pooling() {
- return [](ngraph::pattern::Matcher& m) {
+ngraph::matcher_pass_callback ConvertReduceBase::convert_reduce_to_pooling() {
+ return [&](ngraph::pattern::Matcher& m) {
auto reduce = std::dynamic_pointer_cast<T>(m.get_match_root());
- if (!reduce) {
+
+ if (!reduce || m_transformation_callback(reduce)) {
return false;
}
--- /dev/null
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/reduce_ops.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+ const std::vector<InferenceEngine::Precision> netPrecisions = {
+ InferenceEngine::Precision::FP32,
+ };
+
+ const std::vector<std::vector<size_t>> inputShapes = {
+ std::vector<size_t>{1, 2, 4, 4},
+ std::vector<size_t>{3, 2, 5, 6},
+ };
+
+ const std::vector<std::vector<int>> axes = {
+ {0, 2},
+ {1, 3}
+ };
+
+ std::vector<CommonTestUtils::OpType> opTypes = {
+ CommonTestUtils::OpType::SCALAR,
+ CommonTestUtils::OpType::VECTOR,
+ };
+
+ const std::vector<ngraph::helpers::ReductionType> reductionTypes = {
+ ngraph::helpers::ReductionType::Mean,
+ ngraph::helpers::ReductionType::Min,
+ ngraph::helpers::ReductionType::Max,
+ ngraph::helpers::ReductionType::Sum,
+ ngraph::helpers::ReductionType::Prod,
+ };
+
+ const auto paramsOneAxis = testing::Combine(
+ testing::Values(std::vector<int>{0}),
+ testing::ValuesIn(opTypes),
+ testing::Values(true, false),
+ testing::ValuesIn(reductionTypes),
+ testing::ValuesIn(netPrecisions),
+ testing::ValuesIn(inputShapes),
+ testing::Values(CommonTestUtils::DEVICE_GPU));
+
+ INSTANTIATE_TEST_CASE_P(
+ ReduceOneAxis,
+ ReduceOpsLayerTest,
+ paramsOneAxis,
+ ReduceOpsLayerTest::getTestCaseName);
+
+ const auto params = testing::Combine(
+ testing::ValuesIn(axes),
+ testing::Values(opTypes[1]),
+ testing::Values(true, false),
+ testing::ValuesIn(reductionTypes),
+ testing::ValuesIn(netPrecisions),
+ testing::ValuesIn(inputShapes),
+ testing::Values(CommonTestUtils::DEVICE_GPU));
+
+ INSTANTIATE_TEST_CASE_P(
+ Reduce,
+ ReduceOpsLayerTest,
+ params,
+ ReduceOpsLayerTest::getTestCaseName);
+
+} // namespace
+++ /dev/null
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "reduce_tests.hpp"
-
-INSTANTIATE_TEST_CASE_P(
- smoke_GPU_TestsReduceSum, ReduceTestsShared,
- ::testing::Values(
- // Params: library_name, reduce_type, keep_dims, in_shape, input_tensor, axes_for_reduction, out_shape, reference
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0 },{ 1, 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ -3 },{ 1, 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2 },{ 2, 3, 1 },{ 10, 26, 42, 58, 74, 90 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4, 1, 1 },{},{ 2 },{ 2, 3, 1, 1, 1 },{ 10, 26, 42, 58, 74, 90 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ -1 },{ 2, 3, 1 },{ 10, 26, 42, 58, 74, 90 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, 2 },{ 1, 3, 1 },{ 68, 100, 132 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 1, 2 },{ 2, 1, 1 },{ 78, 222 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2, 1 },{ 2, 1, 1 },{ 78, 222 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, 1, 2 },{},{ 300 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSum", true,{ 2, 3, 4 },{},{ 0, -2, 2 },{},{ 300 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", true,{ 2, 3, 4 },{},{ 2, 2, 0, 2, 0 },{ 1, 3, 1 },{ 68, 100, 132 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0 },{ 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ -3 },{ 3, 4 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2 },{ 2, 3 },{ 10, 26, 42, 58, 74, 90 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ -1 },{ 2, 3 },{ 10, 26, 42, 58, 74, 90 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, 2 },{ 3 },{ 68, 100, 132 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 1, 2 },{ 2 },{ 78, 222 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2, 1 },{ 2 },{ 78, 222 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, 1, 2 },{},{ 300 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 0, -2, 2 },{},{ 300 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 2, 3, 4 },{},{ 2, 2, 0, 2, 0 },{ 3 },{ 68, 100, 132 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", true,{ 1, 2, 3, 4, 1 },{},{ 1 },{ 1, 1, 3, 4, 1 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } },
- reduce_test_params{ "GPU", "I32", "ReduceSum", false,{ 1, 2, 3, 4, 1 },{},{ 1 },{ 1, 3, 4, 1 },{ 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36 } }
-));
-
-INSTANTIATE_TEST_CASE_P(
- smoke_GPU_TestsReduce, ReduceTestsShared,
- ::testing::Values(
- // Params: library_name, reduce_type, keep_dims, in_shape, input_tensor, axes_for_reduction, out_shape, reference
- reduce_test_params{ "GPU", "FP32", "ReduceAnd", true,{ 2, 2, 2 },{1, 0, 1, 1, 0, 1, 1, 0},{ 2 },{ 2, 2, 1 },{ 0, 1, 0, 0} },
- reduce_test_params{ "GPU", "FP32", "ReduceAnd", false, { 2, 2, 2 },{1, 0, 1, 1, 0, 1, 1, 0},{ 0, 1, 2 },{ },{ 0 } },
- reduce_test_params{ "GPU", "FP32", "ReduceL1", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{ } },
- reduce_test_params{ "GPU", "FP32", "ReduceL1", true, { 3, 2, 2 },{},{ 2 },{ 3, 2, 1 },{ 3, 7, 11, 15, 19, 23 } },
- reduce_test_params{ "GPU", "FP32", "ReduceL1", false, { 3, 2, 2 },{},{ 2 },{ 3, 2 },{ 3, 7, 11, 15, 19, 23 } },
- reduce_test_params{ "GPU", "FP32", "ReduceL1", false, { 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 78 } },
- reduce_test_params{ "GPU", "FP32", "ReduceL2", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceL2", true,{ 3, 2, 2 },{},{ 2 },{ 3, 2, 1 },{ 2.23606798f, 5.f, 7.81024968f, 10.63014581f, 13.45362405f, 16.2788206f } },
- reduce_test_params{ "GPU", "FP32", "ReduceL2", false,{ 3, 2, 2 },{},{ 2 },{ 3, 2 },{ 2.23606798f, 5.f, 7.81024968f, 10.63014581f, 13.45362405f, 16.2788206f } },
- reduce_test_params{ "GPU", "FP32", "ReduceL2", false,{ 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 25.49509757f } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSum", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSum", true,{ 3, 2, 2 },{ },{ 1 },{ 3, 1, 2 },{ } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSum", false,{ 3, 2, 2 },{ },{ 1 },{ 3, 2 },{ } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSum", false,{ 3, 2, 2 },{ },{ 0, 1, 2 },{},{ } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", true,{ 5, 5, 2 },{},{ 2 },{ 5, 5, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 20.f, 2.31326175f, 40.00004578f, 2.31326175f, 60.00671387f, 2.31326175f } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 20.f, 2.31326175f, 40.00004578f, 2.31326175f, 60.00671387f, 2.31326175f } },
- reduce_test_params{ "GPU", "FP32", "ReduceLogSumExp", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 60.00671387f } },
- reduce_test_params{ "GPU", "FP32", "ReduceMax", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceMax", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 20, 2, 40, 2, 60, 2 } },
- reduce_test_params{ "GPU", "FP32", "ReduceMax", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 20, 2, 40, 2, 60, 2 } },
- reduce_test_params{ "GPU", "FP32", "ReduceMax", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 60 } },
- reduce_test_params{ "GPU", "FP32", "ReduceMean", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceMean", true, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 12.5f, 1.5f, 35.f, 1.5f, 57.5f, 1.5f } },
- reduce_test_params{ "GPU", "FP32", "ReduceMean", false, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 12.5f, 1.5f, 35.f, 1.5f, 57.5f, 1.5f } },
- reduce_test_params{ "GPU", "FP32", "ReduceMean", false, { 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{ },{ 18.25f } },
- reduce_test_params{ "GPU", "FP32", "ReduceMin", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceMin", true,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 1, 2 },{ 5, 1, 30, 1, 55, 1 } },
- reduce_test_params{ "GPU", "FP32", "ReduceMin", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 1 },{ 3, 2 },{ 5, 1, 30, 1, 55, 1 } },
- reduce_test_params{ "GPU", "FP32", "ReduceMin", false,{ 3, 2, 2 },{ 5, 1, 20, 2, 30, 1, 40, 2, 55, 1, 60, 2 },{ 0, 1, 2 },{},{ 1 } },
- reduce_test_params{ "GPU", "FP32", "ReduceOr", true,{ 2, 2, 2 },{1, 0, 1, 1, 0, 0, 1, 0},{ 2 },{ 2, 2, 1 },{1, 1, 0, 1 } },
- reduce_test_params{ "GPU", "FP32", "ReduceOr", false, { 2, 2, 2 },{},{ 0, 1, 2 },{ },{ 1 } },
- reduce_test_params{ "GPU", "FP32", "ReduceProd", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceProd", true,{ 3, 2, 2 },{},{ 1 },{ 3, 1, 2 },{ 3, 8, 35, 48, 99, 120 } },
- reduce_test_params{ "GPU", "FP32", "ReduceProd", false,{ 3, 2, 2 },{},{ 1 },{ 3, 2 },{ 3, 8, 35, 48, 99, 120 } },
- reduce_test_params{ "GPU", "FP32", "ReduceProd", false,{ 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 4.790016e+08 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", true,{ 10, 10, 2 },{},{ 2 },{ 10, 10, 1 },{} },
- reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", true, { 3, 2, 2 },{},{ 1 },{ 3, 1, 2 },{ 10, 20, 74, 100, 202, 244 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", false, { 3, 2, 2 },{},{ 1 },{ 3, 2 },{ 10, 20, 74, 100, 202, 244 } },
- reduce_test_params{ "GPU", "FP32", "ReduceSumSquare", false, { 3, 2, 2 },{},{ 0, 1, 2 },{ },{ 650 } }
-));
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "reduce_kernel_b_fs_yx_fsv16.h"
+#include "kernel_selector_utils.h"
+#include <vector>
+#include <string>
+#include "common_tools.h"
+
+namespace kernel_selector {
+
+static const size_t SIMD = 16;
+using NDims = std::vector<kernel_selector::Tensor::Dim>;
+
+static size_t calc_read_offset(const reduce_params& params) {
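+    // Pick the per-work-item vector width so that each block read covers 16 bytes (4 x 4-byte, 8 x 2-byte or 16 x 1-byte elements).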
+ auto read_offset = 1;
+ if (BytesPerElement(params.inputs[0].GetDType()) == 4)
+ read_offset = 4;
+ else if (BytesPerElement(params.inputs[0].GetDType()) == 2)
+ read_offset = 8;
+ else if (BytesPerElement(params.inputs[0].GetDType()) == 1)
+ read_offset = 16;
+ return read_offset;
+}
+
+static NDims calc_in_dims(const reduce_params& params) {
+ auto input = params.inputs[0];
+ auto in_dims = input.GetDims();
+ auto reduce_axes = params.reduceAxes;
+
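+    // reduceAxes use cldnn axis ids (0 = b, 1 = f, 2 = x, 3 = y); remap them onto the reversed (b, f, y, x) dims and collapse each reduced dim to 1.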
+ std::vector<size_t> ordered_axes = {0, 1, 3, 2};
+ std::reverse(in_dims.begin(), in_dims.end());
+    for (size_t a = 0; a < reduce_axes.size(); a++) {
+        in_dims[ordered_axes[reduce_axes[a]]].v = 1;
+ }
+
+ return in_dims;
+}
+
+ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
+ ParamsKey k;
+ k.EnableInputDataType(Datatype::F16);
+ k.EnableInputDataType(Datatype::F32);
+ k.EnableInputDataType(Datatype::INT32);
+ k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
+ k.EnableOutputDataType(Datatype::F16);
+ k.EnableOutputDataType(Datatype::F32);
+ k.EnableOutputDataType(Datatype::INT32);
+ k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
+ k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableTensorOffset();
+ k.EnableTensorPitches();
+ k.EnableBatching();
+ k.EnableDifferentTypes();
+ return k;
+}
+
+CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& params, const optional_params&) const {
+ CommonDispatchData runInfo;
+
+ auto in_dims = calc_in_dims(params);
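+    // GWS: 16 SIMD lanes; X split into READ_OFFSET-wide blocks times Y; feature slices of 16 times batch.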
+ std::vector<size_t> global = {16,
+ CeilDiv(in_dims[3].v, calc_read_offset(params)) * in_dims[2].v, // X, Y
+ CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v}; // F, B
+
+ runInfo.gws0 = global[0];
+ runInfo.gws1 = global[1];
+ runInfo.gws2 = global[2];
+
+ runInfo.lws0 = SIMD;
+ runInfo.lws1 = 1;
+ runInfo.lws2 = 1;
+
+ return runInfo;
+}
+
+JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& params) const {
+ auto jit = ReduceKernelBase::GetJitConstants(params);
+ auto in_dims = calc_in_dims(params);
+ auto read_offset = calc_read_offset(params);
+
+    // Output sizes that hold for both keep_dims = true and keep_dims = false
+ jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_X", in_dims[3].v));
+ jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_Y", in_dims[2].v));
+ jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_FEATURE_NUM", in_dims[1].v));
+ jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_BATCH_NUM", in_dims[0].v));
+ jit.AddConstant(MakeJitConstant("READ_OFFSET", read_offset));
+ jit.AddConstant(MakeJitConstant("BLOCK_READ(ptr,offset)", "DT_INPUT_BLOCK_READ" + std::to_string(read_offset) + "(ptr,offset)"));
+ jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+ jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+ jit.Merge(MakeTypeJitConstants(GetFinalAccumulatorType(params), "FINAL_ACCUMULATOR"));
+
+ if (!params.fused_ops.empty()) {
+ auto input_dt = GetActivationType(params);
+ std::vector<std::string> idx_order = {"b", "f", "y", "x"};
+ std::string var_name = "reduce_result";
+
+        bool cant_handle_vec16 = read_offset > 8;
+ size_t vec_size = cant_handle_vec16 ? 8 : read_offset;
+
+ FusedOpsConfiguration conf_scalar = {"_SCALAR",
+ idx_order,
+ var_name,
+ input_dt,
+ 1,
+ LoadType::LT_ALIGNED_READ,
+ BoundaryCheck::DISABLED,
+ IndexType::TENSOR_COORD,
+ Tensor::DataChannelName::X};
+
+ if (cant_handle_vec16) {
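+            // A 16-element result vector is processed as two 8-element halves: fused ops are applied to .lo and .hi separately and recombined.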
+ FusedOpsConfiguration conf_vector_1 = {"_VECTOR_1",
+ idx_order,
+ var_name+".lo",
+ input_dt,
+ vec_size,
+ LoadType::LT_ALIGNED_READ,
+ BoundaryCheck::DISABLED,
+ IndexType::TENSOR_COORD,
+ Tensor::DataChannelName::X};
+
+ std::vector<std::string> idx_order_vec_2 = {"b", "f", "y", "x + 8"};
+ FusedOpsConfiguration conf_vector_2 = {"_VECTOR_2",
+ idx_order_vec_2,
+ var_name+".hi",
+ input_dt,
+ vec_size,
+ LoadType::LT_ALIGNED_READ,
+ BoundaryCheck::DISABLED,
+ IndexType::TENSOR_COORD,
+ Tensor::DataChannelName::X};
+
+ jit.AddConstant(MakeJitConstant("FUSED_OPS_VECTOR", "{FUSED_OPS_VECTOR_1;final_result.lo=FUSED_OPS_RESULT_VECTOR_1;} {FUSED_OPS_VECTOR_2;final_result.hi=FUSED_OPS_RESULT_VECTOR_2;}"));
+ jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT_VECTOR", "final_result"));
+ jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vector_1, conf_vector_2}));
+ } else {
+ FusedOpsConfiguration conf_vector = {"_VECTOR",
+ idx_order,
+ var_name,
+ input_dt,
+ vec_size,
+ LoadType::LT_ALIGNED_READ,
+ BoundaryCheck::DISABLED,
+ IndexType::TENSOR_COORD,
+ Tensor::DataChannelName::X};
+
+ jit.Merge(MakeFusedOpsJitConstants(params, {conf_vector, conf_scalar}));
+ }
+ }
+
+ return jit;
+}
+
+KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, const optional_params& options) const {
+ return GetCommonKernelsData(params, options, FORCE_PRIORITY_6);
+}
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "reduce_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+class ReduceKernel_b_fs_yx_fsv16 : public ReduceKernelBase {
+public:
+ ReduceKernel_b_fs_yx_fsv16() : ReduceKernelBase("reduce_gpu_b_fs_yx_fsv16") {}
+ virtual ~ReduceKernel_b_fs_yx_fsv16() {}
+ virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const;
+ JitConstants GetJitConstants(const reduce_params& params) const override;
+ KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+ ParamsKey GetSupportedKey() const override;
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return { FusedOpType::QUANTIZE,
+ FusedOpType::SCALE,
+ FusedOpType::ELTWISE,
+ FusedOpType::ACTIVATION };
+ }
+};
+} // namespace kernel_selector
--- /dev/null
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "reduce_kernel_base.h"
+#include "kernel_selector_utils.h"
+#include <vector>
+#include <string>
+#include "common_tools.h"
+
+namespace kernel_selector {
+
+bool ReduceKernelBase::Validate(const Params& p, const optional_params&) const {
+ auto& params = dynamic_cast<const reduce_params&>(p);
+
+ if (params.GetType() != KernelType::REDUCE) {
+ return false;
+ }
+
+ for (auto& fused_op : params.fused_ops) {
+ if (!IsFusedPrimitiveSupported(fused_op))
+ return false;
+ }
+
+ return true;
+}
+
+JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) const {
+ JitConstants jit = MakeBaseParamsJitConstants(params);
+
+ jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.output.LogicalSize()));
+ jit.AddConstant(MakeJitConstant("REDUCE_" + toString(params.reduceMode) + "_MODE", 1));
+ jit.AddConstant(MakeJitConstant("KEEP_DIMS", params.keepDims));
+
+ auto inputDims = params.inputs[0].LogicalDims();
+ std::reverse(inputDims.begin(), inputDims.end());
+
+ auto convertAxesToIE = [&]() -> std::vector<int32_t> {
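+        // Map cldnn axis ids (0 = b, 1 = f, 2 = x, 3 = y, 4 = z, 5 = w) to the position of that dimension in the bf[w][z]yx order, which depends on the tensor rank.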
+ std::vector<int32_t> res;
+ auto sz = inputDims.size();
+
+ for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
+ switch (params.reduceAxes[i]) {
+ case 0:
+ res.push_back(0);
+ break;
+ case 1:
+ res.push_back(1);
+ break;
+ case 2:
+ res.push_back(sz == 6 ? 5 : sz == 5 ? 4 : 3);
+ break;
+ case 3:
+ res.push_back(sz == 6 ? 4 : sz == 5 ? 3 : 2);
+ break;
+ case 4:
+ res.push_back(sz == 6 ? 3 : 2);
+ break;
+ case 5:
+ res.push_back(2);
+ break;
+ }
+ }
+ return res;
+ };
+
+ auto getDimSizeNameByNum = [&](size_t dim) -> std::string {
+ if (params.inputs[0].Dimentions() == 6) {
+ switch (dim) {
+ case 0:
+ return "BATCH_NUM";
+ case 1:
+ return "FEATURE_NUM";
+ case 2:
+ return "SIZE_W";
+ case 3:
+ return "SIZE_Z";
+ case 4:
+ return "SIZE_Y";
+ case 5:
+ return "SIZE_X";
+ }
+ } else if (params.inputs[0].Dimentions() == 5) {
+ switch (dim) {
+ case 0:
+ return "BATCH_NUM";
+ case 1:
+ return "FEATURE_NUM";
+ case 2:
+ return "SIZE_Z";
+ case 3:
+ return "SIZE_Y";
+ case 4:
+ return "SIZE_X";
+ }
+ } else if (params.inputs[0].Dimentions() == 4) {
+ switch (dim) {
+ case 0:
+ return "BATCH_NUM";
+ case 1:
+ return "FEATURE_NUM";
+ case 2:
+ return "SIZE_Y";
+ case 3:
+ return "SIZE_X";
+ }
+ }
+ return "";
+ };
+
+ auto convertedAxes = convertAxesToIE();
+
+ std::string divider;
+ for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
+ divider += "INPUT0_" + getDimSizeNameByNum(convertedAxes[i]);
+        if (i + 1 < params.reduceAxes.size())
+            divider += "*";
+ }
+ jit.AddConstant(MakeJitConstant("DIVIDER", divider));
+
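+    // For every kept dimension emit a <DIM>_IDX_COMP(index) macro that extracts that dimension's coordinate from the linear output index.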
+ const size_t kept_dims = inputDims.size() - params.reduceAxes.size();
+ if (kept_dims == 1) {
+ for (size_t i = 0; i < inputDims.size(); ++i)
+ if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end())
+ jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "index"));
+ } else {
+ size_t kept_cnt = 0;
+ for (size_t i = 0; i < inputDims.size(); ++i) {
+ if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end()) {
+ if (kept_cnt == 0) {
+ std::string str = "(index ";
+ for (size_t j = i + 1; j < inputDims.size(); ++j) {
+ if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
+ str += "/ INPUT0_" + getDimSizeNameByNum(j);
+ }
+ }
+ str += ")";
+ jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
+ } else if (kept_cnt == kept_dims - 1) {
+ jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)",
+ "(index % INPUT0_" + getDimSizeNameByNum(i) + ")"));
+ } else {
+ std::string str = "(index ";
+ for (size_t j = i + 1; j < inputDims.size(); ++j) {
+ if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
+ str += "/ INPUT0_" + getDimSizeNameByNum(j);
+ }
+ }
+ str += " % INPUT0_" + getDimSizeNameByNum(i) + ")";
+ jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
+ }
+ kept_cnt += 1;
+ }
+ }
+ }
+
+ for (size_t a = 0; a < params.reduceAxes.size(); a++) {
+ switch (params.reduceAxes[a]) {
+ case 0:
+ jit.AddConstant(MakeJitConstant("REDUCE_BATCH", 1));
+ break;
+ case 1:
+ jit.AddConstant(MakeJitConstant("REDUCE_FEATURE", 1));
+ break;
+ case 2:
+ jit.AddConstant(MakeJitConstant("REDUCE_X", 1));
+ break;
+ case 3:
+ jit.AddConstant(MakeJitConstant("REDUCE_Y", 1));
+ break;
+ case 4:
+ jit.AddConstant(MakeJitConstant("REDUCE_Z", 1));
+ break;
+ case 5:
+ jit.AddConstant(MakeJitConstant("REDUCE_W", 1));
+ break;
+ }
+ }
+
+ return jit;
+}
+
+Datatype ReduceKernelBase::GetAccumulatorType(const reduce_params& params) const {
+ const auto& input_dt = params.inputs[0].GetDType();
+ const auto& reduce_mode = params.reduceMode;
+
+ if (reduce_mode == ReduceMode::MAX || reduce_mode == ReduceMode::MIN) {
+ return input_dt;
+ } else {
+ switch (input_dt) {
+ case Datatype::F32: return Datatype::F32;
+ case Datatype::F16: return Datatype::F32;
+ case Datatype::INT8: return Datatype::INT32;
+ case Datatype::UINT8: return Datatype::INT32;
+ default: return Datatype::F32;
+ }
+ }
+}
+
+Datatype ReduceKernelBase::GetFinalAccumulatorType(const reduce_params& params) const {
+ const auto& reduce_mode = params.reduceMode;
+
+ if (reduce_mode == ReduceMode::MEAN || reduce_mode == ReduceMode::LOG_SUM_EXP ||
+ reduce_mode == ReduceMode::LOG_SUM || reduce_mode == ReduceMode::L2 || reduce_mode == ReduceMode::L1) {
+ return Datatype::F32;
+ } else
+ return GetAccumulatorType(params);
+}
+
+Datatype ReduceKernelBase::GetActivationType(const reduce_params& params) const {
+ if (params.output.GetDType() == Datatype::F16)
+ return Datatype::F16;
+ else
+ return Datatype::F32;
+}
+
+KernelsData ReduceKernelBase::GetCommonKernelsData(const Params& p,
+ const optional_params& options,
+ float estimatedTime) const {
+ if (!Validate(p, options)) {
+ return {};
+ }
+
+ const reduce_params& params = static_cast<const reduce_params&>(p);
+ DispatchData runInfo = SetDefault(params, options);
+
+ KernelData kd = KernelData::Default<reduce_params>(params);
+
+ auto cldnn_jit = GetJitConstants(params);
+ auto entry_point = GetEntryPoint(kernelName, params.layerID, options);
+ auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+ auto& kernel = kd.kernels[0];
+ FillCLKernelData(kernel,
+ runInfo,
+ params.engineInfo,
+ kernelName,
+ jit,
+ entry_point,
+ DEFAULT,
+ false,
+ false,
+ 1,
+ GetFusedPrimitiveInputsCount(params));
+ kd.estimatedTime = estimatedTime;
+
+ return {kd};
+}
+} // namespace kernel_selector
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "common_kernel_base.h"
+#include "kernel_selector_params.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// reduce_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct reduce_params : public base_params {
+ reduce_params() : base_params(KernelType::REDUCE), reduceMode(ReduceMode::MAX), keepDims(0) {}
+
+ ReduceMode reduceMode;
+ std::vector<uint16_t> reduceAxes;
+ int32_t keepDims;
+
+ virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// reduce_optional_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct reduce_optional_params : optional_params {
+ reduce_optional_params() : optional_params(KernelType::REDUCE) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// ReduceKernelBase
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class ReduceKernelBase : public common_kernel_base {
+public:
+ using common_kernel_base::common_kernel_base;
+ using DispatchData = CommonDispatchData;
+
+ virtual ~ReduceKernelBase() {}
+
+protected:
+ bool Validate(const Params&, const optional_params&) const override;
+ virtual JitConstants GetJitConstants(const reduce_params& params) const;
+ virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const = 0;
+ Datatype GetAccumulatorType(const reduce_params& p) const;
+ Datatype GetFinalAccumulatorType(const reduce_params& p) const;
+ Datatype GetActivationType(const reduce_params& params) const;
+ KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
+};
+} // namespace kernel_selector
#include "kernel_selector_utils.h"
#include <vector>
#include <string>
+#include "common_tools.h"
namespace kernel_selector {
ParamsKey ReduceKernelRef::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
-
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
-
- k.EnableInputLayout(DataLayout::bfyx);
- k.EnableOutputLayout(DataLayout::bfyx);
- k.EnableInputLayout(DataLayout::bfzyx);
- k.EnableOutputLayout(DataLayout::bfzyx);
- k.EnableInputLayout(DataLayout::bfwzyx);
- k.EnableOutputLayout(DataLayout::bfwzyx);
+ k.EnableAllInputLayout();
+ k.EnableAllOutputLayout();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
CommonDispatchData runInfo;
- std::vector<size_t> global = {params.output.LogicalSize(), 1, 1};
+ std::vector<size_t> global = {params.output.X().v * params.output.Y().v,
+ params.output.Z().v * params.output.W().v,
+ params.output.Batch().v * params.output.Feature().v};
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
}
JitConstants ReduceKernelRef::GetJitConstants(const reduce_params& params) const {
- JitConstants jit = MakeBaseParamsJitConstants(params);
-
- jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.output.LogicalSize()));
- jit.AddConstant(MakeJitConstant("REDUCE_" + toString(params.reduceMode) + "_MODE", 1));
- jit.AddConstant(MakeJitConstant("KEEP_DIMS", params.keepDims));
-
- auto inputDims = params.inputs[0].LogicalDims();
- std::reverse(inputDims.begin(), inputDims.end());
-
- auto convertAxesToIE = [&]() -> std::vector<int32_t> {
- std::vector<int32_t> res;
- auto sz = inputDims.size();
-
- for (size_t i = 0; i < params.reduceAxes.size(); ++i) {
- switch (params.reduceAxes[i]) {
- case 0: res.push_back(0); break;
- case 1: res.push_back(1); break;
- case 2: res.push_back(sz == 6 ? 5 : sz == 5 ? 4 : 3); break;
- case 3: res.push_back(sz == 6 ? 4 : sz == 5 ? 3 : 2); break;
- case 4: res.push_back(sz == 6 ? 3 : 2); break;
- case 5: res.push_back(2); break;
- }
- }
- return res;
- };
-
- auto getDimSizeNameByNum = [&](size_t dim) -> std::string {
- if (params.inputs[0].GetLayout() == DataLayout::bfwzyx) {
- switch (dim) {
- case 0: return "BATCH_NUM";
- case 1: return "FEATURE_NUM";
- case 2: return "SIZE_W";
- case 3: return "SIZE_Z";
- case 4: return "SIZE_Y";
- case 5: return "SIZE_X";
- }
- } else if (params.inputs[0].GetLayout() == DataLayout::bfzyx) {
- switch (dim) {
- case 0: return "BATCH_NUM";
- case 1: return "FEATURE_NUM";
- case 2: return "SIZE_Z";
- case 3: return "SIZE_Y";
- case 4: return "SIZE_X";
- }
- } else if (params.inputs[0].GetLayout() == DataLayout::bfyx) {
- switch (dim) {
- case 0: return "BATCH_NUM";
- case 1: return "FEATURE_NUM";
- case 2: return "SIZE_Y";
- case 3: return "SIZE_X";
- }
- }
- return "";
- };
-
- auto convertedAxes = convertAxesToIE();
-
- const size_t kept_dims = inputDims.size() - params.reduceAxes.size();
- if (kept_dims == 1) {
- for (size_t i = 0; i < inputDims.size(); ++i)
- if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end())
- jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "index"));
- } else {
- size_t kept_cnt = 0;
- for (size_t i = 0; i < inputDims.size(); ++i) {
- if (std::find(convertedAxes.begin(), convertedAxes.end(), i) == convertedAxes.end()) {
- if (kept_cnt == 0) {
- std::string str = "(index ";
- for (size_t j = i + 1; j < inputDims.size(); ++j) {
- if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
- str += "/ INPUT0_" + getDimSizeNameByNum(j);
- }
- }
- str += ")";
- jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
- } else if (kept_cnt == kept_dims - 1) {
- jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", "(index % INPUT0_" + getDimSizeNameByNum(i) + ")"));
- } else {
- std::string str = "(index ";
- for (size_t j = i + 1; j < inputDims.size(); ++j) {
- if (std::find(convertedAxes.begin(), convertedAxes.end(), j) == convertedAxes.end()) {
- str += "/ INPUT0_" + getDimSizeNameByNum(j);
- }
- }
- str += " % INPUT0_" + getDimSizeNameByNum(i) + ")";
- jit.AddConstant(MakeJitConstant(getDimSizeNameByNum(i) + "_IDX_COMP(index)", str));
- }
- kept_cnt += 1;
- }
- }
- }
+ auto jit = ReduceKernelBase::GetJitConstants(params);
+
+ jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+ jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+ jit.Merge(MakeTypeJitConstants(GetFinalAccumulatorType(params), "FINAL_ACCUMULATOR"));
+
+ if (!params.fused_ops.empty()) {
+ auto input_dt = GetActivationType(params);
- for (size_t a = 0; a < params.reduceAxes.size(); a++) {
- switch (params.reduceAxes[a]) {
- case 0: jit.AddConstant(MakeJitConstant("REDUCE_BATCH", 1)); break;
- case 1: jit.AddConstant(MakeJitConstant("REDUCE_FEATURE", 1)); break;
- case 2: jit.AddConstant(MakeJitConstant("REDUCE_X", 1)); break;
- case 3: jit.AddConstant(MakeJitConstant("REDUCE_Y", 1)); break;
- case 4: jit.AddConstant(MakeJitConstant("REDUCE_Z", 1)); break;
- case 5: jit.AddConstant(MakeJitConstant("REDUCE_W", 1)); break;
+ std::vector<std::string> idx_order;
+ switch (DataTensor::ChannelsCount(params.inputs[0].GetLayout())) {
+ case 6: idx_order = {"b", "f", "w", "z", "y", "x" }; break;
+ case 5: idx_order = {"b", "f", "z", "y", "x" }; break;
+ default: idx_order = {"b", "f", "y", "x" }; break;
}
+
+ FusedOpsConfiguration conf = {"",
+ idx_order,
+ "reduce_result",
+ input_dt,
+ 1,
+ LoadType::LT_UNALIGNED,
+ BoundaryCheck::DISABLED,
+ IndexType::TENSOR_COORD,
+ Tensor::DataChannelName::X};
+
+ jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}
return jit;
}
KernelsData ReduceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
- KernelData kd = KernelData::Default<reduce_params>(params);
- reduce_params& newParams = *static_cast<reduce_params*>(kd.params.get());
-
- assert(params.GetType() == KernelType::REDUCE);
-
- auto runInfo = SetDefault(newParams, options);
- auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
- auto cldnn_jit = GetJitConstants(newParams);
- std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
- auto& kernel = kd.kernels[0];
-
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-
- kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
- return {kd};
+ return GetCommonKernelsData(params, options, DONT_USE_IF_HAVE_SOMETHING_ELSE);
}
+
} // namespace kernel_selector
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#pragma once
-#include "common_kernel_base.h"
+#include "reduce_kernel_base.h"
#include <vector>
namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// reduce_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct reduce_params : public base_params {
- reduce_params() : base_params(KernelType::REDUCE), reduceMode(ReduceMode::MAX), keepDims(0) {}
-
- ReduceMode reduceMode;
- std::vector<uint16_t> reduceAxes;
- int32_t keepDims;
-
- virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// reduce_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct reduce_optional_params : optional_params {
- reduce_optional_params() : optional_params(KernelType::REDUCE) {}
-};
-
-class ReduceKernelRef : public common_kernel_base {
+class ReduceKernelRef : public ReduceKernelBase {
public:
- ReduceKernelRef() : common_kernel_base("reduce_ref") {}
+ ReduceKernelRef() : ReduceKernelBase("reduce_ref") {}
virtual ~ReduceKernelRef() {}
- virtual JitConstants GetJitConstants(const reduce_params& params) const;
virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
+ JitConstants GetJitConstants(const reduce_params& params) const override;
+ std::vector<FusedOpType> GetSupportedFusedOps() const override {
+ return { FusedOpType::QUANTIZE,
+ FusedOpType::SCALE,
+ FusedOpType::ELTWISE,
+ FusedOpType::ACTIVATION };
+ }
};
} // namespace kernel_selector
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#include "reduce_kernel_selector.h"
#include "reduce_kernel_ref.h"
+#include "reduce_kernel_b_fs_yx_fsv16.h"
namespace kernel_selector {
-reduce_kernel_selector::reduce_kernel_selector() { Attach<ReduceKernelRef>(); }
+reduce_kernel_selector::reduce_kernel_selector() {
+ Attach<ReduceKernelRef>();
+ Attach<ReduceKernel_b_fs_yx_fsv16>();
+}
KernelsData reduce_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::REDUCE);
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/include_all.cl"
+
+#define SIMD 16
+#define FSV 16
+#define unroll_for __attribute__((opencl_unroll_hint(READ_OFFSET))) for
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+#if !defined REDUCE_BATCH
+ #define REDUCE_BATCH 0
+#endif
+#if !defined REDUCE_FEATURE
+ #define REDUCE_FEATURE 0
+#endif
+#if !defined REDUCE_Y
+ #define REDUCE_Y 0
+#endif
+#if !defined REDUCE_X
+ #define REDUCE_X 0
+#endif
+
+#define INPUT_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_OFFSET)
+
+#define ACCUMULATOR_VEC MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, READ_OFFSET)
+#define TO_ACCUMULATOR_VEC CAT(convert_, ACCUMULATOR_VEC)
+#define FINAL_ACCUMULATOR_VEC MAKE_VECTOR_TYPE(FINAL_ACCUMULATOR_TYPE, READ_OFFSET)
+
+#define ACTIVATION_VEC MAKE_VECTOR_TYPE(ACTIVATION_TYPE, READ_OFFSET)
+#define TO_ACTIVATION_VEC CAT(convert_, ACTIVATION_VEC)
+
+#define OUTPUT_VEC MAKE_VECTOR_TYPE(OUTPUT_TYPE, READ_OFFSET)
+#define TO_OUTPUT_VEC CAT(convert_, OUTPUT_VEC)
+
+#define REDUCE_BFY_BY_FY_Y REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y && !REDUCE_X || REDUCE_BATCH && REDUCE_Y && !REDUCE_FEATURE && !REDUCE_X || \
+                           REDUCE_FEATURE && REDUCE_Y && !REDUCE_BATCH && !REDUCE_X || REDUCE_Y && !REDUCE_BATCH && !REDUCE_FEATURE && !REDUCE_X
+
+#define REDUCE_F REDUCE_FEATURE && !REDUCE_BATCH && !REDUCE_Y && !REDUCE_X
+
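+// In b_fs_yx_fsv16 the feature dimension is spread across the 16 SIMD lanes, so reducing over features requires a cross-lane (sub-group) reduction.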
+#define NEED_SUB_GROUP_REDUCE REDUCE_FEATURE
+
+#if REDUCE_MAX_MODE
+ #define INIT_VAL ACCUMULATOR_VAL_MIN
+ #define INPUT_INIT_VAL INPUT0_VAL_MIN
+#elif REDUCE_MIN_MODE
+ #define INIT_VAL ACCUMULATOR_VAL_MAX
+ #define INPUT_INIT_VAL INPUT0_VAL_MAX
+#elif REDUCE_PROD_MODE || REDUCE_AND_MODE
+ #define INIT_VAL ACCUMULATOR_VAL_ONE
+ #define INPUT_INIT_VAL INPUT0_VAL_ONE
+#else
+ #define INIT_VAL ACCUMULATOR_VAL_ZERO
+ #define INPUT_INIT_VAL INPUT0_VAL_ZERO
+#endif
+
+inline ACCUMULATOR_TYPE FUNC(apply_reduce)(ACCUMULATOR_TYPE acc, ACCUMULATOR_TYPE input) {
+ #if REDUCE_SUM_MODE || REDUCE_MEAN_MODE || REDUCE_LOG_SUM_MODE
+ acc += input;
+ #elif REDUCE_MAX_MODE
+ acc = ACCUMULATOR_MAX_FUNC(acc, input);
+ #elif REDUCE_MIN_MODE
+ acc = ACCUMULATOR_MIN_FUNC(acc, input);
+ #elif REDUCE_PROD_MODE
+ acc *= input;
+ #elif REDUCE_AND_MODE
+ acc = acc && input;
+ #elif REDUCE_OR_MODE
+ acc = acc || input;
+ #elif REDUCE_SUM_SQUARE_MODE || REDUCE_L2_MODE
+ acc += input * input;
+ #elif REDUCE_L1_MODE
+ #if !INPUT0_IS_FP
+ acc += TO_ACCUMULATOR_TYPE(fabs(TO_FINAL_ACCUMULATOR_TYPE(input)));
+ #else
+ acc += fabs(input);
+ #endif
+ #elif REDUCE_LOG_SUM_EXP_MODE
+ #if !INPUT0_IS_FP
+ acc += TO_ACCUMULATOR_TYPE(exp(TO_FINAL_ACCUMULATOR_TYPE(input)));
+ #else
+ acc += exp(input);
+ #endif
+ #endif
+
+ return acc;
+}
+
+inline ACCUMULATOR_TYPE FUNC(sub_group_reduce)(ACCUMULATOR_TYPE acc) {
+ #if NEED_SUB_GROUP_REDUCE
+ #if REDUCE_SUM_MODE || REDUCE_MEAN_MODE || REDUCE_LOG_SUM_MODE
+ acc = sub_group_reduce_add(acc);
+ #elif REDUCE_MAX_MODE
+ acc = sub_group_reduce_max(acc);
+ #elif REDUCE_MIN_MODE
+ acc = sub_group_reduce_min(acc);
+ #elif REDUCE_PROD_MODE
+ ACCUMULATOR_TYPE next = ACCUMULATOR_VAL_ONE;
+ acc *= intel_sub_group_shuffle_down(acc, next, 8);
+ acc *= intel_sub_group_shuffle_down(acc, next, 4);
+ acc *= intel_sub_group_shuffle_down(acc, next, 2);
+ acc *= intel_sub_group_shuffle_down(acc, next, 1);
+ acc = intel_sub_group_shuffle(acc, 0);
+ #elif REDUCE_AND_MODE
+ acc = sub_group_all(acc);
+ #elif REDUCE_OR_MODE
+ acc = sub_group_any(acc);
+ #elif REDUCE_SUM_SQUARE_MODE || REDUCE_L2_MODE
+ acc = sub_group_reduce_add(acc);
+ #elif REDUCE_L1_MODE
+ acc = sub_group_reduce_add(acc);
+ #elif REDUCE_LOG_SUM_EXP_MODE
+ acc = sub_group_reduce_add(acc);
+ #endif
+ #endif
+
+ return acc;
+}
+
+inline FINAL_ACCUMULATOR_TYPE FUNC(final_reduce)(FINAL_ACCUMULATOR_TYPE acc) {
+ #if REDUCE_MEAN_MODE
+ acc /= DIVIDER;
+ #elif REDUCE_L2_MODE
+ acc = sqrt(acc);
+ #elif REDUCE_LOG_SUM_MODE || REDUCE_LOG_SUM_EXP_MODE
+ acc = log(acc);
+ #endif
+
+ return acc;
+}
+
+inline uint FUNC(calc_linear_offset)(uint b, uint f, uint y, uint x) {
+ uint index = b * COMMON_OUTPUT_SIZE_X * COMMON_OUTPUT_SIZE_Y * COMMON_OUTPUT_FEATURE_NUM +
+ f * COMMON_OUTPUT_SIZE_X * COMMON_OUTPUT_SIZE_Y +
+ y * COMMON_OUTPUT_SIZE_X +
+ x;
+
+ return index;
+}
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+KERNEL(reduce_fsv16)(
+ const __global INPUT0_TYPE* data,
+ __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+ , FUSED_OPS_DECLS
+#endif
+)
+{
+ const uint xy = (uint)get_global_id(1) * READ_OFFSET;
+ const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
+ const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
+ const uint bf = (uint)get_global_id(2) * SIMD;
+ const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
+ const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
+
+#if KEEP_DIMS
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+#else
+ #if REDUCE_BATCH && REDUCE_FEATURE && REDUCE_X // BFX
+ const uint out_idx = OUTPUT_GET_INDEX(y, x, b, f);
+ #elif REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y // BFY
+ const uint out_idx = OUTPUT_GET_INDEX(x, b, f, y);
+ #elif REDUCE_FEATURE && REDUCE_X // FX
+ const uint out_idx = OUTPUT_GET_INDEX(b, y, f, x);
+ #elif REDUCE_BATCH && REDUCE_X // BX
+ const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), y, b, x);
+ #elif REDUCE_BATCH && REDUCE_Y // BY
+ const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), x, b, y);
+ #elif REDUCE_FEATURE && REDUCE_Y // FY
+ const uint out_idx = OUTPUT_GET_INDEX(b, x, f, y);
+ #elif REDUCE_BATCH && REDUCE_FEATURE // BF
+ const uint out_idx = OUTPUT_GET_INDEX(y, x, b, f);
+ #elif REDUCE_FEATURE // F
+ const uint out_idx = OUTPUT_GET_INDEX(b + get_sub_group_local_id(), y, x, f);
+ #elif REDUCE_BATCH // B
+ const uint out_idx = OUTPUT_GET_INDEX(f + get_sub_group_local_id(), y, x, b);
+ #elif REDUCE_Y // Y
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, x, y);
+ #else
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+ #endif
+#endif
+
+ const uint linear_idx = FUNC_CALL(calc_linear_offset)(b, f, y, x);
+ if (linear_idx >= COMPUTATIONAL_OPERATIONS_NUMBER)
+ return;
+
+ const uint input_x_pitch = FSV;
+ const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
+ const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
+ const uint input_batch_pitch = input_fs_pitch * ((INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM + FSV - 1) / FSV);
+
+ const uint output_x_pitch = FSV;
+ const uint output_y_pitch = FSV * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
+
+#if REDUCE_BATCH
+ const uint batch_out = 0;
+ const uint batch_max_val = INPUT0_BATCH_NUM;
+#else
+ const uint batch_out = BATCH_NUM_IDX_COMP(linear_idx);
+ const uint batch_max_val = batch_out + 1;
+#endif
+
+#if REDUCE_FEATURE
+ const uint feature_out = 0;
+ const uint feature_max_val = INPUT0_FEATURE_NUM;
+#else
+ const uint feature_out = FEATURE_NUM_IDX_COMP(linear_idx);
+ const uint feature_max_val = feature_out + 1;
+#endif
+
+#if REDUCE_Y
+ const uint y_out = 0;
+ const uint y_max_val = INPUT0_SIZE_Y;
+#else
+ const uint y_out = SIZE_Y_IDX_COMP(linear_idx);
+ const uint y_max_val = y_out + 1;
+#endif
+
+#if REDUCE_X
+ const uint x_out = 0;
+ const uint x_max_val = INPUT0_SIZE_X / READ_OFFSET;
+ const uint x_leftover_start = x_max_val * READ_OFFSET;
+ const uint x_leftover_end = INPUT0_SIZE_X;
+#else
+ const uint x_out = SIZE_X_IDX_COMP(linear_idx);
+ const uint x_max_val = x_out + 1;
+ const uint x_leftover_start = x_out;
+ const uint x_leftover_end = x_max_val;
+#endif
+
+uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) * input_fs_pitch + y_out * input_y_pitch + x_out * input_x_pitch;
+
+#if REDUCE_X
+ ACCUMULATOR_TYPE acc = INIT_VAL;
+ for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+ for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
+ for (uint yi = y_out; yi < y_max_val; ++yi) {
+ for (uint xi = x_out; xi < x_max_val; ++xi) {
+ INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
+ #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+ if (fi + FSV <= INPUT0_FEATURE_NUM)
+ input = BLOCK_READ(data, offset);
+ else
+ if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+ for (int i = 0; i < READ_OFFSET; ++i)
+ input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
+ #else
+ input = BLOCK_READ(data, offset);
+ #endif
+ unroll_for (int i = 0; i < READ_OFFSET; ++i)
+ acc = FUNC_CALL(apply_reduce)(acc, input[i]);
+ offset += input_x_pitch * READ_OFFSET;
+ }
+ #if INPUT0_SIZE_X % READ_OFFSET != 0
+ for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
+ INPUT0_TYPE leftovers = INIT_VAL;
+ #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+ if (fi + FSV <= INPUT0_FEATURE_NUM)
+ leftovers = DT_INPUT_BLOCK_READ(data, offset);
+ else
+ if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+ leftovers = data[offset + get_sub_group_local_id()];
+ #else
+ leftovers = DT_INPUT_BLOCK_READ(data, offset);
+ #endif
+ acc = FUNC_CALL(apply_reduce)(acc, leftovers);
+ offset += input_x_pitch;
+ }
+ #endif
+ offset += input_y_pitch - INPUT0_SIZE_X * input_x_pitch;
+ }
+ offset += input_fs_pitch - ((y_max_val - y_out) * input_y_pitch);
+ }
+ offset += input_batch_pitch - ((((feature_max_val - feature_out) + FSV - 1) / FSV) * input_fs_pitch);
+ }
+
+ FINAL_ACCUMULATOR_TYPE final_acc;
+ acc = FUNC_CALL(sub_group_reduce)(acc);
+ final_acc = FUNC_CALL(final_reduce)(TO_FINAL_ACCUMULATOR_TYPE(acc));
+
+ OUTPUT_TYPE final_result;
+ ACTIVATION_TYPE reduce_result = TO_ACTIVATION_TYPE(final_acc);
+ #if HAS_FUSED_OPS
+ FUSED_OPS_SCALAR;
+ final_result = FUSED_OPS_RESULT_SCALAR;
+ #else
+ final_result = TO_OUTPUT_TYPE(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+ #endif
+
+ #if (REDUCE_FEATURE && REDUCE_X || REDUCE_BATCH && REDUCE_X) && !KEEP_DIMS
+ output[out_idx] = final_result;
+ #elif REDUCE_BATCH && REDUCE_Y && REDUCE_X || REDUCE_BATCH && REDUCE_X || REDUCE_Y && REDUCE_X || REDUCE_X && !REDUCE_FEATURE
+ DT_OUTPUT_BLOCK_WRITE(output + out_idx, 0, final_result);
+ #else
+ if (get_sub_group_local_id() == 0)
+ output[out_idx] = final_result;
+ #endif
+#else
+ ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
+ for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+ for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
+ for (uint yi = y_out; yi < y_max_val; ++yi) {
+ for (uint xi = x_out; xi < x_max_val; ++xi) {
+ INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
+ #if (REDUCE_MAX_MODE || REDUCE_MIN_MODE || REDUCE_PROD_MODE || REDUCE_AND_MODE || REDUCE_LOG_SUM_EXP_MODE) && REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+ if (fi + FSV <= INPUT0_FEATURE_NUM)
+ input = BLOCK_READ(data, offset);
+ else
+ if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+ for (int i = 0; i < READ_OFFSET; ++i)
+ input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
+ #else
+ input = BLOCK_READ(data, offset);
+ #endif
+ unroll_for (int i = 0; i < READ_OFFSET; ++i)
+ acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
+ offset += input_x_pitch;
+ }
+ offset += input_y_pitch - (x_max_val - x_out) * input_x_pitch;
+ }
+ offset += input_fs_pitch - ((y_max_val - y_out) * input_y_pitch);
+ }
+ offset += input_batch_pitch - ((((feature_max_val - feature_out) + FSV - 1) / FSV) * input_fs_pitch);
+ }
+
+ FINAL_ACCUMULATOR_VEC final_acc;
+ unroll_for (uint i = 0; i < READ_OFFSET; ++i) {
+ acc[i] = FUNC_CALL(sub_group_reduce)(acc[i]);
+ final_acc[i] = FUNC_CALL(final_reduce)(TO_FINAL_ACCUMULATOR_TYPE(acc[i]));
+ }
+
+ OUTPUT_VEC final_result;
+ ACTIVATION_VEC reduce_result = TO_ACTIVATION_VEC(final_acc);
+
+#if HAS_FUSED_OPS
+ FUSED_OPS_VECTOR;
+ final_result = (OUTPUT_VEC)(FUSED_OPS_RESULT_VECTOR);
+#else
+ final_result = TO_OUTPUT_VEC(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+#endif
+
+ unroll_for (uint i = 0; i < READ_OFFSET; ++i) {
+        if (COMMON_OUTPUT_SIZE_X % READ_OFFSET == 0 || x + i < COMMON_OUTPUT_SIZE_X) {
+ #if REDUCE_BATCH && REDUCE_FEATURE && REDUCE_Y && !REDUCE_X && !KEEP_DIMS
+ output[out_idx + output_x_pitch * i] = final_result[i];
+ #elif REDUCE_FEATURE && REDUCE_Y && !KEEP_DIMS
+ if (get_sub_group_local_id() == 0)
+ output[out_idx + i] = final_result[i];
+ #elif REDUCE_BATCH && REDUCE_Y && !KEEP_DIMS
+ output[out_idx + i] = final_result[i];
+ #elif REDUCE_BATCH && REDUCE_Y && REDUCE_X && !KEEP_DIMS
+ output[out_idx + get_sub_group_local_id() + output_y_pitch * i] = final_result[i];
+ #elif REDUCE_BFY_BY_FY_Y
+ output[out_idx + get_sub_group_local_id() + output_x_pitch * i] = final_result[i];
+ #elif REDUCE_BATCH && REDUCE_FEATURE && !KEEP_DIMS
+ if (get_sub_group_local_id() == 0)
+ output[out_idx + i] = final_result[i];
+ #elif REDUCE_BATCH && !KEEP_DIMS
+ output[out_idx + output_y_pitch * i] = final_result[i];
+ #elif REDUCE_BATCH && !REDUCE_FEATURE
+ DT_OUTPUT_BLOCK_WRITE(output + out_idx + output_x_pitch * i, 0, final_result[i]);
+ #elif REDUCE_BATCH && REDUCE_FEATURE
+ if (get_sub_group_local_id() == 0)
+ output[out_idx + output_x_pitch * i] = final_result[i];
+ #elif REDUCE_F && !KEEP_DIMS
+ if (get_sub_group_local_id() == 0)
+ output[out_idx + output_y_pitch * i] = final_result[i];
+ #elif REDUCE_F
+ if (get_sub_group_local_id() == 0)
+ output[out_idx + output_x_pitch * i] = final_result[i];
+ #endif
+ }
+ }
+#endif
+}
+
+#undef SIMD
+#undef FSV
+#undef unroll_for
+#undef BLOCK_READ
+#undef READ_OFFSET
+#undef INPUT_VEC
+#undef ACCUMULATOR_VEC
+#undef TO_ACCUMULATOR_VEC
+#undef FINAL_ACCUMULATOR_VEC
+#undef ACTIVATION_VEC
+#undef TO_ACTIVATION_VEC
+#undef OUTPUT_VEC
+#undef TO_OUTPUT_VEC
+#undef REDUCE_BFY_BY_FY_Y
+#undef REDUCE_F
+#undef NEED_SUB_GROUP_REDUCE
+#undef INIT_VAL
+#undef INPUT_INIT_VAL
+#undef REDUCE_BATCH
+#undef REDUCE_FEATURE
+#undef REDUCE_Y
+#undef REDUCE_X
#include "include/include_all.cl"
-KERNEL(reduce_vec)(const __global INPUT0_TYPE* data, __global OUTPUT_TYPE* output) {
- const uint out_idx = get_global_id(0);
+inline uint FUNC(calc_linear_offset)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+ uint index = b * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W * OUTPUT_FEATURE_NUM +
+ f * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W +
+ w * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z +
+ z * OUTPUT_SIZE_X * OUTPUT_SIZE_Y +
+ y * OUTPUT_SIZE_X +
+ x;
- if (out_idx >= COMPUTATIONAL_OPERATIONS_NUMBER) return;
+ return index;
+}
+
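+// Reference reduce kernel: one work-item per output element, iterating over the full range of every reduced axis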
+KERNEL(reduce_ref)(
+ const __global INPUT0_TYPE* data,
+ __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+ , FUSED_OPS_DECLS
+#endif
+)
+{
+ const uint xy = (uint)get_global_id(0);
+ const uint wz = (uint)get_global_id(1);
+ const uint bf = (uint)get_global_id(2);
+ const uint x = xy % OUTPUT_SIZE_X;
+ const uint y = xy / OUTPUT_SIZE_X;
+
+ const uint b = bf / OUTPUT_FEATURE_NUM;
+ const uint f = bf % OUTPUT_FEATURE_NUM;
+#if INPUT0_DIMS == 4
+ const uint w = 0;
+ const uint z = 0;
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, y, x);
+#elif INPUT0_DIMS == 5
+ const uint z = wz % OUTPUT_SIZE_Z;
+ const uint w = 0;
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, z, y, x);
+#elif INPUT0_DIMS == 6
+ const uint z = wz % OUTPUT_SIZE_Z;
+ const uint w = wz / OUTPUT_SIZE_Z;
+ const uint out_idx = OUTPUT_GET_INDEX(b, f, w, z, y, x);
+#endif
+
+ const uint linear_idx = FUNC_CALL(calc_linear_offset)(b, f, w, z, y, x);
+ if (linear_idx >= COMPUTATIONAL_OPERATIONS_NUMBER)
+ return;
#ifdef REDUCE_BATCH
const uint batch_out = 0;
const uint batch_max_val = INPUT0_BATCH_NUM;
#else
- const uint batch_out = BATCH_NUM_IDX_COMP(out_idx);
+ const uint batch_out = BATCH_NUM_IDX_COMP(linear_idx);
const uint batch_max_val = batch_out + 1;
#endif
const uint feature_out = 0;
const uint feature_max_val = INPUT0_FEATURE_NUM;
#else
- const uint feature_out = FEATURE_NUM_IDX_COMP(out_idx);
+ const uint feature_out = FEATURE_NUM_IDX_COMP(linear_idx);
const uint feature_max_val = feature_out + 1;
#endif
-#if INPUT0_LAYOUT_BFWZYX
+#if INPUT0_DIMS == 6
#ifdef REDUCE_W
const uint w_out = 0;
const uint w_max_val = INPUT0_SIZE_W;
#else
- const uint w_out = SIZE_W_IDX_COMP(out_idx);
+ const uint w_out = SIZE_W_IDX_COMP(linear_idx);
const uint w_max_val = w_out + 1;
#endif
#else
const uint w_max_val = 1;
#endif
-#if INPUT0_LAYOUT_BFWZYX || INPUT0_LAYOUT_BFZYX
+#if INPUT0_DIMS == 6 || INPUT0_DIMS == 5
#ifdef REDUCE_Z
const uint z_out = 0;
const uint z_max_val = INPUT0_SIZE_Z;
#else
- const uint z_out = SIZE_Z_IDX_COMP(out_idx);
+ const uint z_out = SIZE_Z_IDX_COMP(linear_idx);
const uint z_max_val = z_out + 1;
#endif
#else
const uint y_out = 0;
const uint y_max_val = INPUT0_SIZE_Y;
#else
- const uint y_out = SIZE_Y_IDX_COMP(out_idx);
+ const uint y_out = SIZE_Y_IDX_COMP(linear_idx);
const uint y_max_val = y_out + 1;
#endif
const uint x_out = 0;
const uint x_max_val = INPUT0_SIZE_X;
#else
- const uint x_out = SIZE_X_IDX_COMP(out_idx);
+ const uint x_out = SIZE_X_IDX_COMP(linear_idx);
const uint x_max_val = x_out + 1;
#endif
- OUTPUT_TYPE acc = OUTPUT_VAL_ZERO;
+ ACCUMULATOR_TYPE acc = ACCUMULATOR_VAL_ZERO;
uint counter = 0;
- for (uint b = batch_out; b < batch_max_val; ++b) {
- for (uint f = feature_out; f < feature_max_val; ++f) {
- for (uint w = w_out; w < w_max_val; ++w) {
- for (uint z = z_out; z < z_max_val; ++z) {
- for (uint y = y_out; y < y_max_val; ++y) {
- for (uint x = x_out; x < x_max_val; ++x) {
-#ifdef INPUT0_LAYOUT_BFWZYX
- const uint input_idx = GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x);
-#elif INPUT0_LAYOUT_BFZYX
- const uint input_idx = GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x);
+ for (uint bi = batch_out; bi < batch_max_val; ++bi) {
+ for (uint fi = feature_out; fi < feature_max_val; ++fi) {
+ for (uint wi = w_out; wi < w_max_val; ++wi) {
+ for (uint zi = z_out; zi < z_max_val; ++zi) {
+ for (uint yi = y_out; yi < y_max_val; ++yi) {
+ for (uint xi = x_out; xi < x_max_val; ++xi) {
+#if INPUT0_DIMS == 6
+ const uint input_idx = INPUT0_GET_INDEX(bi, fi, wi, zi, yi, xi);
+#elif INPUT0_DIMS == 5
+ const uint input_idx = INPUT0_GET_INDEX(bi, fi, zi, yi, xi);
#else
- const uint input_idx = GET_DATA_INDEX(INPUT0, b, f, y, x);
+ const uint input_idx = INPUT0_GET_INDEX(bi, fi, yi, xi);
#endif
#ifdef REDUCE_SUM_MODE
acc += data[input_idx];
#elif REDUCE_SUM_SQUARE_MODE
acc += data[input_idx] * data[input_idx];
#elif REDUCE_L1_MODE
+ #if !INPUT0_IS_FP
+ acc += TO_ACCUMULATOR_TYPE(fabs(TO_FINAL_ACCUMULATOR_TYPE(data[input_idx])));
+ #else
acc += fabs(data[input_idx]);
+ #endif
#elif REDUCE_L2_MODE
acc += data[input_idx] * data[input_idx];
#elif REDUCE_LOG_SUM_MODE
acc += data[input_idx];
#elif REDUCE_LOG_SUM_EXP_MODE
+ #if !INPUT0_IS_FP
+ acc += TO_ACCUMULATOR_TYPE(exp(TO_FINAL_ACCUMULATOR_TYPE(data[input_idx])));
+ #else
acc += exp(data[input_idx]);
+ #endif
#endif
counter++;
}
}
}
}
+
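+    // Finalize the accumulated value: mean division, sqrt for L2, log for LOG_SUM / LOG_SUM_EXP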
+ FINAL_ACCUMULATOR_TYPE final_acc = TO_FINAL_ACCUMULATOR_TYPE(acc);
#if REDUCE_MEAN_MODE
- if (counter != 0) acc /= counter;
+ if (counter != 0) final_acc /= counter;
#endif
#if REDUCE_L2_MODE
- acc = sqrt(acc);
+ final_acc = sqrt(final_acc);
#endif
#if REDUCE_LOG_SUM_MODE || REDUCE_LOG_SUM_EXP_MODE
- acc = log(acc);
+ final_acc = log(final_acc);
#endif
- output[out_idx] = acc;
+ OUTPUT_TYPE final_result;
+ ACTIVATION_TYPE reduce_result = TO_ACTIVATION_TYPE(final_acc);
+#if HAS_FUSED_OPS
+ FUSED_OPS;
+ final_result = FUSED_OPS_RESULT;
+#else
+ final_result = TO_OUTPUT_TYPE(ACTIVATION(reduce_result, ACTIVATION_PARAMS));
+#endif
+ output[out_idx] = final_result;
}
std::replace(input_var.begin(), input_var.end(), '[', '_');
std::replace(input_var.begin(), input_var.end(), ']', '_');
std::replace(input_var.begin(), input_var.end(), ' ', '_');
+ std::replace(input_var.begin(), input_var.end(), '.', '_');
return input_var + "_out";
}
{ std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw },
+ // bfwzyx
+ {std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::i32, format::bfwzyx), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::i8, format::bfwzyx), val_fw},
+ {std::make_tuple(engine_types::ocl, data_types::u8, format::bfwzyx), val_fw},
// fs_b_yx_fsv32
{std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
});
#include "kernel_selector_helper.h"
#include "reduce/reduce_kernel_selector.h"
#include "reduce/reduce_kernel_ref.h"
+#include "reduce/reduce_kernel_b_fs_yx_fsv16.h"
#include "error_handler.h"
#include "data_inst.h"
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfzyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw);
implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfwzyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfwzyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfwzyx), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<reduce>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
}
} // namespace detail
#include "cum_sum_inst.h"
#include "embedding_bag_inst.h"
#include "extract_image_patches_inst.h"
+#include "reduce_inst.h"
#include <vector>
#include <list>
#include <memory>
return true;
};
+    auto reduce_supports_fusings = [](reduce_node& node) -> bool {
+        // Fusing on top of reduce is supported only when keep_dims is true
+        return node.as<reduce>().get_primitive()->keep_dims;
+    };
+
auto fuse_activation_f = [&](activation_node& activation_node) {
auto& input_data = activation_node.get_dependency(0);
if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
should_fuse |= input_data.is_type<space_to_batch>();
+ should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
+
if (!should_fuse)
return;
should_fuse |= input_data.is_type<space_to_batch>();
+ should_fuse |= input_data.is_type<reduce>() && reduce_supports_fusings(input_data.as<reduce>());
+
if (!should_fuse)
return;
should_fuse |= input_data.is_type<eltwise>() && quantize_node.get_scale_shift_opt();
+        should_fuse |= input_data.is_type<reduce>() &&
+                       reduce_supports_fusings(input_data.as<reduce>()) &&
+                       quantize_node.get_scale_shift_opt();
+
if (!should_fuse)
return;
(parent1->is_type<space_to_depth>()) ||
(parent1->is_type<gemm>() && gemm_supports_fusings(parent1->as<gemm>())) ||
(parent1->is_type<batch_to_space>()) || (parent1->is_type<space_to_batch>()) ||
- (parent1->is_type<depth_to_space>() && dts_supports_fusings(parent1->as<depth_to_space>()));
+                           (parent1->is_type<depth_to_space>() && dts_supports_fusings(parent1->as<depth_to_space>())) ||
+                           (parent1->is_type<reduce>() && reduce_supports_fusings(parent1->as<reduce>()));
bool can_fuse_parent2 = (parent2->is_type<convolution>() && conv_supports_fusings(parent2->as<convolution>())) ||
(parent2->is_type<mvn>() && mvn_supports_fusings(parent2->as<mvn>())) ||
(parent2->is_type<space_to_depth>()) ||
(parent2->is_type<gemm>() && gemm_supports_fusings(parent2->as<gemm>())) ||
(parent2->is_type<batch_to_space>()) || (parent2->is_type<space_to_batch>()) ||
- (parent2->is_type<depth_to_space>() && dts_supports_fusings(parent2->as<depth_to_space>()));
+                           (parent2->is_type<depth_to_space>() && dts_supports_fusings(parent2->as<depth_to_space>())) ||
+                           (parent2->is_type<reduce>() && reduce_supports_fusings(parent2->as<reduce>()));
std::vector<bool> can_fuse_parents = { can_fuse_parent1, can_fuse_parent2 };
#include "reorder_inst.h"
#include "split_inst.h"
#include "mvn_inst.h"
+#include "reduce_inst.h"
#include "to_string_utils.h"
#include "gpu/memory_gpu.h"
prim.as<mvn>().input().get_output_layout().data_type != data_types::i8)
|| prim.as<mvn>().get_primitive()->across_channels) &&
prim.type() != cldnn::arg_max_min::type_id() &&
- prim.type() != cldnn::mutable_data::type_id())
+ prim.type() != cldnn::mutable_data::type_id() &&
+ prim.type() != cldnn::reduce::type_id())
can_use_fsv16 = false;
if (prim.type() == cldnn::quantize::type_id() &&
auto input_layout = node.input(0).get_output_layout();
auto input_format = input_layout.format;
+ auto format_dim = input_format.dimension();
auto output_type = input_layout.data_type;
auto mode = desc->mode;
auto reduce_axes = desc->axes;
-
auto in_dims = input_layout.size.sizes();
+
for (size_t a = 0; a < reduce_axes.size(); a++) {
in_dims[reduce_axes[a]] = 1;
}
+ std::vector<int32_t> updated_dims;
if (!desc->keep_dims) {
- for (size_t a = 0; a < reduce_axes.size(); a++) {
- in_dims.erase(in_dims.begin() + reduce_axes[a]);
- in_dims.push_back(1);
+ // Get unreduced from b-f and x-w range
+ for (size_t b_f_index = 0; b_f_index < 2; b_f_index++) {
+ bool index_to_remove = std::find(reduce_axes.begin(), reduce_axes.end(), b_f_index) != reduce_axes.end();
+ if (!index_to_remove)
+ updated_dims.push_back(in_dims[b_f_index]);
}
+ for (size_t x_w_index = format_dim - 1; x_w_index >= 2; x_w_index--) {
+ bool index_to_remove = std::find(reduce_axes.begin(), reduce_axes.end(), x_w_index) != reduce_axes.end();
+ if (!index_to_remove)
+ updated_dims.push_back(in_dims[x_w_index]);
+ }
+
+            if (format_dim == 4 && reduce_axes.size() == 1)
+ updated_dims.push_back(1);
+ if (updated_dims.size() > 2)
+ std::reverse(updated_dims.begin() + 2, updated_dims.end());
+
+ // Fill updated dims to format_dim size
+ while (updated_dims.size() < format_dim)
+ updated_dims.push_back(1);
+
+ in_dims = std::move(updated_dims);
}
std::vector<reduce_mode> reduce_bool_modes = {reduce_mode::logical_and, reduce_mode::logical_or};
- if (std::find(reduce_bool_modes.begin(), reduce_bool_modes.end(), mode) != reduce_bool_modes.end()) output_type = data_types::i8;
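+    // Logical modes produce i8, while i8/u8 inputs otherwise promote to f32; an explicit output_data_type or a fused op's output type takes precedence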
+ if (std::find(reduce_bool_modes.begin(), reduce_bool_modes.end(), mode) != reduce_bool_modes.end())
+ output_type = data_types::i8;
+ else if (output_type == data_types::i8 || output_type == data_types::u8)
+ output_type = data_types::f32;
+
+ if (desc->output_data_type)
+ output_type = *desc->output_data_type;
+
+ if (node.has_fused_primitives())
+ output_type = node.get_fused_output_layout().data_type;
- if (input_layout.format == format::bfwzyx)
+ if (format_dim == 6)
return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3], in_dims[4], in_dims[5]))};
- else if (input_layout.format == format::bfzyx)
+ else if (format_dim == 5)
return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3], in_dims[4]))};
else
return layout{output_type, input_format, tensor(batch(in_dims[0]), feature(in_dims[1]), spatial(in_dims[2], in_dims[3]))};
#include "api/space_to_depth.hpp"
#include "api/batch_to_space.hpp"
#include "api/space_to_batch.hpp"
+#include "api/reduce.hpp"
#include "test_utils/test_utils.h"
VF<uint8_t> rnd_vec = generate_random_1d<uint8_t>(s.count(), min_random, max_random);
set_values(prim, rnd_vec);
} else if (l.data_type == data_types::f16) {
- VF<uint16_t> rnd_vec = generate_random_1d<uint16_t>(s.count(), min_random, max_random);
+ VF<uint16_t> rnd_vec = generate_random_1d<uint16_t>(s.count(), -1, 1);
set_values(prim, rnd_vec);
} else {
- VF<float> rnd_vec = generate_random_1d<float>(s.count(), min_random, max_random);
+ VF<float> rnd_vec = generate_random_1d<float>(s.count(), -1, 1);
set_values(prim, rnd_vec);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_3in_quantize_i8,
::testing::ValuesIn(std::vector<gemm_test_params>{
- gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 },
gemm_test_params{ CASE_GEMM_3IN_FP16_1, 4, 5 },
gemm_test_params{ CASE_GEMM_3IN_S8S8_1, 4, 5 },
gemm_test_params{ CASE_GEMM_3IN_S8S8_2, 4, 5 },
gemm_test_params{ CASE_GEMM_3IN_S8S8_3, 4, 5 },
+ //gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 },
}), );
class gemm_2in_quantize_u8 : public GemmFusingTest {};
INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_quantize_u8,
::testing::ValuesIn(std::vector<gemm_test_params>{
- gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 },
+ //gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
}), );
class gemm_2in_scale : public GemmFusingTest {};
execute(p);
}
-
INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_scale,
::testing::ValuesIn(std::vector<gemm_test_params>{
gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
eltwise_test_params{CASE_ELTWISE_U8_FP16_2},
eltwise_test_params{CASE_ELTWISE_U8_FP16_3},
}), );
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* ---------------------------------------- Reduce cases ----------------------------------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
+struct reduce_test_params {
+ cldnn::tensor in_shape;
+ cldnn::tensor out_shape;
+ cldnn::data_types data_type;
+ cldnn::format input_format;
+ data_types default_type;
+ cldnn::format default_format;
+ size_t expected_fused_primitives;
+ size_t expected_not_fused_primitives;
+ cldnn::reduce_mode reduce_mode;
+ std::vector<uint16_t> reduce_axes;
+ bool keep_dims;
+ std::string kernel_name;
+};
+
+#define CASE_REDUCE_F32_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_REDUCE_F32_1 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F32_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F32_3 {16, 16, 16, 8, 8, 8}, {16, 16, 16, 8, 8, 8}, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F32_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+#define CASE_REDUCE_F16_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_REDUCE_F16_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F16_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F16_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_F16_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+#define CASE_REDUCE_I32_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_REDUCE_I32_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I32_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I32_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I32_4 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
+
+#define CASE_REDUCE_I8_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_REDUCE_I8_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I8_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I8_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_I8_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+#define CASE_REDUCE_U8_0 {3, 7, 5, 7}, {3, 7, 5, 7}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_REDUCE_U8_1 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_U8_2 {2, 4, 8, 4, 4}, {2, 4, 8, 4, 4}, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_U8_3 {3, 5, 3, 5, 7, 7}, {3, 5, 3, 5, 7, 7}, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
+#define CASE_REDUCE_U8_4 {2, 8, 4, 4}, {2, 8, 4, 4}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+
+
+class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
+public:
+ void execute(reduce_test_params& p) {
+ auto input_prim = get_mem(get_input_layout(p));
+
+ network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+ network network_fused(this->engine, this->topology_fused, bo_fused);
+
+ network_fused.set_input_data("input", input_prim);
+ network_not_fused.set_input_data("input", input_prim);
+
+ compare(network_not_fused, network_fused, p);
+ }
+
+ void update_out_shape(reduce_test_params& p) {
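+        // Collapse every reduced axis to 1 in the expected output shape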
+ for (auto& axis : p.reduce_axes) {
+ switch (axis) {
+ case 0: // batch
+ p.out_shape.batch[0] = 1;
+ break;
+ case 1: // feature
+ p.out_shape.feature[0] = 1;
+ break;
+ case 2: // x
+ p.out_shape.spatial[0] = 1;
+ break;
+ case 3: // y
+ p.out_shape.spatial[1] = 1;
+ break;
+ case 4: // z
+ p.out_shape.spatial[2] = 1;
+ break;
+ case 5: // w
+ p.out_shape.spatial[3] = 1;
+ break;
+ }
+ }
+ }
+
+ layout get_input_layout(reduce_test_params& p) { return layout{p.data_type, p.input_format, p.in_shape}; }
+ layout get_per_channel_layout(reduce_test_params& p) {
+ return layout{p.default_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1}};
+ }
+};
+
+class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
+TEST_P(reduce_eltwise_activation_quantize, basic) {
+ auto p = GetParam();
+ update_out_shape(p);
+ create_topologies(input_layout("input", get_input_layout(p)),
+ data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
+ data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
+ data("out_lo", get_mem(get_single_element_layout(p), -128)),
+ data("out_hi", get_mem(get_single_element_layout(p), 127)),
+ data("eltwise_data", get_mem(get_output_layout(p))),
+ reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
+ eltwise("eltwise", {"reduce", "eltwise_data"}, eltwise_mode::sum, p.default_type),
+ activation("activation", "eltwise", activation_func::relu),
+ quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
+ reorder("output_reorder", "quantize", p.default_format, data_types::f32));
+
+ tolerance = 1.f;
+ execute(p);
+}
+
+TEST_P(reduce_eltwise_activation_quantize, per_channel) {
+ auto p = GetParam();
+ update_out_shape(p);
+ create_topologies(input_layout("input", get_input_layout(p)),
+ data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+ data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+ data("out_lo", get_mem(get_single_element_layout(p), -128)),
+ data("out_hi", get_mem(get_single_element_layout(p), 127)),
+ data("eltwise_data", get_mem(get_output_layout(p))),
+ reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
+ eltwise("eltwise", {"reduce", "eltwise_data"}, eltwise_mode::sum, p.default_type),
+ activation("activation", "eltwise", activation_func::relu),
+ quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8),
+ reorder("output_reorder", "quantize", p.default_format, data_types::f32));
+
+ tolerance = 1.f;
+ execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+ reduce_eltwise_activation_quantize,
+ ::testing::ValuesIn(std::vector<reduce_test_params>{
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_1, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},
+
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_2, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+
+ reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_1, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},
+
+ reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::mean, {reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_f, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, {reduce::along_f, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, {reduce::along_f, reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, {reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, {reduce::along_x, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, {reduce::along_y}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_2, 2, 5, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, {reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, {reduce::along_f}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"}
+ }), );
+
+class reduce_scale_activation : public ReduceFusingTest {};
+TEST_P(reduce_scale_activation, basic) {
+ auto p = GetParam();
+ create_topologies(input_layout("input", get_input_layout(p)),
+ data("scale_data", get_mem(get_single_element_layout(p), -0.125f)),
+ reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
+ scale("scale", "reduce", "scale_data"),
+ activation("activation", "scale", activation_func::cos),
+ reorder("output_reorder", "activation", p.default_format, data_types::f32));
+ tolerance = 1e-02f;
+ execute(p);
+}
+
+TEST_P(reduce_scale_activation, per_channel) {
+ auto p = GetParam();
+ create_topologies(input_layout("input", get_input_layout(p)),
+ data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)),
+ reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims),
+ scale("scale", "reduce", "scale_data"),
+ activation("activation", "scale", activation_func::cos),
+ reorder("output_reorder", "activation", p.default_format, data_types::f32));
+ tolerance = 1e-02f;
+ execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+ reduce_scale_activation,
+ ::testing::ValuesIn(std::vector<reduce_test_params>{
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, {reduce::along_x, reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_2, 2, 4, reduce_mode::mean, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F32_0, 2, 4, reduce_mode::sum, {reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_1, 2, 4, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_b}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, {reduce::along_x, reduce::along_y}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_2, 2, 4, reduce_mode::mean, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ reduce_test_params{CASE_REDUCE_F16_0, 2, 4, reduce_mode::sum, {reduce::along_x}, true, "reduce_gpu_b_fs_yx_fsv16"},
+ }), );
+
+INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
+ reduce_eltwise_activation_quantize,
+ ::testing::ValuesIn(std::vector<reduce_test_params>{
+ // No layout format available for quantize/scale
+ reduce_test_params{CASE_REDUCE_F32_3, 2, 4, reduce_mode::l1, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_F16_3, 2, 4, reduce_mode::min, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I32_2, 2, 4, reduce_mode::max, {reduce::along_x, reduce::along_y}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I32_3, 2, 4, reduce_mode::sum, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_I8_3, 2, 4, reduce_mode::mean, {reduce::along_x}, true, "reduce_ref"},
+ reduce_test_params{CASE_REDUCE_U8_3, 2, 4, reduce_mode::l2, {reduce::along_x}, true, "reduce_ref"}
+ }), );
-/*
-// Copyright (c) 2019 Intel Corporation
+/*
+// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// limitations under the License.
*/
-///////////////////////////////////////////////////////////////////////////////////////////////////
#include <gtest/gtest.h>
#include <api/input_layout.hpp>
#include "api/reduce.hpp"
#include "test_utils/test_utils.h"
#include <api/data.hpp>
#include "test_utils/float16.h"
+#include <cmath>
+#include <algorithm>
using namespace cldnn;
using namespace tests;
+template <typename InputT>
+struct accumulator_type {
+ using type = float;
+};
+
+template <typename InputT>
+struct output_type {
+ using type = float;
+};
+
+template <typename AccT>
+AccT get_min_value() {
+ return std::numeric_limits<AccT>::lowest();
+}
+
+template<>
+int get_min_value<int>() {
+ return std::numeric_limits<int>::min();
+}
+
+template <typename VecT>
+struct Comparator {
+ std::vector<VecT> order;
+ bool operator()(const int lhs, const int rhs) const {
+ auto lhs_index = std::distance(order.begin(), std::find(order.begin(), order.end(), lhs));
+ auto rhs_index = std::distance(order.begin(), std::find(order.begin(), order.end(), rhs));
+ return lhs_index < rhs_index;
+ }
+};
+
+template <typename InputT, typename AccT>
+struct reduce_accumulator {
+ AccT set_accumulator_value(cldnn::reduce_mode reduce_mode) {
+ AccT acc;
+ if (reduce_mode == cldnn::reduce_mode::max)
+ acc = get_min_value<AccT>();
+ else if (reduce_mode == cldnn::reduce_mode::min)
+ acc = std::numeric_limits<AccT>::max();
+ else if (reduce_mode == cldnn::reduce_mode::prod || reduce_mode == cldnn::reduce_mode::logical_and)
+ acc = 1;
+ else
+ acc = 0;
+
+ return acc;
+ };
+
+ AccT accumulate(AccT& acc, AccT& input_val, cldnn::reduce_mode reduce_mode) {
+ if (reduce_mode == cldnn::reduce_mode::sum || reduce_mode == cldnn::reduce_mode::mean ||
+ reduce_mode == cldnn::reduce_mode::log_sum)
+ acc += input_val;
+ else if (reduce_mode == cldnn::reduce_mode::max)
+ acc = input_val > acc ? input_val : acc;
+ else if (reduce_mode == cldnn::reduce_mode::min)
+ acc = input_val < acc ? input_val : acc;
+ else if (reduce_mode == cldnn::reduce_mode::prod)
+ acc = acc * input_val;
+ else if (reduce_mode == cldnn::reduce_mode::logical_and)
+ acc = acc && input_val;
+ else if (reduce_mode == cldnn::reduce_mode::logical_or)
+ acc = acc || input_val;
+ else if (reduce_mode == cldnn::reduce_mode::sum_square)
+ acc += input_val * input_val;
+ else if (reduce_mode == cldnn::reduce_mode::l1)
+            acc += std::abs(input_val);
+ else if (reduce_mode == cldnn::reduce_mode::l2)
+ acc += input_val * input_val;
+ else if (reduce_mode == cldnn::reduce_mode::log_sum_exp)
+ acc += exp(input_val);
+
+ return acc;
+ };
+
+ AccT get(AccT& acc, size_t counter, cldnn::reduce_mode reduce_mode) {
+ if (reduce_mode == cldnn::reduce_mode::mean)
+ acc /= counter;
+ else if (reduce_mode == cldnn::reduce_mode::l2)
+ acc = sqrt(acc);
+ else if (reduce_mode == cldnn::reduce_mode::log_sum || reduce_mode == cldnn::reduce_mode::log_sum_exp)
+ acc = log(acc);
+
+ return acc;
+ };
+
+ std::map<char, int, Comparator<char>> create_coords_map(std::vector<char>& coords) {
+ auto coord_cmp = Comparator<char>();
+ coord_cmp.order = coords;
+ std::map<char, int, Comparator<char>> coord_map({}, coord_cmp);
+ int index = 0;
+ for (auto& coord : coord_cmp.order) {
+ coord_map[coord] = index;
+ index++;
+ }
+
+ return coord_map;
+ }
+
+ void remap(std::vector<size_t> &out_dims,
+ std::map<char, int, Comparator<char>>& remap_coords,
+ std::vector<uint16_t>& axis_to_remove,
+ std::map<uint16_t, int, Comparator<uint16_t>>& axes_map,
+ std::vector<char>& coords,
+ int dims) {
+
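+        // Drops reduced axes from out_dims (keep_dims == false case) and records the new position of every coordinate in remap_coords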
+ if (dims == 5) {
+ remap_coords.erase('w');
+ } else if (dims == 4) {
+ remap_coords.erase('w');
+ remap_coords.erase('z');
+ }
+
+ // Dimensions reshape
+ std::vector<size_t> updated_dims;
+ std::vector<char> updated_coords;
+
+ for (int index = 0; index < static_cast<int>(out_dims.size()); index++) {
+ if ((dims == 4 && (index == 2 || index == 3)) || (dims == 5 && index == 2))
+ continue;
+
+ auto index_to_remove = std::find(axis_to_remove.begin(), axis_to_remove.end(), axes_map.find(index)->second) !=
+ axis_to_remove.end();
+ if ((out_dims[index] != 1) || (out_dims[index] == 1 && !index_to_remove)) {
+ updated_dims.push_back(out_dims[index]);
+ updated_coords.push_back(coords[index]);
+ }
+ }
+
+ if (updated_dims.size() > 2) {
+ if (dims == 4) {
+ updated_dims.insert(updated_dims.begin() + 2, 2, 1);
+ } else {
+ updated_dims.insert(updated_dims.begin() + 2, out_dims.size() - updated_dims.size(), 1);
+ }
+ }
+
+ while (updated_dims.size() < out_dims.size())
+ updated_dims.push_back(1);
+
+ out_dims = std::move(updated_dims);
+
+ // Coordinates remap
+ std::map<uint16_t, int, std::greater<uint16_t>> ordered_axes;
+
+ for (auto& axis : axis_to_remove)
+ ordered_axes[axes_map.find(axis)->second] = axis;
+
+ int i = 0;
+ for (auto& coord : coords) {
+ if ((dims == 4 && (coord == 'w' || coord == 'z')) || (dims == 5 && coord == 'w'))
+ continue;
+
+ if (ordered_axes.find(remap_coords[coord]) != ordered_axes.end()) {
+ if (dims != 4)
+ updated_coords.insert(updated_coords.begin() + 2 + i, 1, coord);
+ else
+ updated_coords.push_back(coord);
+ ++i;
+ }
+ }
+
+ int j = 0;
+ for (auto& coord : coords) {
+ if ((dims == 4 && (coord == 'w' || coord == 'z')) || (dims == 5 && coord == 'w'))
+ continue;
+
+ auto temp_coords = updated_coords.at(j);
+ remap_coords[coord] = static_cast<int>(std::distance(coords.begin(), std::find(coords.begin(), coords.end(), temp_coords)));
+ ++j;
+ }
+
+ if (dims == 4) {
+ remap_coords['w'] = 2;
+ remap_coords['z'] = 3;
+ } else if (dims == 5) {
+ remap_coords['w'] = 2;
+ }
+
+ }
+};
+
+template <typename InputT, typename AccT = typename accumulator_type<InputT>::type, typename OutputT = typename output_type<InputT>::type>
+VVVVVVF<OutputT> reference_reduce(VVVVVVF<InputT>& input,
+ reduce_mode reduce_mode,
+ std::vector<uint16_t> reduce_axis,
+ const int batch,
+ const int input_f,
+ const int input_w,
+ const int input_z,
+ const int input_y,
+ const int input_x,
+ const int dims,
+ bool keepDims = false) {
+
+ auto reduce = reduce_accumulator<InputT, AccT>();
+
+ auto axis_cmp = Comparator<uint16_t>();
+ axis_cmp.order = {reduce::along_b, reduce::along_f, reduce::along_w, reduce::along_z, reduce::along_y, reduce::along_x};
+ std::map<uint16_t, int, Comparator<uint16_t>> axes_map({}, axis_cmp);
+
+ int index = 0;
+ for (auto& axis : axis_cmp.order) {
+ axes_map[axis] = index;
+ index++;
+ }
+
+    // Initial input order is b, f, x, y, z, w
+ std::vector<size_t> input_dims = {
+ input.size(), // b
+ input[0].size(), // f
+ input[0][0][0][0][0].size(), // w
+ input[0][0][0][0].size(), // z
+ input[0][0][0].size(), // y
+ input[0][0].size(), // x
+ };
+
+ VVVVVVF<AccT> previous(input_dims[0],
+ VVVVVF<AccT>(input_dims[1],
+ VVVVF<AccT>(input_dims[2],
+ VVVF<AccT>(input_dims[3],
+ VVF<AccT>(input_dims[4],
+ VF<AccT>(input_dims[5], 0))))));
+
+ for (size_t bi = 0; bi < input_dims[0]; ++bi)
+ for (size_t fi = 0; fi < input_dims[1]; ++fi)
+ for (size_t wi = 0; wi < input_dims[2]; ++wi)
+ for (size_t zi = 0; zi < input_dims[3]; ++zi)
+ for (size_t yi = 0; yi < input_dims[4]; ++yi)
+ for (size_t xi = 0; xi < input_dims[5]; ++xi)
+ previous[bi][fi][wi][zi][yi][xi] = static_cast<AccT>(input[bi][fi][xi][yi][zi][wi]);
+
+ std::vector<size_t> temp_dims = input_dims;
+ size_t max_counter_value = 1;
+
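+    // Reduce one axis at a time; 'previous' holds the partially reduced tensor after each pass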
+ for (auto& axis : reduce_axis) {
+ auto out_dims = temp_dims;
+ out_dims[axes_map.at(axis)] = 1;
+ VVVVVVF<AccT> temp_output(out_dims[0],
+ VVVVVF<AccT>(out_dims[1],
+ VVVVF<AccT>(out_dims[2],
+ VVVF<AccT>(out_dims[3],
+ VVF<AccT>(out_dims[4],
+ VF<AccT>(out_dims[5], reduce.set_accumulator_value(reduce_mode)))))));
+
+ max_counter_value *= input_dims[axes_map.at(axis)];
+
+ for (size_t bi = 0; bi < temp_dims[0]; ++bi)
+ for (size_t fi = 0; fi < temp_dims[1]; ++fi)
+ for (size_t wi = 0; wi < temp_dims[2]; ++wi)
+ for (size_t zi = 0; zi < temp_dims[3]; ++zi)
+ for (size_t yi = 0; yi < temp_dims[4]; ++yi) {
+ for (size_t xi = 0; xi < temp_dims[5]; ++xi) {
+ auto input_val = static_cast<AccT>(previous[bi][fi][wi][zi][yi][xi]);
+
+ AccT acc = static_cast<AccT>(temp_output[bi % out_dims[0]][fi % out_dims[1]]
+ [wi % out_dims[2]][zi % out_dims[3]]
+ [yi % out_dims[4]][xi % out_dims[5]]);
+
+ temp_output[bi % out_dims[0]][fi % out_dims[1]]
+ [wi % out_dims[2]][zi % out_dims[3]]
+ [yi % out_dims[4]][xi % out_dims[5]] = reduce.accumulate(acc, input_val, reduce_mode);
+ }
+ }
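+        // Finalization (mean division, sqrt, log) is applied after the last axis for mean and after every pass for l2/log_sum/log_sum_exp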
+ if (&axis == &reduce_axis.back() || reduce_mode != cldnn::reduce_mode::mean)
+ if (reduce_mode == cldnn::reduce_mode::mean || reduce_mode == cldnn::reduce_mode::l2 ||
+ reduce_mode == cldnn::reduce_mode::log_sum || reduce_mode == cldnn::reduce_mode::log_sum_exp) {
+ for (size_t bi = 0; bi < temp_output.size(); ++bi)
+ for (size_t fi = 0; fi < temp_output[0].size(); ++fi)
+ for (size_t wi = 0; wi < temp_output[0][0].size(); ++wi)
+ for (size_t zi = 0; zi < temp_output[0][0][0].size(); ++zi)
+ for (size_t yi = 0; yi < temp_output[0][0][0][0].size(); ++yi) {
+ for (size_t xi = 0; xi < temp_output[0][0][0][0][0].size(); ++xi) {
+ auto current_acc_val = static_cast<AccT>(temp_output[bi % out_dims[0]][fi % out_dims[1]][wi % out_dims[2]]
+ [zi % out_dims[3]][yi % out_dims[4]][xi % out_dims[5]]);
+ temp_output[bi % out_dims[0]][fi % out_dims[1]][wi % out_dims[2]]
+ [zi % out_dims[3]][yi % out_dims[4]][xi % out_dims[5]] = reduce.get(current_acc_val, max_counter_value, reduce_mode);
+ }
+ }
+ }
+
+ previous = std::move(temp_output);
+ temp_dims = {previous.size(), // b
+ previous[0].size(), // f
+ previous[0][0].size(), // w
+ previous[0][0][0].size(), // z
+ previous[0][0][0][0].size(), // y
+ previous[0][0][0][0][0].size(), // x
+ };
+ }
+
+ VVVVVVF<AccT> output;
+
+ if (keepDims) {
+ output = std::move(previous);
+ } else {
+ std::vector<size_t> actual_dims = temp_dims;
+ std::vector<char> coords = {'b', 'f', 'w', 'z', 'y', 'x'};
+ std::map<char, int, Comparator<char>> remap_coords = reduce.create_coords_map(coords);
+ reduce.remap(actual_dims, remap_coords, reduce_axis, axes_map, coords, dims);
+
+ VVVVVVF<AccT>actual_output(actual_dims[0],
+ VVVVVF<AccT>(actual_dims[1],
+ VVVVF<AccT>(actual_dims[2],
+ VVVF<AccT>(actual_dims[3],
+ VVF<AccT>(actual_dims[4],
+ VF<AccT>(actual_dims[5], 0))))));
+
+ for (size_t bi = 0; bi < previous.size(); ++bi)
+ for (size_t fi = 0; fi < previous[0].size(); ++fi)
+ for (size_t wi = 0; wi < previous[0][0].size(); ++wi)
+ for (size_t zi = 0; zi < previous[0][0][0].size(); ++zi)
+ for (size_t yi = 0; yi < previous[0][0][0][0].size(); ++yi)
+ for (size_t xi = 0; xi < previous[0][0][0][0][0].size(); ++xi) {
+ std::vector<size_t> coords = {bi, fi, wi, zi, yi, xi};
+ actual_output[coords.at(remap_coords['b'])][coords.at(remap_coords['f'])]
+ [coords.at(remap_coords['w'])][coords.at(remap_coords['z'])]
+ [coords.at(remap_coords['y'])][coords.at(remap_coords['x'])] = previous[bi][fi][wi][zi][yi][xi];
+ }
+
+ output = std::move(actual_output);
+ }
+
+ VVVVVVF<OutputT> final_output(output.size(),
+ VVVVVF<OutputT>(output[0].size(),
+ VVVVF<OutputT>(output[0][0].size(),
+ VVVF<OutputT>(output[0][0][0].size(),
+ VVF<OutputT>(output[0][0][0][0].size(),
+ VF<OutputT>(output[0][0][0][0][0].size(), 0))))));
+
+ for (size_t bi = 0; bi < output.size(); ++bi)
+ for (size_t fi = 0; fi < output[0].size(); ++fi)
+ for (size_t wi = 0; wi < output[0][0].size(); ++wi)
+ for (size_t zi = 0; zi < output[0][0][0].size(); ++zi)
+ for (size_t yi = 0; yi < output[0][0][0][0].size(); ++yi)
+ for (size_t xi = 0; xi < output[0][0][0][0][0].size(); ++xi)
+ final_output[bi][fi][wi][zi][yi][xi] = static_cast<OutputT>(output[bi][fi][wi][zi][yi][xi]);
+
+ return final_output;
+}
+
+using TestParamType_general_reduce_gpu = ::testing::tuple<int, int, int, // 0, 1, 2 - b, f, w
+ int, int, int, // 3, 4, 5 - z, y, x
+ format, // 6 - input_dt format
+ reduce_mode, // 7 - reduce mode
+ std::vector<uint16_t>, // 8 - reduce axis
+ std::string, // 9 - kernel name
+ bool, // 10 - keepDims
+ data_types, // 11 - input_dt
+ bool, // 12 - force_output_dt
+ data_types>; // 13 - output_dt
+
+ struct general_reduce_gpu : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
+ static std::string PrintToStringParamName(testing::TestParamInfo<TestParamType_general_reduce_gpu> param_info) {
+ const std::vector<uint16_t> reduce_axes = testing::get<8>(param_info.param);
+ std::string string_axes;
+ for (auto& axis : reduce_axes) string_axes += std::to_string(axis) + "_";
+
+ // Readable name
+ return "in_b_" + std::to_string(testing::get<0>(param_info.param)) +
+ "_f_" + std::to_string(testing::get<1>(param_info.param)) +
+ "_w_" + std::to_string(testing::get<2>(param_info.param)) +
+ "_z_" + std::to_string(testing::get<3>(param_info.param)) +
+ "_y_" + std::to_string(testing::get<4>(param_info.param)) +
+ "_x_" + std::to_string(testing::get<5>(param_info.param)) +
+ "_format_" + std::to_string(testing::get<6>(param_info.param)) +
+ "_reduce_mode_" + std::to_string(static_cast<std::underlying_type<cldnn::reduce_mode>::type>(testing::get<7>(param_info.param))) +
+ "_axes_" + string_axes +
+ "_kernel_name_" + testing::get<9>(param_info.param) +
+ "_keepDims_" + std::to_string(testing::get<10>(param_info.param));
+ }
+};
+
+template <data_types InputT>
+struct input_data_type {
+ using type = float;
+};
+
+template <>
+struct input_data_type <data_types::i8> {
+ using type = int8_t;
+};
+
+template <>
+struct input_data_type <data_types::u8> {
+ using type = uint8_t;
+};
+
+template <data_types OutputT>
+struct output_data_type {
+ using type = float;
+};
+
+template <>
+struct output_data_type<data_types::i8> {
+ using type = int8_t;
+};
+
+template <>
+struct output_data_type<data_types::u8> {
+ using type = uint8_t;
+};
+
+template <data_types InputT, data_types OutputT>
+class ReduceTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
+protected:
+ cldnn::engine engine = get_test_engine();
+ int batch_num, input_f, input_w, input_z, input_y, input_x;
+ cldnn::format input_format = format::any;
+ cldnn::reduce_mode reduce_mode;
+ std::vector<uint16_t> reduce_axis;
+ std::string kernel_name;
+ bool keep_dims;
+ cldnn::data_types input_dt;
+ cldnn::data_types output_dt;
+ bool force_output_dt;
+
+ ReduceTestBase() {
+ this->batch_num = testing::get<0>(GetParam());
+ this->input_f = testing::get<1>(GetParam());
+ this->input_w = testing::get<2>(GetParam());
+ this->input_z = testing::get<3>(GetParam());
+ this->input_y = testing::get<4>(GetParam());
+ this->input_x = testing::get<5>(GetParam());
+ this->input_format = testing::get<6>(GetParam());
+ this->reduce_mode = testing::get<7>(GetParam());
+ this->reduce_axis = testing::get<8>(GetParam());
+ this->kernel_name = testing::get<9>(GetParam());
+ this->keep_dims = testing::get<10>(GetParam());
+ this->input_dt = testing::get<11>(GetParam());
+ this->force_output_dt = testing::get<12>(GetParam());
+ this->output_dt = testing::get<13>(GetParam());
+ }
+
+public:
+ void execute() {
+ int input_dim = static_cast<int>(input_format.dimension());
+ cldnn::format layout_format = input_format;
+
+ if (input_dim == 4)
+ layout_format = format::bfyx;
+ else if (input_dim == 5)
+ layout_format = format::bfzyx;
+ else
+ layout_format = format::bfwzyx;
+
+ using input_t = typename input_data_type<InputT>::type;
+ using output_t = typename output_data_type<OutputT>::type;
+
+ auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z, input_w));
+ auto input_data = generate_random_6d<input_t>(batch_num, input_f, input_x, input_y, input_z, input_w, 1, 10);
+ auto input_lay = layout(input_dt, layout_format, input_size);
+ auto input_mem = memory::allocate(engine, input_lay);
+
+ {
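+            // Copy the generated input into device memory using the target layout's linear offsets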
+ auto input_ptr = input_mem.pointer<input_t>();
+ for (int fi = 0; fi < input_f; fi++)
+ for (int wi = 0; wi < input_w; wi++)
+ for (int zi = 0; zi < input_z; zi++)
+ for (int yi = 0; yi < input_y; yi++)
+ for (int xi = 0; xi < input_x; xi++) {
+ for (int bi = 0; bi < batch_num; bi++) {
+ tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
+ size_t offset = input_lay.get_linear_offset(coords);
+ input_ptr[offset] = input_data[bi][fi][xi][yi][zi][wi];
+ }
+ }
+ }
+
+ auto reference_result = reference_reduce(input_data, reduce_mode, reduce_axis, batch_num,
+ input_f, input_w, input_z, input_y,
+ input_x, input_dim, keep_dims);
+ topology topology;
+ auto red = reduce("reduce", "input", reduce_mode, reduce_axis, keep_dims);
+ if (force_output_dt) {
+ red.output_data_type = output_dt;
+ }
+ topology.add(input_layout("input", input_mem.get_layout()));
+ topology.add(red);
+ build_options options;
+ options.set_option(build_option::optimize_data(true));
+ implementation_desc reduce_impl = {input_format, kernel_name};
+ options.set_option(build_option::force_implementations({{"reduce", reduce_impl}}));
+ network network(engine, topology, options);
+ network.set_input_data("input", input_mem);
+
+ network.execute();
+
+ auto out_mem = network.get_output("reduce").get_memory();
+ auto out_ptr = out_mem.pointer<output_t>();
+ auto out_lay = out_mem.get_layout();
+
+ ASSERT_EQ(out_lay.size.sizes()[0], reference_result.size()); // b
+ ASSERT_EQ(out_lay.size.sizes()[1], reference_result[0].size()); // f
+ ASSERT_EQ(out_lay.size.spatial[3], reference_result[0][0].size()); // w
+ ASSERT_EQ(out_lay.size.spatial[2], reference_result[0][0][0].size()); // z
+ ASSERT_EQ(out_lay.size.spatial[1], reference_result[0][0][0][0].size()); // y
+ ASSERT_EQ(out_lay.size.spatial[0], reference_result[0][0][0][0][0].size()); // x
+
+ for (size_t bi = 0; bi < reference_result.size(); bi++)
+ for (size_t fi = 0; fi < reference_result[0].size(); fi++)
+ for (size_t wi = 0; wi < reference_result[0][0].size(); wi++)
+ for (size_t zi = 0; zi < reference_result[0][0][0].size(); zi++)
+ for (size_t yi = 0; yi < reference_result[0][0][0][0].size(); yi++) {
+ for (size_t xi = 0; xi < reference_result[0][0][0][0][0].size(); xi++) {
+ tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
+ size_t offset = out_lay.get_linear_offset(coords);
+ auto val = out_ptr[offset];
+ auto val_ref = static_cast<output_t>(reference_result[bi][fi][wi][zi][yi][xi]);
+ auto equal = are_equal(val_ref, val, 1e-1f);
+
+ if (!equal)
+                                std::cout << "Reference value at batch: " << bi << " output_f: " << fi
+                                          << " w: " << wi << " z: " << zi << " y: " << yi << " x: " << xi
+                                          << " = " << val_ref << " Val = " << val << std::endl;
+ EXPECT_TRUE(equal);
+ }
+ }
+ }
+};
+
+class general_reduce_gpu_i8_i8 : public ReduceTestBase<data_types::i8, data_types::i8> {};
+TEST_P(general_reduce_gpu_i8_i8, base) { execute(); }
+
+class general_reduce_gpu_i8_f32 : public ReduceTestBase<data_types::i8, data_types::f32> {};
+TEST_P(general_reduce_gpu_i8_f32, base) { execute(); }
+
+class general_reduce_gpu_f32_f32 : public ReduceTestBase<data_types::f32, data_types::f32> {};
+TEST_P(general_reduce_gpu_f32_f32, base) { execute(); }
+
+
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_i8_i8,
+ general_reduce_gpu_i8_i8,
+ ::testing::Values(
+ TestParamType_general_reduce_gpu(2, 12, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
+ TestParamType_general_reduce_gpu(2, 3, 1, 1, 8, 5, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_b, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8),
+ TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 6, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
+ TestParamType_general_reduce_gpu(3, 5, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8),
+ TestParamType_general_reduce_gpu(3, 7, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
+ TestParamType_general_reduce_gpu(1, 3, 1, 1, 6, 12, format::b_fs_yx_fsv16, reduce_mode::logical_and, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, true, data_types::i8)
+ ),
+ general_reduce_gpu::PrintToStringParamName);
+
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_i8_f32,
+ general_reduce_gpu_i8_f32,
+ ::testing::Values(
+ TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 3, 1, 1, 3, 3, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 3, 1, 1, 2, 11, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 12, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 2, 1, 1, 5, 5, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, false, data_types::f32),
+
+ TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 9, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(2, 5, 1, 1, 3, 3, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 15, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 3, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::i8, false, data_types::f32)
+ ),
+ general_reduce_gpu::PrintToStringParamName);
+
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_f32_f32,
+ general_reduce_gpu_f32_f32,
+ ::testing::Values(
+ TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 12, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 2, 1, 1, 5, 5, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, false, data_types::f32),
+
+ TestParamType_general_reduce_gpu(7, 3, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(26, 12, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 12, 1, 1, 13, 11, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 13, 9, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 34, 1, 1, 13, 13, format::b_fs_yx_fsv16, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 26, 1, 1, 16, 8, format::b_fs_yx_fsv16, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 18, 11, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 17, 8, format::b_fs_yx_fsv16, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 16, 15, format::b_fs_yx_fsv16, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 15, 8, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_b}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 14, 11, format::b_fs_yx_fsv16, reduce_mode::l2, {reduce::along_f}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 8, format::b_fs_yx_fsv16, reduce_mode::sum_square, {reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 34, 1, 1, 12, 11, format::b_fs_yx_fsv16, reduce_mode::log_sum, {reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", true, data_types::f32, false, data_types::f32)
+ ),
+ general_reduce_gpu::PrintToStringParamName);
+
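+ // These cases exercise the generic "reduce_ref" kernel on planar layouts (bfyx, bfzyx, bfwzyx),
+ // including 5D and 6D reductions.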
+ INSTANTIATE_TEST_CASE_P(reduce_gpu_ref_f32_f32,
+ general_reduce_gpu_f32_f32,
+ ::testing::Values(
+ TestParamType_general_reduce_gpu(2, 4, 4, 5, 8, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_f, reduce::along_y, reduce::along_w, reduce::along_z}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 16, 6, 3, 8, 15, format::bfwzyx, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 16, 3, 7, 12, 12, format::bfwzyx, reduce_mode::mean, {reduce::along_b, reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 9, 3, 7, 7, 17, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 6, 3, 7, 3, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(2, 3, 4, 5, 6, 7, format::bfwzyx, reduce_mode::mean, {reduce::along_f, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 5, 5, 4, 4, format::bfwzyx, reduce_mode::mean, {reduce::along_z}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_w}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 5, 5, 8, 8, format::bfwzyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 5, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 16, 4, 5, 3, 6, format::bfwzyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 8, 2, 4, 2, 5, format::bfwzyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 2, 1, 7, 8, 3, format::bfzyx, reduce_mode::log_sum_exp, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 11, 1, 7, 2, 2, format::bfzyx, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(8, 4, 1, 7, 2, 2, format::bfzyx, reduce_mode::l2, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(4, 5, 1, 7, 12, 4, format::bfzyx, reduce_mode::l1, {reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 7, 12, 4, format::bfzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 7, 4, 12, format::bfzyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 7, 12, 12, format::bfzyx, reduce_mode::min, {reduce::along_y, reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 11, 1, 1, 7, 17, format::bfyx, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 6, 1, 7, 2, 9, format::bfzyx, reduce_mode::l1, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 7, 1, 1, 7, 17, format::bfyx, reduce_mode::sum, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 9, 1, 1, 7, 17, format::bfyx, reduce_mode::l2, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(5, 5, 1, 1, 17, 17, format::bfyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 12, 1, 1, 7, 17, format::bfyx, reduce_mode::log_sum, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+
+ TestParamType_general_reduce_gpu(7, 3, 6, 6, 12, 12, format::bfwzyx, reduce_mode::log_sum_exp, {reduce::along_b, reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 6, 6, 12, 12, format::bfwzyx, reduce_mode::l1, {reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 6, 6, 4, 7, format::bfwzyx, reduce_mode::l2, {reduce::along_f, reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 16, 6, 6, 7, 12, format::bfwzyx, reduce_mode::l1, {reduce::along_y, reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 6, 6, 7, 2, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(8, 16, 6, 6, 3, 7, format::bfwzyx, reduce_mode::mean, {reduce::along_x, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(2, 16, 6, 6, 12, 3, format::bfwzyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 16, 6, 6, 8, 12, format::bfwzyx, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 11, 1, 6, 7, 17, format::bfzyx, reduce_mode::l1, {reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 7, 1, 6, 7, 3, format::bfzyx, reduce_mode::sum, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 9, 1, 6, 7, 7, format::bfzyx, reduce_mode::l2, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(5, 5, 1, 6, 8, 3, format::bfzyx, reduce_mode::mean, {reduce::along_y}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 8, 1, 6, 7, 3, format::bfzyx, reduce_mode::log_sum, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 12, 4, format::bfyx, reduce_mode::mean, {reduce::along_x, reduce::along_y, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 4, 1, 1, 12, 12, format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(4, 3, 1, 1, 7, 12, format::bfyx, reduce_mode::sum, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(16, 16, 1, 1, 4, 11, format::bfyx, reduce_mode::mean, {reduce::along_y, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(2, 16, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_f, reduce::along_y}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 4, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 32, 1, 1, 3, 6, format::bfyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(17, 4, 1, 1, 12, 15, format::bfyx, reduce_mode::mean, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32)
+ ),
+ general_reduce_gpu::PrintToStringParamName);
+
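+// DISABLED_ prefix: gtest skips this suite by default; it only runs with --gtest_also_run_disabled_tests.
+// It covers additional formats (yxfb, byxf, fs_b_yx_fsv32, bs_fs_*_bsv16_fsv16, ...) without running them regularly.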
+INSTANTIATE_TEST_CASE_P(DISABLED_reduce_gpu_ref_f32_f32,
+ general_reduce_gpu_f32_f32,
+ ::testing::Values(
+ TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_f, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 7, 7, 6, 4, 3, format::bfwzyx, reduce_mode::l1, {reduce::along_b, reduce::along_x, reduce::along_y}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 5, 7, 6, 4, 3, format::bfwzyx, reduce_mode::l2, {reduce::along_x, reduce::along_w}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 2, 1, 2, 4, 3, format::bfzyx, reduce_mode::prod, {reduce::along_z, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(2, 7, 1, 1, 4, 3, format::fyxb, reduce_mode::sum, {reduce::along_b, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::max, {reduce::along_y, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(12, 2, 1, 1, 4, 3, format::yxfb, reduce_mode::min, {reduce::along_y, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::mean, {reduce::along_b, reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 7, 1, 1, 4, 3, format::b_fs_yx_fsv4, reduce_mode::l2, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(9, 7, 1, 1, 4, 3, format::b_fs_yx_fsv16, reduce_mode::prod, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(9, 9, 1, 1, 4, 3, format::b_fs_yx_fsv32, reduce_mode::l1, {reduce::along_b, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(11, 10, 1, 1, 4, 3, format::bs_fs_yx_bsv16_fsv16, reduce_mode::max, {reduce::along_b}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(12, 7, 1, 1, 4, 3, format::fs_b_yx_fsv32, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(13, 7, 1, 1, 4, 3, format::bs_fs_zyx_bsv16_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(14, 7, 1, 1, 4, 3, format::b_fs_zyx_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(18, 7, 1, 1, 4, 3, format::b_fs_zyx_fsv32, reduce_mode::sum, {reduce::along_x}, "reduce_ref", false, data_types::f32, false, data_types::f32),
+
+ TestParamType_general_reduce_gpu(5, 7, 1, 1, 4, 3, format::bfwzyx, reduce_mode::sum, {reduce::along_x, reduce::along_w}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 7, 1, 1, 4, 3, format::bfzyx, reduce_mode::max, {reduce::along_z, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 7, 1, 1, 4, 3, format::fyxb, reduce_mode::min, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(11, 7, 1, 1, 4, 3, format::byxf, reduce_mode::prod, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(12, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::l1, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(13, 7, 1, 1, 4, 3, format::yxfb, reduce_mode::sum, {reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(14, 7, 1, 1, 4, 3, format::bfyx, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(9, 7, 1, 1, 4, 3, format::b_fs_yx_fsv4, reduce_mode::log_sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(8, 7, 1, 1, 4, 3, format::b_fs_yx_fsv16, reduce_mode::log_sum_exp, {reduce::along_x, reduce::along_f, reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(7, 7, 1, 1, 4, 3, format::b_fs_yx_fsv32, reduce_mode::l1, {reduce::along_b, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(6, 7, 1, 1, 4, 3, format::bs_fs_yx_bsv16_fsv16, reduce_mode::max, {reduce::along_b}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 12, 1, 1, 4, 3, format::fs_b_yx_fsv32, reduce_mode::l2, {reduce::along_x, reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(3, 7, 1, 1, 4, 3, format::bs_fs_zyx_bsv16_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 7, 1, 1, 4, 12, format::b_fs_zyx_fsv16, reduce_mode::sum, {reduce::along_f}, "reduce_ref", true, data_types::f32, false, data_types::f32),
+ TestParamType_general_reduce_gpu(1, 7, 1, 1, 8, 3, format::b_fs_zyx_fsv32, reduce_mode::sum, {reduce::along_x}, "reduce_ref", true, data_types::f32, false, data_types::f32)
+ ),
+ general_reduce_gpu::PrintToStringParamName);
+
TEST(reduce_gpu, common_bfyx) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {1, 1, 1, 1}});