ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activation_params& arg) const {
    const auto& out = arg.output;
    DispatchData dispatchData;

    // Choose global/local work sizes based on the output layout.
    if (out.GetLayout() == DataLayout::yxfb) {
        dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
    } else if (out.GetLayout() == DataLayout::b_fs_yx_fsv16) {
        // Feature dimension is blocked by 16 in this layout: align it up and
        // dispatch a fixed 16-wide work-group along that axis.
        dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
        dispatchData.lws = {16, 1, 1};
    } else {
        dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
    }

    dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;

    return dispatchData;
}
JitConstants ActivationKernelBase::GetJitConstants(const activation_params& params, DispatchData) const {
activation_params& newParams = *static_cast<activation_params*>(kd.params.get());
const std::string kernel_id = GetEntryPoint(kernelName, params.layerID, options);
- auto runInfo = SetDefault(newParams);
- auto cldnn_jit = GetJitConstants(newParams, runInfo);
+ auto dispatchData = SetDefault(newParams);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params));
if (!newParams.inputActivationParams.empty()) {
kernel.arguments.push_back({ArgumentDescriptor::Types::SLOPE, 0});
}
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
return {kd};
}
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- virtual JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const activation_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const activation_params& arg) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params& options) const;
};
}
ActivationKernelOpt::Parent::DispatchData ActivationKernelOpt::SetDefault(const activation_params& params) const {
    auto dispatchData = Parent::SetDefault(params);

    // One work item handles NUM_COLS_WI consecutive elements of the flattened
    // input, hence the 1D global size derived from the total element count.
    const auto totalSize = params.inputs[0].LogicalSize();
    dispatchData.gws = { totalSize / NUM_COLS_WI, 1, 1 };
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

    dispatchData.efficiency = FORCE_PRIORITY_6;

    return dispatchData;
}
bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants ActivationKernelOpt::GetJitConstants(const activation_params& params, DispatchData kd) const {
- auto jit = ActivationKernelBase::GetJitConstants(params, kd);
+JitConstants ActivationKernelOpt::GetJitConstants(const activation_params& params, DispatchData dispatchData) const {
+ auto jit = ActivationKernelBase::GetJitConstants(params, dispatchData);
auto input_dt = params.inputs[0].GetDType();
jit.AddConstant(MakeJitConstant("NUM_COLS_WI", NUM_COLS_WI));
static const int NUM_COLS_WI = 4;
DispatchData SetDefault(const activation_params& arg) const override;
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const activation_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {FusedOpType::QUANTIZE,
FusedOpType::SCALE,
return k;
}
-JitConstants ActivationKernelRef::GetJitConstants(const activation_params& params, DispatchData kd) const {
- auto jit = ActivationKernelBase::GetJitConstants(params, kd);
+JitConstants ActivationKernelRef::GetJitConstants(const activation_params& params, DispatchData dispatchData) const {
+ auto jit = ActivationKernelBase::GetJitConstants(params, dispatchData);
auto input_dt = params.inputs[0].GetDType();
if (!params.fused_ops.empty()) {
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
- JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const activation_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {FusedOpType::QUANTIZE,
FusedOpType::SCALE,
}
const arg_max_min_params& orgParams = static_cast<const arg_max_min_params&>(params);
- DispatchData runInfo;
- runInfo.fp16UnitUsed = orgParams.inputs[0].GetDType() == Datatype::F16;
-
size_t sort_size = orgParams.argMaxMinSortType == ArgMaxMinSortType::VALUE ? getSortSize(orgParams) : 1;
- std::vector<size_t> local, global;
- global = { Align(getOperationNumber(orgParams), 32), sort_size, 1 };
- local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
+ DispatchData dispatchData;
+ dispatchData.gws = { Align(getOperationNumber(orgParams), 32), sort_size, 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
KernelData kd = KernelData::Default<arg_max_min_params>(params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
if (orgParams.outputs_num == 2) {
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ArgMaxMinKernelBase::DispatchData ArgMaxMinKernelBase::SetDefault(const arg_max_min_params& params) const {
    DispatchData dispatchData;

    // Fixed 128-wide dispatch in dimension 0 (one full work-group),
    // one slice per input batch in dimension 1.
    dispatchData.gws = { 128, params.inputs[0].Batch().v, 1 };
    dispatchData.lws = { 128, 1, 1 };

    return dispatchData;
}
KernelsData ArgMaxMinKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const {
const arg_max_min_params& orgParams = static_cast<const arg_max_min_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<arg_max_min_params>(params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = estimatedTime;
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
const arg_max_min_params& orgParams = static_cast<const arg_max_min_params&>(params);
- int topK = orgParams.topK;
- long size = (long)(orgParams.inputs[0].X().v * orgParams.inputs[0].Y().v * orgParams.inputs[0].Feature().v) / 8;
- long outSize = size / 16 * topK;
+ size_t topK = orgParams.topK;
+ size_t size = (size_t)(orgParams.inputs[0].X().v * orgParams.inputs[0].Y().v * orgParams.inputs[0].Feature().v) / 8;
+ size_t outSize = size / 16 * topK;
int kernelAmount = 1;
- for (; outSize > 128; outSize = (long)((outSize / 128 + 1) * topK)) {
+ for (; outSize > 128; outSize = (size_t)((outSize / 128 + 1) * topK)) {
kernelAmount++;
}
KernelData kd = KernelData::Default<arg_max_min_params>(params, kernelAmount);
newParams.inputs[0] = input;
auto& kernel = kd.kernels[i];
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto cldnnJit = GetJitConstants(newParams);
auto entryPoint = GetEntryPoint(kernelName, newParams.layerID, options);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
- runInfo.fp16UnitUsed = orgParams.inputs[0].GetDType() == Datatype::F16;
+ dispatchData.gws = { Align(size, 16), orgParams.inputs[0].Batch().v, 1 };
+ dispatchData.lws = { 16, 1, 1 };
- runInfo.gws0 = Align(size, 16);
- runInfo.gws1 = orgParams.inputs[0].Batch().v; // B
- runInfo.gws2 = 1;
-
- runInfo.lws0 = 16;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
-
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entryPoint);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entryPoint);
size = (size / 128 + 1) * topK;
}
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
const average_unpooling_params& params) const {
const auto& input = params.inputs[0];
- DispatchData kd;
+ DispatchData dispatchData;
if (input.GetLayout() == DataLayout::bfyx || input.GetLayout() == DataLayout::byxf) {
// Determine global work sizes.
- kd.gws2 = input.Batch().v * input.Feature().v; // B, F
- kd.gws0 = Align(input.X().v, 32); // X
- kd.gws1 = input.Y().v; // Y
+ dispatchData.gws = { Align(input.X().v, 32), // X
+ input.Y().v, // Y
+ input.Batch().v * input.Feature().v, // B, F
+ };
- kd.lws0 = 32;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws = { 32, 1, 1 };
} else {
// Determine global work sizes.
- kd.gws0 = input.Batch().v * input.Feature().v; // B, F
- kd.gws1 = input.X().v; // X
- kd.gws2 = input.Y().v; // Y
-
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.gws = { input.Batch().v * input.Feature().v, // B, F
+ input.X().v, // X
+ input.Y().v }; // Y
+
+ dispatchData.lws = {1, 1, 1};
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
}
- return kd;
+ return dispatchData;
}
KernelsData AverageUnpoolingKernelBase::GetCommonKernelsData(const Params& params,
const average_unpooling_params& orgParams = static_cast<const average_unpooling_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<average_unpooling_params>(params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = estimatedTime;
return {kd};
}
} // namespace kernel_selector
CommonDispatchData BatchToSpaceKernelBase::SetDefault(const batch_to_space_params& params, const optional_params&) const {
    const auto& out = params.output;
    CommonDispatchData dispatchData;

    if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 && out.Feature().v % 16 == 0) {
        // Feature-blocked layout with a whole number of 16-wide blocks:
        // use a fixed 16-wide work-group on the feature axis.
        dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
        dispatchData.lws = { 1, 16, 1 };
    } else {
        // Generic path: collapse all spatial dimensions into gws[2] and let
        // the helper pick local sizes.
        dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
    }

    return dispatchData;
}
JitConstants BatchToSpaceKernelBase::GetJitConstants(const batch_to_space_params& params) const {
return {};
}
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
"", false, false, 1, GetFusedPrimitiveInputsCount(params));
kd.estimatedTime = estimatedTime;
return k;
}
-BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernel1x1::SetDefault(
- const binary_convolution_params& params,
- int) const {
- DispatchData kd = BinaryConvolutionKernelBase::SetDefault(params);
+BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernel1x1::SetDefault(const binary_convolution_params& params, int) const {
+ DispatchData dispatchData = BinaryConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(x * y, sub_group_size);
- kd.gws1 = CeilDiv(f, 2 * sub_group_size); // 1 WI calcs 32 OC
- kd.gws2 = b;
+ dispatchData.gws[0] = Align(x * y, sub_group_size);
+ dispatchData.gws[1] = CeilDiv(f, 2 * sub_group_size); // 1 WI calcs 32 OC
+ dispatchData.gws[2] = b;
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return kd;
+ return dispatchData;
}
bool BinaryConvolutionKernel1x1::Validate(const Params& p, const optional_params& o) const {
}
JitConstants BinaryConvolutionKernel1x1::GetJitConstants(const binary_convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_NUM_PACKED", CeilDiv(params.inputs[0].Feature().v, ic_pack_size)));
return WeightsLayout::os_is_yx_osv32_isv32p;
}
JitConstants GetFusedPrimitivesJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const override;
+ const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const binary_convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernel1x1_b_fs_yx_fsv16::SetDefault(
const binary_convolution_params& params,
int) const {
- DispatchData kd = BinaryConvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = BinaryConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(x * y, sub_group_size);
- kd.gws1 = CeilDiv(f, sub_group_size); // 1 WI calcs 16 OC
- kd.gws2 = b;
+ dispatchData.gws[0] = Align(x * y, sub_group_size);
+ dispatchData.gws[1] = CeilDiv(f, sub_group_size); // 1 WI calcs 16 OC
+ dispatchData.gws[2] = b;
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws = { sub_group_size, 1, 1 };
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return kd;
+ return dispatchData;
}
bool BinaryConvolutionKernel1x1_b_fs_yx_fsv16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants BinaryConvolutionKernel1x1_b_fs_yx_fsv16::GetJitConstants(const binary_convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_NUM_PACKED", CeilDiv(params.inputs[0].Feature().v, ic_pack_size)));
return WeightsLayout::os_is_yx_osv32_isv32p;
}
JitConstants GetFusedPrimitivesJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const override;
+ const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const binary_convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
}
JitConstants BinaryConvolutionKernelBase::GetJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const {
+ const DispatchData& dispatchData) const {
JitConstants jit = WeightBiasKernelBase::GetJitConstants(params);
- jit.Merge(GetFusedPrimitivesJitConstants(params, kd));
+ jit.Merge(GetFusedPrimitivesJitConstants(params, dispatchData));
jit.AddConstants({
MakeJitConstant("STRIDE", params.stride),
return {};
}
-bool BinaryConvolutionKernelBase::CheckWorkGroups(const BinaryConvolutionKernelBase::DispatchData& kd) {
- if (kd.gws0 == 0 || kd.gws1 == 0 || kd.gws2 == 0 || kd.lws0 == 0 || kd.lws1 == 0 || kd.lws2 == 0) {
+bool BinaryConvolutionKernelBase::CheckWorkGroups(const BinaryConvolutionKernelBase::DispatchData& dispatchData) {
+ if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
return false;
- }
- if ((kd.gws0 % kd.lws0) != 0 || (kd.gws1 % kd.lws1) != 0 || (kd.gws2 % kd.lws2) != 0) {
- return false;
+ for (size_t i = 0; i < dispatchData.gws.size(); i++) {
+ if (dispatchData.gws[i] == 0 || dispatchData.lws[i] == 0)
+ return false;
+ if ((dispatchData.gws[i] % dispatchData.lws[i]) != 0)
+ return false;
}
return true;
}
-BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelBase::SetDefault(
- const binary_convolution_params& params,
- int) const {
- DispatchData kd;
+BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelBase::SetDefault(const binary_convolution_params& params,
+ int) const {
+ DispatchData dispatchData;
const auto& out = params.output;
- kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
std::vector<size_t> global;
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- kd.cldnnStyle.blockWidth = 1;
- kd.cldnnStyle.blockHeight = 1;
- kd.cldnnStyle.prefetch = 0;
- kd.cldnnStyle.inputBlockArraySize = 0;
- kd.cldnnStyle.inputBlockWidth = 0;
-
- kd.gemmStyle.globalWorkSizeDX = 1;
- kd.gemmStyle.globalWorkSizeDY = 1;
- kd.gemmStyle.globalWorkSizeDZ = 1;
- kd.gemmStyle.subBlockDimK = 1;
- kd.gemmStyle.subBlockDimM = 0;
- kd.gemmStyle.subBlockDimN = 0;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+ dispatchData.gws = global;
+ dispatchData.lws = local;
+
+ dispatchData.cldnnStyle.blockWidth = 1;
+ dispatchData.cldnnStyle.blockHeight = 1;
+ dispatchData.cldnnStyle.prefetch = 0;
+ dispatchData.cldnnStyle.inputBlockArraySize = 0;
+ dispatchData.cldnnStyle.inputBlockWidth = 0;
+
+ dispatchData.gemmStyle.globalWorkSizeDX = 1;
+ dispatchData.gemmStyle.globalWorkSizeDY = 1;
+ dispatchData.gemmStyle.globalWorkSizeDZ = 1;
+ dispatchData.gemmStyle.subBlockDimK = 1;
+ dispatchData.gemmStyle.subBlockDimM = 0;
+ dispatchData.gemmStyle.subBlockDimN = 0;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ return dispatchData;
}
KernelsData BinaryConvolutionKernelBase::GetCommonKernelsData(const Params& params,
if (NeedPaddedInput()) {
kd.reorderInput = CovolutionBinaryUpdateInputParams(newParams);
}
- DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
+ DispatchData dispatchData = SetDefault(newParams, autoTuneIndex);
- if (!CheckWorkGroups(runInfo)) {
+ if (!CheckWorkGroups(dispatchData)) {
// Internal Error - wrong calculation of global/local work group sizes
return {};
}
}
auto finalKernelName = GetKernelName(newParams);
- auto cldnnJit = GetJitConstants(newParams, runInfo);
+ auto cldnnJit = GetJitConstants(newParams, dispatchData);
auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
}
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
finalKernelName,
jit,
fused_deps_total);
kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0});
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
kd.autoTuneIndex = autoTuneIndex;
return {kd};
virtual std::string GetKernelName(const binary_convolution_params&) const { return kernelName; }
virtual bool NeedPaddedInput() const { return false; }
bool Validate(const Params& p, const optional_params& o) const override;
- virtual JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& dispatchData) const;
virtual JitConstants GetFusedPrimitivesJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const;
+ const DispatchData& dispatchData) const;
virtual DispatchData SetDefault(const binary_convolution_params& params, int autoTuneIndex = -1) const;
static bool CheckWorkGroups(const DispatchData&);
KernelsData GetCommonKernelsData(const Params& params,
return k;
}
-BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelGeneric::SetDefault(
- const binary_convolution_params& params,
- int) const {
- DispatchData kd = BinaryConvolutionKernelBase::SetDefault(params);
+BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelGeneric::SetDefault(const binary_convolution_params& params,
+ int) const {
+ DispatchData dispatchData = BinaryConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(x, sub_group_size) * y;
- kd.gws1 = CeilDiv(f, 2 * sub_group_size); // 1 WI calc 2 OC x 16 X
- kd.gws2 = b;
+ dispatchData.gws[0] = Align(x, sub_group_size) * y;
+ dispatchData.gws[1] = CeilDiv(f, 2 * sub_group_size); // 1 WI calc 2 OC x 16 X
+ dispatchData.gws[2] = b;
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
bool BinaryConvolutionKernelGeneric::Validate(const Params& p, const optional_params& o) const {
}
JitConstants BinaryConvolutionKernelGeneric::GetJitConstants(const binary_convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
auto input = params.inputs[0];
auto output = params.output;
return WeightsLayout::os_is_yx_osv32_isv32p;
}
JitConstants GetFusedPrimitivesJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const override;
+ const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const binary_convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
return k;
}
-BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelRef::SetDefault(
- const binary_convolution_params& params,
- int) const {
- DispatchData kd = BinaryConvolutionKernelBase::SetDefault(params);
+BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelRef::SetDefault(const binary_convolution_params& params,
+ int) const {
+ DispatchData dispatchData = BinaryConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto y = out.Y().v;
auto x = out.X().v;
- kd.gws0 = b;
- kd.gws1 = f;
- kd.gws2 = x * y;
+ dispatchData.gws[0] = b;
+ dispatchData.gws[1] = f;
+ dispatchData.gws[2] = x * y;
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+ return dispatchData;
}
JitConstants BinaryConvolutionKernelRef::GetJitConstants(const binary_convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
int pad_physical_val = params.pad_value == -1.0f ? 0x00000000 : 0xFFFFFFFF;
int leftovers_mask = (0xFFFFFFFF >> (32 - params.inputs[0].Feature().v % 32));
return WeightsLayout::os_is_yx_osv32_isv32p;
}
JitConstants GetFusedPrimitivesJitConstants(const binary_convolution_params& params,
- const DispatchData& kd) const override;
+ const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const binary_convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const binary_convolution_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
BorderKernelBase::DispatchData BorderKernelBase::SetDefault(const border_params& params) const {
    const auto& output = params.output;
    DispatchData dispatchData;

    // Pair up the 6D output dimensions into three dispatch axes:
    // (X*Z, Y*W, Batch*Feature).
    dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

    return dispatchData;
}
KernelsData BorderKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params =
static_cast<const border_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<border_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
- FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
k_data.estimatedTime = estimated_time;
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
BroadcastKernelBase::DispatchData BroadcastKernelBase::SetDefault(const broadcast_params& params) {
    const auto& output = params.output;
    DispatchData dispatchData;

    // Collapse the output dimensions into three dispatch axes:
    // (X, Y*Z, Batch*Feature).
    dispatchData.gws = { output.X().v, output.Y().v * output.Z().v, output.Batch().v * output.Feature().v };
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

    return dispatchData;
}
KernelsData BroadcastKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params =
static_cast<const broadcast_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<broadcast_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
- FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
k_data.estimatedTime = estimated_time;
return {k_data};
}
ConcatenationKernelBase::DispatchData ConcatenationKernel_b_fs_yx_fsv16::SetDefault(const concatenation_params& params) const {
    DispatchData dispatchData = ConcatenationKernelBase::SetDefault(params);
    const auto& input = params.inputs[0];

    auto tileXY = getTileXY(params);
    // With a misaligned concatenation axis each work item covers two feature
    // blocks, otherwise one.
    size_t tileF = params.misalignment == 0 ? 1 : 2;

    dispatchData.gws[0] = CeilDiv(input.X().v * input.Y().v, tileXY);
    dispatchData.gws[1] = Align(input.Feature().v, 16 * tileF) / tileF;
    dispatchData.gws[2] = input.Batch().v;

    // 16-wide work-group along the feature axis to match the fsv16 blocking.
    dispatchData.lws[0] = 1;
    dispatchData.lws[1] = 16;
    dispatchData.lws[2] = 1;

    dispatchData.efficiency = FORCE_PRIORITY_1;

    return dispatchData;
}
JitConstants ConcatenationKernel_b_fs_yx_fsv16::GetJitConstants(const concatenation_params& params) const {
}
ConcatenationKernelBase::DispatchData ConcatenationKernelBase::SetDefault(const concatenation_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& dims = params.inputs[0].GetDims();
auto layout = params.inputs[0].GetLayout();
DataTensor::Channelndex(layout, Tensor::DataChannelName::X) };
// Determine global work sizes.
- kd.gws0 = idx[2] != -1 ? dims[idx[2]].v : 1; // Y
- kd.gws1 = idx[1] != -1 ? dims[idx[1]].v : 1; // F
- kd.gws2 = idx[0] != -1 ? dims[idx[0]].v : 1; // B
+ dispatchData.gws[0] = idx[2] != -1 ? dims[idx[2]].v : 1; // Y
+ dispatchData.gws[1] = idx[1] != -1 ? dims[idx[1]].v : 1; // F
+ dispatchData.gws[2] = idx[0] != -1 ? dims[idx[0]].v : 1; // B
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ return dispatchData;
}
KernelsData ConcatenationKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options) const {
ifm_offset += ifm;
auto& kernel = kd.kernels[i];
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto cldnnJit = GetJitConstants(newParams);
auto entryPoint = GetEntryPoint(kernelName, newParams.layerID, options);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, (uint32_t)i });
kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
kernel.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
lastOffset += (uint32_t)input.GetDims()[concatChannelIndex].v;
- efficiency = std::max(efficiency, runInfo.efficiency);
+ efficiency = std::max(efficiency, dispatchData.efficiency);
}
kd.estimatedTime = efficiency;
return true;
}
-ConcatenationKernelBase::DispatchData ConcatenationKernel_depth_bfyx_no_pitch::SetDefault(
- const concatenation_params& params) const {
- DispatchData runInfo = ConcatenationKernelBase::SetDefault(params);
+ConcatenationKernelBase::DispatchData ConcatenationKernel_depth_bfyx_no_pitch::SetDefault(const concatenation_params& params) const {
+ DispatchData dispatchData = ConcatenationKernelBase::SetDefault(params);
const auto& input = params.inputs[0];
const auto batch = input.Batch().v;
- runInfo.gws0 = batch;
- runInfo.gws1 = Align(std::max((size_t)1, input.LogicalSize() / batch), 16 * 8) / 8;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = batch;
+ dispatchData.gws[1] = Align(std::max((size_t)1, input.LogicalSize() / batch), 16 * 8) / 8;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = 1;
- runInfo.lws1 = 16;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 16;
+ dispatchData.lws[2] = 1;
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- return runInfo;
+ return dispatchData;
}
KernelsData ConcatenationKernel_depth_bfyx_no_pitch::GetKernelsData(const Params& params,
}
ConcatenationKernelBase::DispatchData ConcatenationKernel_fs_b_yx_fsv32::SetDefault(const concatenation_params& params) const {
- DispatchData runInfo = ConcatenationKernelBase::SetDefault(params);
+ DispatchData dispatchData = ConcatenationKernelBase::SetDefault(params);
const auto& input = params.inputs[0];
- runInfo.gws0 = input.X().v;
- runInfo.gws1 = input.Y().v;
- runInfo.gws2 = CeilDiv(input.Feature().v, fsv) * subGroupSize * input.Batch().v;
+ dispatchData.gws[0] = input.X().v;
+ dispatchData.gws[1] = input.Y().v;
+ dispatchData.gws[2] = CeilDiv(input.Feature().v, fsv) * subGroupSize * input.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = subGroupSize;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = subGroupSize;
- runInfo.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return runInfo;
+ return dispatchData;
}
JitConstants ConcatenationKernel_fs_b_yx_fsv32::GetJitConstants(const concatenation_params& params) const {
ifm_offset += ifm;
auto& kernel = kd.kernels[i];
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto cldnnJit = GetJitConstants(newParams);
auto entryPoint = GetEntryPoint(kernelName, newParams.layerID, optParams);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, (uint32_t)i});
kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
kernel.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
lastOffset += (uint32_t)input.GetDims()[concatChannelIndex].v;
- efficiency = std::max(efficiency, runInfo.efficiency);
+ efficiency = std::max(efficiency, dispatchData.efficiency);
}
kd.estimatedTime = efficiency;
}
ConcatenationKernelBase::DispatchData ConcatenationKernel_simple_Ref::SetDefault(const concatenation_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.inputs[0];
- std::vector<size_t> global;
- global = {
- input.X().v * input.Y().v,
- input.Z().v * input.W().v,
- input.Feature().v * input.Batch().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { input.X().v * input.Y().v,
+ input.Z().v * input.W().v,
+ input.Feature().v * input.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0]; // X * Y
- kd.gws1 = global[1]; // Z * W
- kd.gws2 = global[2]; // F * B
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- kd.efficiency = FORCE_PRIORITY_9;
-
- return kd;
+ return dispatchData;
}
KernelsData ConcatenationKernel_simple_Ref::GetKernelsData(const Params& params, const optional_params& optParams) const {
}
ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16::GetAutoTuneOptions(const Params& params,
- int /*autoTuneIndex*/) const {
+ int /*autoTuneIndex*/) const {
const convolution_params& cp = static_cast<const convolution_params&>(params);
auto x = cp.output.X().v;
auto f = cp.output.Feature().v;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(const convolution_params& params,
- int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params);
+ int autoTuneIndex) const {
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
- kd.cldnnStyle.blockWidth = autoTune.blockWidth;
+ dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
auto x = out.X().v;
auto y = out.Y().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
- kd.gws1 = Align(f, sub_group_size);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
+ dispatchData.gws[1] = Align(f, sub_group_size);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
if (b == 1)
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
else
- kd.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
auto input = params.inputs[0];
auto output = params.output;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_vec = { "_VEC",
}
KernelsData ConvolutionKernel_b_fs_yx_fsv16::GetTunedKernelsDataByIndex(const Params& params,
- const optional_params& options,
- const int autoTuneIndex) const {
+ const optional_params& options,
+ const int autoTuneIndex) const {
auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}
}
KernelsData ConvolutionKernel_b_fs_yx_fsv16::GetKernelsDataForAutoTune(const Params& params,
- const optional_params& options) const {
+ const optional_params& options) const {
if (!Validate(params, options)) {
return {};
}
bool NeedPaddedInput() const override { return false; }
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
private:
struct AutoTuneOption {
}
ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
- int /*autoTuneIndex*/) const {
+ int /*autoTuneIndex*/) const {
const convolution_params& cp = static_cast<const convolution_params&>(params);
auto x = cp.output.X().v;
auto f = cp.output.Feature().v;
ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
- kd.cldnnStyle.blockWidth = autoTune.blockWidth;
+ dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
const auto& input = params.inputs[0];
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x * y, autoTune.blockWidth);
- kd.gws1 = Align(f, feature_block_size);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x * y, autoTune.blockWidth);
+ dispatchData.gws[1] = Align(f, feature_block_size);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
auto bBlockSizeX = x % autoTune.blockWidth == 0;
auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;
-
+
if (b == 1) {
if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
} else {
- kd.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
}
} else {
- kd.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
}
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
FusedOpsConfiguration conf_vec = { "_VEC",
}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
struct AutoTuneOption {
size_t blockWidth;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_depthwise::SetDefault(const convolution_params& params,
- int) const {
- DispatchData runInfo = Parent::SetDefault(params);
+ int) const {
+ DispatchData dispatchData = Parent::SetDefault(params);
const auto& out = params.output;
- runInfo.gws0 = CeilDiv(out.X().v, x_block_size) * out.Y().v;
- runInfo.gws1 = Align(out.Feature().v, feature_block_size);
- runInfo.gws2 = out.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = sub_group_size;
- runInfo.lws2 = 1;
+ dispatchData.gws[0] = CeilDiv(out.X().v, x_block_size) * out.Y().v;
+ dispatchData.gws[1] = Align(out.Feature().v, feature_block_size);
+ dispatchData.gws[2] = out.Batch().v;
+
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
if (out.Batch().v == 1)
- runInfo.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
else
- runInfo.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_b_fs_yx_fsv16_depthwise::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
const size_t block_width = 8;
}
KernelsData ConvolutionKernel_b_fs_yx_fsv16_depthwise::GetKernelsData(const Params& params,
- const optional_params& options) const {
+ const optional_params& options) const {
return GetCommonKernelsData(params, options);
}
}
bool NeedPaddedInput() const override { return true; }
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
};
} // namespace kernel_selector
}
JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
- mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_SPATIAL", kd.cldnnStyle.blockWidth));
- mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_FEATURES", kd.cldnnStyle.blockHeight));
- mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", kd.cldnnStyle.prefetch));
+ const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
+ mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_SPATIAL", dispatchData.cldnnStyle.blockWidth));
+ mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_FEATURES", dispatchData.cldnnStyle.blockHeight));
+ mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", dispatchData.cldnnStyle.prefetch));
mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
mem_consts.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::SetDefault(const convolution_params& params,
int index) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
auto tune_params = GetAutoTuneParams(params, index);
size_t k_slices = tune_params.feature_slm_split;
- kd.gws0 = CeilDiv(output.X().v * output.Y().v, tune_params.out_block_spatial);
- kd.gws1 = CeilDiv(output.Feature().v, tune_params.out_block_features * simd) * simd * k_slices;
- kd.gws2 = output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(output.X().v * output.Y().v, tune_params.out_block_spatial);
+ dispatchData.gws[1] = CeilDiv(output.Feature().v, tune_params.out_block_features * simd) * simd * k_slices;
+ dispatchData.gws[2] = output.Batch().v;
- kd.lws0 = 1;
- kd.lws1 = simd * k_slices;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = simd * k_slices;
+ dispatchData.lws[2] = 1;
- kd.cldnnStyle = {0, 0, 0, 0, 0};
- kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+ dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
+ dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};
- kd.cldnnStyle.blockWidth = tune_params.out_block_spatial;
- kd.cldnnStyle.blockHeight = tune_params.out_block_features;
- kd.cldnnStyle.prefetch = k_slices;
+ dispatchData.cldnnStyle.blockWidth = tune_params.out_block_spatial;
+ dispatchData.cldnnStyle.blockHeight = tune_params.out_block_features;
+ dispatchData.cldnnStyle.prefetch = k_slices;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
auto in_f = params.weights.IFM().v;
auto out_f = params.weights.OFM().v;
general_is_faster |= in_f == 256 && out_f == 128 && out_x == 3 && out_y == 3 && batch == 1;
if (general_is_faster && !x_strided) {
- kd.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
}
// Better to use kernel with 4 input features in a loop
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
- kd.efficiency = FORCE_PRIORITY_4;
+ dispatchData.efficiency = FORCE_PRIORITY_4;
- return kd;
+ return dispatchData;
} // SetDefault
bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Validate(const Params& params, const optional_params& options) const {
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv4_int8::SetDefault(const convolution_params& cp, int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
if (cp.output.X().v > 512 && cp.filterSize.x == 5 && cp.filterSize.y == 5)
- runInfo.efficiency = FORCE_PRIORITY_2;
- runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 2;
- runInfo.gws1 = cp.output.Y().v;
- runInfo.gws2 = sub_group_size;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, sub_group_size) / 2;
+ dispatchData.gws[1] = cp.output.Y().v;
+ dispatchData.gws[2] = sub_group_size;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_b_fs_yx_fsv4_int8::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants ConvolutionKernel_b_fs_yx_fsv4_int8::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+JitConstants ConvolutionKernel_b_fs_yx_fsv4_int8::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
return WeightsLayout::os_is_yx_osv16_isv4;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::DispatchData
ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::SetDefault(const convolution_params& params, int autoTuneIndex) const {
- DispatchData kd;
+ DispatchData dispatchData;
auto& out = params.output;
auto tune_params = GetAutoTuneParams(params, autoTuneIndex);
fsv = 32;
}
- std::vector<size_t> global = {
+ dispatchData.gws = {
Align(CeilDiv(out.X().v, tune_params.tile_x), tune_params.lws0),
- Align(out.Y().v, tune_params.lws1),
+ Align(out.Y().v, tune_params.lws1),
CeilDiv(out.Feature().v, fsv) * tune_params.simd * out.Batch().v
};
- std::vector<size_t> local = { tune_params.lws0, tune_params.lws1, tune_params.simd };
+ dispatchData.lws = { tune_params.lws0, tune_params.lws1, tune_params.simd };
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.gemmStyle = { 0, 0, 0, 0, 0, 0 };
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.cldnnStyle.blockWidth = tune_params.tile_x;
+ dispatchData.cldnnStyle.prefetch = tune_params.preload_input_slm;
- kd.gemmStyle = { 0, 0, 0, 0, 0, 0 };
+ dispatchData.efficiency = params.stride.x == 1 ? FORCE_PRIORITY_1 : FORCE_PRIORITY_2;
- kd.cldnnStyle.blockWidth = tune_params.tile_x;
- kd.cldnnStyle.prefetch = tune_params.preload_input_slm;
-
- kd.efficiency = params.stride.x == 1 ? FORCE_PRIORITY_1 : FORCE_PRIORITY_2;
-
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::HasPaddedInput(const convolution_params& params) const {
return needs_pad;
}
-JitConstants ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+JitConstants ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
constexpr size_t imad_width = 4;
auto filter_spatial = params.weights.X().v * params.weights.Y().v;
auto filter_blocked = filter_spatial / imad_width * imad_width;
- mem_consts.AddConstant(MakeJitConstant("LWS0", kd.lws0));
- mem_consts.AddConstant(MakeJitConstant("LWS1", kd.lws1));
- mem_consts.AddConstant(MakeJitConstant("SIMD", kd.lws2));
+ mem_consts.AddConstant(MakeJitConstant("LWS0", dispatchData.lws[0]));
+ mem_consts.AddConstant(MakeJitConstant("LWS1", dispatchData.lws[1]));
+ mem_consts.AddConstant(MakeJitConstant("SIMD", dispatchData.lws[2]));
- mem_consts.AddConstant(MakeJitConstant("TILE_X", kd.cldnnStyle.blockWidth));
+ mem_consts.AddConstant(MakeJitConstant("TILE_X", dispatchData.cldnnStyle.blockWidth));
mem_consts.AddConstant(MakeJitConstant("FILTER_BLOCKED", filter_blocked));
- mem_consts.AddConstant(MakeJitConstant("PRELOAD_INPUT_TO_SLM", kd.cldnnStyle.prefetch));
+ mem_consts.AddConstant(MakeJitConstant("PRELOAD_INPUT_TO_SLM", dispatchData.cldnnStyle.prefetch));
auto needs_boundary_check = ParamsHavePadding(params) &&
(!HasPaddedInput(params) ||
bool NeedPaddedInput() const override { return false; }
bool HasPaddedInput(const convolution_params& params) const;
bool ParamsHavePadding(const convolution_params& params) const;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
struct AutoTuneParams {
ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_zyx_fsv16::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
const auto& out = params.output;
const auto& input = params.inputs[0];
else
break;
}
- kd.cldnnStyle.blockWidth = ow_block;
+ dispatchData.cldnnStyle.blockWidth = ow_block;
if (out.GetDType() == Datatype::F16) {
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.gws0 = (f / 2);
- kd.gws1 = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
- kd.gws2 = b % 2 == 0 ? b / 2 : b; // unroll mb by 2
+ dispatchData.gws[0] = (f / 2);
+ dispatchData.gws[1] = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
+ dispatchData.gws[2] = b % 2 == 0 ? b / 2 : b; // unroll mb by 2
} else {
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
auto ocb = (f % 32 == 0) ? 32 : 16;
- kd.gws0 = 16;
- kd.gws1 = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
- kd.gws2 = b * f / ocb;
+ dispatchData.gws[0] = 16;
+ dispatchData.gws[1] = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
+ dispatchData.gws[2] = b * f / ocb;
}
} else if (ver_16mb16c) {
f = (g > 1) ? f/g : Align(f, 16);
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.gws0 = f;
- kd.gws1 = x * y * z;
- kd.gws2 = (out.GetDType() == Datatype::F16) ? b / 32 : b / 16;
+ dispatchData.gws[0] = f;
+ dispatchData.gws[1] = x * y * z;
+ dispatchData.gws[2] = (out.GetDType() == Datatype::F16) ? b / 32 : b / 16;
- kd.cldnnStyle.blockWidth = 1;
+ dispatchData.cldnnStyle.blockWidth = 1;
} else {
auto oh_block = 1;
f = Align(f / g, 16);
ocb /= 2;
}
- kd.cldnnStyle.blockWidth = ow_block;
+ dispatchData.cldnnStyle.blockWidth = ow_block;
- kd.gws0 = ocb;
- kd.gws1 = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
- kd.gws2 = b * (f / ocb) * g;
+ dispatchData.gws[0] = ocb;
+ dispatchData.gws[1] = CeilDiv(y, oh_block) * CeilDiv(x, ow_block) * z;
+ dispatchData.gws[2] = b * (f / ocb) * g;
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
if (b == 1)
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
else
- kd.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_b_fs_zyx_fsv16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
auto input = params.inputs[0];
auto output = params.output;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
const bool is_1stconv = input.Feature().v == 3 && input.GetLayout() == DataLayout::bfzyx;
const bool ver_16mb16c = !is_1stconv && ((output.GetDType() == Datatype::F16 && output.Batch().v % 32 == 0) ||
else
jit.AddConstant(MakeJitConstant("CASE_3D", 1));
- jit.AddConstant(MakeJitConstant("LWS_0", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LWS_1", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LWS_2", runInfo.lws2));
+ jit.AddConstant(MakeJitConstant("LWS_0", dispatchData.lws[0]));
+ jit.AddConstant(MakeJitConstant("LWS_1", dispatchData.lws[1]));
+ jit.AddConstant(MakeJitConstant("LWS_2", dispatchData.lws[2]));
if (is_1stconv) {
if (output.GetDType() == Datatype::F16) {
} else if (ver_16mb16c) {
jit.AddConstant(MakeJitConstant("OCB", 1));
} else {
- jit.AddConstant(MakeJitConstant("OCB", runInfo.gws0));
+ jit.AddConstant(MakeJitConstant("OCB", dispatchData.gws[0]));
}
jit.AddConstant(MakeJitConstant("SUM_SCALE", 1));
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (ver_16mb16c) {
jit.AddConstant(MakeJitConstant("MB_BLOCK", 16));
}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
size_t in_block_depth = 1;
bool break_external_loop = false;
-
+
for (size_t d = 1; d < 16; ++d) {
if (params.output.Z().v % d != 0)
continue;
}
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
- size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
+ size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
block.output_block_features * (block.feature_slm_split - 1);
size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
}
JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
auto block_params = GetBlockParams(params);
idx_order[idx_order.size() - 3] = "out_z";
}
}
-
+
if (block_params.output_block_height != 1) {
loop_axes.push_back(Tensor::DataChannelName::Y);
} else {
} // GetJitConstants
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
- int) const {
- DispatchData kd;
+ int) const {
+ DispatchData dispatchData;
const auto& output = params.output;
const auto& weights = params.weights;
auto block_params = GetBlockParams(params);
- kd.gws0 = CeilDiv(output.X().v, block_params.output_block_width);
- kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
- kd.gws2 = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
+ dispatchData.gws[0] = CeilDiv(output.X().v, block_params.output_block_width);
+ dispatchData.gws[1] = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
+ dispatchData.gws[2] = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = simd * block_params.feature_slm_split;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = simd * block_params.feature_slm_split;
- kd.cldnnStyle = {0, 0, 0, 0, 0};
- kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+ dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
+ dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
- kd.efficiency = FORCE_PRIORITY_4;
+ dispatchData.efficiency = FORCE_PRIORITY_4;
- return kd;
+ return dispatchData;
} // SetDefault
bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
size_t output_block_width;
size_t output_block_height;
size_t output_block_depth;
-
+
size_t output_block_features;
size_t input_block_width;
return true;
}
-JitConstants ConvolutionKernelBase::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
+JitConstants ConvolutionKernelBase::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
- mem_consts.Merge(GetFusedPrimitivesJitConstants(params, kd));
+ mem_consts.Merge(GetFusedPrimitivesJitConstants(params, dispatchData));
const auto& padding = params.padding;
const auto& input = params.inputs[0];
std::vector<uint32_t> unrollLoopParams{params.filterSize.x,
params.filterSize.y,
- (uint32_t)kd.gemmStyle.globalWorkSizeDX,
- (uint32_t)kd.gemmStyle.globalWorkSizeDY,
- (uint32_t)kd.gemmStyle.globalWorkSizeDZ,
- (uint32_t)kd.gemmStyle.subBlockDimM,
- (uint32_t)kd.gemmStyle.subBlockDimK,
- (uint32_t)kd.gemmStyle.subBlockDimN};
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDX,
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDY,
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDZ,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimM,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimK,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimN};
auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end());
return mem_consts;
}
-bool ConvolutionKernelBase::CheckWorkGroups(const ConvolutionKernelBase::DispatchData& kd) {
- if (kd.gws0 == 0 || kd.gws1 == 0 || kd.gws2 == 0 || kd.lws0 == 0 || kd.lws1 == 0 || kd.lws2 == 0) {
+bool ConvolutionKernelBase::CheckWorkGroups(const ConvolutionKernelBase::DispatchData& dispatchData) {
+ if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
return false;
- }
- if ((kd.gws0 % kd.lws0) != 0 || (kd.gws1 % kd.lws1) != 0 || (kd.gws2 % kd.lws2) != 0) {
- return false;
+ for (size_t i = 0; i < dispatchData.gws.size(); i++) {
+ if (dispatchData.gws[i] == 0 || dispatchData.lws[i] == 0)
+ return false;
+ if ((dispatchData.gws[i] % dispatchData.lws[i]) != 0)
+ return false;
}
return true;
}
ConvolutionKernelBase::DispatchData ConvolutionKernelBase::SetDefault(const convolution_params& params, int) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& out = params.output;
- kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
- std::vector<size_t> global;
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
- global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
+ dispatchData.gws = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
} else if (params.output.GetLayout() == DataLayout::bfzyx) {
- global = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+ dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
} else {
- global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
+ dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
}
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- kd.cldnnStyle.blockWidth = 1;
- kd.cldnnStyle.blockHeight = 1;
- kd.cldnnStyle.prefetch = 0;
- kd.cldnnStyle.inputBlockArraySize = 0;
- kd.cldnnStyle.inputBlockWidth = 0;
-
- kd.gemmStyle.globalWorkSizeDX = 1;
- kd.gemmStyle.globalWorkSizeDY = 1;
- kd.gemmStyle.globalWorkSizeDZ = 1;
- kd.gemmStyle.subBlockDimK = 1;
- kd.gemmStyle.subBlockDimM = 0;
- kd.gemmStyle.subBlockDimN = 0;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+
+ dispatchData.cldnnStyle.blockWidth = 1;
+ dispatchData.cldnnStyle.blockHeight = 1;
+ dispatchData.cldnnStyle.prefetch = 0;
+ dispatchData.cldnnStyle.inputBlockArraySize = 0;
+ dispatchData.cldnnStyle.inputBlockWidth = 0;
+
+ dispatchData.gemmStyle.globalWorkSizeDX = 1;
+ dispatchData.gemmStyle.globalWorkSizeDY = 1;
+ dispatchData.gemmStyle.globalWorkSizeDZ = 1;
+ dispatchData.gemmStyle.subBlockDimK = 1;
+ dispatchData.gemmStyle.subBlockDimM = 0;
+ dispatchData.gemmStyle.subBlockDimN = 0;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ return dispatchData;
}
KernelsData ConvolutionKernelBase::GetCommonKernelsData(const Params& params,
if (kd.reorderInput && !options.allowInputReordering)
return {};
}
- DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
+ DispatchData dispatchData = SetDefault(newParams, autoTuneIndex);
- if (!CheckWorkGroups(runInfo)) {
+ if (!CheckWorkGroups(dispatchData)) {
// Internal Error - wrong calculation of global/local work group sizes
return {};
}
auto finalKernelName = GetKernelName(newParams);
- auto cldnnJit = GetJitConstants(newParams, runInfo);
+ auto cldnnJit = GetJitConstants(newParams, dispatchData);
auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
finalKernelName,
jit,
}
kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0});
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
kd.autoTuneIndex = autoTuneIndex;
return {kd};
virtual std::string GetKernelName(const convolution_params&) const { return kernelName; }
virtual bool NeedPaddedInput() const { return false; }
bool Validate(const Params& p, const optional_params& o) const override;
- virtual JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const;
- virtual JitConstants GetFusedPrimitivesJitConstants(const convolution_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const;
+ virtual JitConstants GetFusedPrimitivesJitConstants(const convolution_params& params, const DispatchData& dispatchData) const;
virtual DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const;
static bool CheckWorkGroups(const DispatchData&);
static bool CheckPitchForSplitOnly(const convolution_params& params);
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_1x1::SetDefault(const convolution_params& params, int) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(x * y, 16) / 16;
- kd.gws1 = Align(f, 16);
- kd.gws2 = b;
+ dispatchData.gws[0] = Align(x * y, 16) / 16;
+ dispatchData.gws[1] = Align(f, 16);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = 16;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 16;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_1x1::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants ConvolutionKernel_bfyx_1x1::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+JitConstants ConvolutionKernel_bfyx_1x1::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
if (params.output.Feature().v % 16)
jit.AddConstant(MakeJitConstant("LEFTOVERS", 1));
}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_1x1_gemm_buf::SetDefault(const convolution_params& params, int) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(f, 16);
- kd.gws1 = CeilDiv(x * y, 16);
- kd.gws2 = b;
+ dispatchData.gws[0] = Align(f, 16);
+ dispatchData.gws[1] = CeilDiv(x * y, 16);
+ dispatchData.gws[2] = b;
- kd.lws0 = 16;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_1x1_gemm_buf::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants ConvolutionKernel_bfyx_1x1_gemm_buf::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+JitConstants ConvolutionKernel_bfyx_1x1_gemm_buf::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
const auto& out = params.output;
const auto& input = params.inputs[0];
}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
ConvolutionKernelBase::DispatchData convolution_kernel_bfyx_1x1_opt::SetDefault(const convolution_params& cp,
int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
constexpr size_t sub_group_size = 8;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
auto block = get_out_block_size(cp);
- runInfo.gws0 = cp.output.X().v / block.out_width;
- runInfo.gws1 = cp.output.Y().v / block.out_height;
- runInfo.gws2 =
- 2 * (cp.output.Feature().v * cp.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem
+ dispatchData.gws[0] = cp.output.X().v / block.out_width;
+ dispatchData.gws[1] = cp.output.Y().v / block.out_height;
+ // process 8 output channels per Workitem
+ dispatchData.gws[2] = 2 * (cp.output.Feature().v * cp.output.Batch().v) / block.out_depth;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 2 * sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 2 * sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const {
}
JitConstants convolution_kernel_bfyx_1x1_opt::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
auto block = get_out_block_size(params);
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
protected:
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
return true;
}
-ConvolutionKernel_bfyx_3x3_dw_opt::AutoTuneOption ConvolutionKernel_bfyx_3x3_dw_opt::GetAutoTuneOptions(
- const Params&,
- int autoTuneIndex) const {
+ConvolutionKernel_bfyx_3x3_dw_opt::AutoTuneOption ConvolutionKernel_bfyx_3x3_dw_opt::GetAutoTuneOptions(const Params&,
+ int autoTuneIndex) const {
if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
return autoTuneOptions[autoTuneIndex];
}
int autoTuneIndex) const {
constexpr int simdSize = 16;
- DispatchData runInfo = Parent::SetDefault(params);
+ DispatchData dispatchData = Parent::SetDefault(params);
auto options = GetAutoTuneOptions(params, autoTuneIndex);
const int numTilesY = static_cast<int>(
std::ceil(static_cast<float>(params.inputs[0].Y().v) / static_cast<float>(options.tileDims.y)));
- runInfo.cldnnStyle.blockWidth = options.tileDims.x;
- runInfo.cldnnStyle.blockHeight = options.tileDims.y;
- runInfo.gws0 = numTilesX * simdSize;
- runInfo.gws1 = numTilesY;
- runInfo.gws2 = params.inputs[0].Feature().v * params.inputs[0].Batch().v;
- runInfo.lws0 = simdSize;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.cldnnStyle.blockWidth = options.tileDims.x;
+ dispatchData.cldnnStyle.blockHeight = options.tileDims.y;
+ dispatchData.gws[0] = numTilesX * simdSize;
+ dispatchData.gws[1] = numTilesY;
+ dispatchData.gws[2] = params.inputs[0].Feature().v * params.inputs[0].Batch().v;
+ dispatchData.lws[0] = simdSize;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- runInfo.efficiency = FORCE_PRIORITY_5;
+ dispatchData.efficiency = FORCE_PRIORITY_5;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_bfyx_3x3_dw_opt::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- stSize tileDims = {kd.cldnnStyle.blockWidth, kd.cldnnStyle.blockHeight};
- auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ stSize tileDims = {dispatchData.cldnnStyle.blockWidth, dispatchData.cldnnStyle.blockHeight};
+ auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
if (tileDims.y != 0 && tileDims.x != 0) {
- mem_consts.AddConstant(MakeJitConstant("UNIT_BYTE_SIZE", kd.fp16UnitUsed ? sizeof(short) : sizeof(float)));
- mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
+ mem_consts.AddConstant(MakeJitConstant("UNIT_BYTE_SIZE", BytesPerElement(params.output.GetDType())));
+ mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]));
mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
}
KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& convParams = *static_cast<convolution_params*>(kd.params.get());
- DispatchData runInfo = SetDefault(convParams, autoTuneIndex);
+ DispatchData dispatchData = SetDefault(convParams, autoTuneIndex);
- if (static_cast<int>(static_cast<int>(runInfo.gws0 - 1) / simdSize) * runInfo.cldnnStyle.blockWidth + simdSize >
+ if (static_cast<int>(static_cast<int>(dispatchData.gws[0] - 1) / simdSize) * dispatchData.cldnnStyle.blockWidth + simdSize >
convParams.inputs[0].Y().pitch) {
// Internal Error - requested tile size is not supported for y pitch
return {};
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::oiyx;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
struct AutoTuneOption {
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
std::vector<AutoTuneOption> autoTuneOptions = {};
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
return true;
}
-ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_depthwise_weights_lwg::SetDefault(
- const convolution_params& params,
- int) const {
- DispatchData runInfo = Parent::SetDefault(params);
+ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_depthwise_weights_lwg::SetDefault(const convolution_params& params,
+ int) const {
+ DispatchData dispatchData = Parent::SetDefault(params);
const auto& out = params.output;
- std::vector<size_t> global = {out.X().v * out.Y().v, out.Feature().v, out.Batch().v};
+ dispatchData.gws = { Align(out.X().v * out.Y().v, 16), out.Feature().v, out.Batch().v };
+ dispatchData.lws = { 16, 1, 1 };
- runInfo.gws0 = Align(global[0], 16);
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
- runInfo.lws0 = 16;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- runInfo.efficiency = FORCE_PRIORITY_2;
-
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_bfyx_depthwise_weights_lwg::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
if (params.padding.x != 0 || params.padding.y != 0)
mem_consts.AddConstant(MakeJitConstant("BOUNDARY_CHECK", 1));
const optional_params& options) const {
return GetTunedKernelsDataByIndex(params, options);
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::goiyx;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
JitConstants ConvolutionKernel_bfyx_Direct_10_10_12::GetJitConstants(const convolution_params& cp,
- const DispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(cp, runInfo);
+ const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(cp, dispatchData);
jit.AddConstants({
- MakeJitConstant("ALIGNED_OFM", RoundUp(cp.output.Feature().v / cp.groups, runInfo.gemmStyle.subBlockDimN) * cp.groups),
- MakeJitConstant("ALIGNED_OFM_PER_GROUP", RoundUp(cp.output.Feature().v / cp.groups, runInfo.gemmStyle.subBlockDimN)),
- MakeJitConstant("DX", runInfo.gemmStyle.globalWorkSizeDX),
- MakeJitConstant("DY", runInfo.gemmStyle.globalWorkSizeDY),
+ MakeJitConstant("ALIGNED_OFM", RoundUp(cp.output.Feature().v / cp.groups, dispatchData.gemmStyle.subBlockDimN) * cp.groups),
+ MakeJitConstant("ALIGNED_OFM_PER_GROUP", RoundUp(cp.output.Feature().v / cp.groups, dispatchData.gemmStyle.subBlockDimN)),
+ MakeJitConstant("DX", dispatchData.gemmStyle.globalWorkSizeDX),
+ MakeJitConstant("DY", dispatchData.gemmStyle.globalWorkSizeDY),
MakeJitConstant("KERNEL_SLICE_DIV2", (cp.filterSize.x * cp.filterSize.y) / 2),
- MakeJitConstant("RIGHT_PARTIAL_TILE_K", cp.output.X().v % runInfo.gemmStyle.globalWorkSizeDX),
+ MakeJitConstant("RIGHT_PARTIAL_TILE_K", cp.output.X().v % dispatchData.gemmStyle.globalWorkSizeDX),
MakeJitConstant("INPUT_BUFFER_WIDTH_PADDED", ""), // TODO: enable non padding path again
MakeJitConstant("INPUT_BUFFER_HEIGHT_PADDED", ""),
});
return jit;
}
-ConvolutionKernel_bfyx_Direct_10_10_12::Parent::DispatchData ConvolutionKernel_bfyx_Direct_10_10_12::SetDefault(
- const convolution_params& arg,
- int) const {
- Parent::DispatchData runInfo = Parent::SetDefault(arg);
+ConvolutionKernel_bfyx_Direct_10_10_12::DispatchData ConvolutionKernel_bfyx_Direct_10_10_12::SetDefault(const convolution_params& arg,
+ int) const {
+ DispatchData dispatchData = Parent::SetDefault(arg);
constexpr uint32_t TILE_N = 16;
if (arg.filterSize.x == 5) {
- runInfo.gemmStyle = {1, 1, TILE_N, /*GWS DX*/ 4, /*GWS DY*/ 4, 1};
+ dispatchData.gemmStyle = {1, 1, TILE_N, /*GWS DX*/ 4, /*GWS DY*/ 4, 1};
} else {
- runInfo.gemmStyle = {1, 1, TILE_N, /*GWS DX*/ 4, /*GWS DY*/ 3, 1};
+ dispatchData.gemmStyle = {1, 1, TILE_N, /*GWS DX*/ 4, /*GWS DY*/ 3, 1};
}
- runInfo.gws0 = RoundUp(arg.output.X().v, runInfo.gemmStyle.globalWorkSizeDX) / runInfo.gemmStyle.globalWorkSizeDX;
- runInfo.gws1 = RoundUp(arg.output.Y().v, runInfo.gemmStyle.globalWorkSizeDY) / runInfo.gemmStyle.globalWorkSizeDY;
- runInfo.gws2 = RoundUp(arg.output.Feature().v / arg.groups, TILE_N) * arg.output.Batch().v * arg.groups;
+ dispatchData.gws[0] = RoundUp(arg.output.X().v, dispatchData.gemmStyle.globalWorkSizeDX) / dispatchData.gemmStyle.globalWorkSizeDX;
+ dispatchData.gws[1] = RoundUp(arg.output.Y().v, dispatchData.gemmStyle.globalWorkSizeDY) / dispatchData.gemmStyle.globalWorkSizeDY;
+ dispatchData.gws[2] = RoundUp(arg.output.Feature().v / arg.groups, TILE_N) * arg.output.Batch().v * arg.groups;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = TILE_N;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = TILE_N;
- runInfo.efficiency = FORCE_PRIORITY_4;
+ dispatchData.efficiency = FORCE_PRIORITY_4;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_Direct_10_10_12::Validate(const Params& p, const optional_params& o) const {
return (p.groups > 1) ? WeightsLayout::gi_yxs_os_yxsv2_osv16 : WeightsLayout::i_yxs_os_yxsv2_osv16;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
JitConstants ConvolutionKernel_bfyx_GEMMLike::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstants({
- MakeJitConstant("ALIGNED_OFM_PER_GROUP", RoundUp(params.output.Feature().v / params.groups, runInfo.gemmStyle.subBlockDimN)),
- MakeJitConstant("DX", runInfo.gemmStyle.globalWorkSizeDX),
- MakeJitConstant("DY", runInfo.gemmStyle.globalWorkSizeDY),
+ MakeJitConstant("ALIGNED_OFM_PER_GROUP", RoundUp(params.output.Feature().v / params.groups, dispatchData.gemmStyle.subBlockDimN)),
+ MakeJitConstant("DX", dispatchData.gemmStyle.globalWorkSizeDX),
+ MakeJitConstant("DY", dispatchData.gemmStyle.globalWorkSizeDY),
MakeJitConstant("FILTER_SIZE_X_DIV2", params.filterSize.x / 2),
MakeJitConstant("INPUT_BUFFER_WIDTH_PADDED", ""), // TODO: enable non padding path again
MakeJitConstant("INPUT_BUFFER_HEIGHT_PADDED", ""),
});
- if (CeilDiv(RoundUp(params.output.X().v * params.output.Y().v, runInfo.gemmStyle.subBlockDimM),
- runInfo.gemmStyle.globalWorkSizeDY) %
- runInfo.lws1 !=
+ if (CeilDiv(RoundUp(params.output.X().v * params.output.Y().v, dispatchData.gemmStyle.subBlockDimM),
+ dispatchData.gemmStyle.globalWorkSizeDY) %
+ dispatchData.lws[1] !=
0)
jit.AddConstant(MakeJitConstant("LEFTOVERS", 1));
ConvolutionKernel_bfyx_GEMMLike::Parent::DispatchData ConvolutionKernel_bfyx_GEMMLike::SetDefault(
const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = Parent::SetDefault(arg, autoTuneIndex);
+ DispatchData dispatchData = Parent::SetDefault(arg, autoTuneIndex);
- runInfo.lws0 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[2] = 1;
if (arg.inputs[0].GetDType() == Datatype::F16) {
- runInfo.gemmStyle = {1, arg.filterSize.x, 32, 32, 1, 1};
- runInfo.lws1 = 16;
- runInfo.efficiency = FORCE_PRIORITY_6;
+ dispatchData.gemmStyle = {1, arg.filterSize.x, 32, 32, 1, 1};
+ dispatchData.lws[1] = 16;
+ dispatchData.efficiency = FORCE_PRIORITY_6;
} else {
- runInfo.gemmStyle = {2, arg.filterSize.x, 32, 32, 2, 1};
- runInfo.lws1 = 8;
- runInfo.efficiency = FORCE_PRIORITY_8;
+ dispatchData.gemmStyle = {2, arg.filterSize.x, 32, 32, 2, 1};
+ dispatchData.lws[1] = 8;
+ dispatchData.efficiency = FORCE_PRIORITY_8;
}
- size_t sgemm_m = RoundUp(arg.output.X().v * arg.output.Y().v, runInfo.gemmStyle.subBlockDimM);
- size_t sgemm_n = RoundUp(arg.output.Feature().v / arg.groups, runInfo.gemmStyle.subBlockDimN);
+ size_t sgemm_m = RoundUp(arg.output.X().v * arg.output.Y().v, dispatchData.gemmStyle.subBlockDimM);
+ size_t sgemm_n = RoundUp(arg.output.Feature().v / arg.groups, dispatchData.gemmStyle.subBlockDimN);
- runInfo.gws0 = RoundUp(CeilDiv(sgemm_n, runInfo.gemmStyle.globalWorkSizeDX), runInfo.lws0);
- runInfo.gws1 = RoundUp(CeilDiv(sgemm_m, runInfo.gemmStyle.globalWorkSizeDY), runInfo.lws1);
- runInfo.gws2 = arg.output.Batch().v * arg.groups;
+ dispatchData.gws[0] = RoundUp(CeilDiv(sgemm_n, dispatchData.gemmStyle.globalWorkSizeDX), dispatchData.lws[0]);
+ dispatchData.gws[1] = RoundUp(CeilDiv(sgemm_m, dispatchData.gemmStyle.globalWorkSizeDY), dispatchData.lws[1]);
+ dispatchData.gws[2] = arg.output.Batch().v * arg.groups;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p, const optional_params& o) const {
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
std::string GetKernelName(const convolution_params& params) const override;
bool NeedPaddedInput() const override { return true; }
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_iyxo::SetDefault(const convolution_params& cp, int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4;
- runInfo.gws1 = cp.output.Y().v;
- runInfo.gws2 = sub_group_size;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, sub_group_size) / 4;
+ dispatchData.gws[1] = cp.output.Y().v;
+ dispatchData.gws[2] = sub_group_size;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants ConvolutionKernel_bfyx_iyxo::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+JitConstants ConvolutionKernel_bfyx_iyxo::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
return jit;
}
return WeightsLayout::iyxo;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
option.blockWidth = 4;
option.blockHeight = 3;
option.prefetch = 5;
- // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
}
// if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
const auto of_maps = cp.output.Feature().v;
const auto of_maps_per_group = of_maps / cp.groups;
const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size) * cp.groups;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- auto input_block_dims = get_bfyx_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
- runInfo.cldnnStyle.blockHeight,
+ auto input_block_dims = get_bfyx_req_input_block_dims(dispatchData.cldnnStyle.blockWidth,
+ dispatchData.cldnnStyle.blockHeight,
cp.filterSize,
cp.stride,
cp.dilation,
sub_group_size,
- runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
+ cp.output.GetDType() == Datatype::F16 ? sub_group_size : sub_group_size / 2,
sub_group_size);
- runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
- runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
+ dispatchData.cldnnStyle.inputBlockArraySize = input_block_dims.first;
+ dispatchData.cldnnStyle.inputBlockWidth = input_block_dims.second;
- runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
- runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
- runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth);
+ dispatchData.gws[1] = CeilDiv(cp.output.Y().v, dispatchData.cldnnStyle.blockHeight);
+ dispatchData.gws[2] = of_threads_per_batch * cp.output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
const auto of_maps = params.output.Feature().v;
const auto of_maps_per_group = of_maps / params.groups;
const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size);
size_t leftovers = of_threads_per_batch - of_maps_per_group;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
}
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", dispatchData.cldnnStyle.inputBlockArraySize));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
+ jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch));
if (leftovers) {
jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
option.blockWidth = 4;
option.blockHeight = 3;
option.prefetch = 5;
- // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
}
// if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
const auto of_maps = cp.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- auto input_block_dims = get_bfyx_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
- runInfo.cldnnStyle.blockHeight,
+ auto input_block_dims = get_bfyx_req_input_block_dims(dispatchData.cldnnStyle.blockWidth,
+ dispatchData.cldnnStyle.blockHeight,
cp.filterSize,
cp.stride,
cp.dilation,
sub_group_size,
- runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
+ cp.output.GetDType() == Datatype::F16 ? sub_group_size : sub_group_size / 2,
sub_group_size);
- runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
- runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
+ dispatchData.cldnnStyle.inputBlockArraySize = input_block_dims.first;
+ dispatchData.cldnnStyle.inputBlockWidth = input_block_dims.second;
- runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
- runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
- runInfo.gws2 = 2 * of_threads_per_batch * cp.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth);
+ dispatchData.gws[1] = CeilDiv(cp.output.Y().v, dispatchData.cldnnStyle.blockHeight);
+ dispatchData.gws[2] = 2 * of_threads_per_batch * cp.output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 2 * sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 2 * sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
const auto of_maps = params.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
size_t leftovers = of_threads_per_batch - of_maps;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", dispatchData.cldnnStyle.inputBlockArraySize));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
+ jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch));
if (leftovers) {
jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
protected:
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
std::vector<AutoTuneOption> autoTuneOptions = {};
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_to_bfyx_f16::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
- kd.cldnnStyle.blockWidth = autoTune.blockWidth;
+ dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
auto x = out.X().v;
auto y = out.Y().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x, autoTune.blockWidth) * y;
- kd.gws1 = Align(f, sub_group_size);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
+ dispatchData.gws[1] = Align(f, sub_group_size);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
if (b == 1)
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
else
- kd.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_to_bfyx_f16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
auto input = params.inputs[0];
auto output = params.output;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
bool NeedPaddedInput() const override { return false; }
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
struct AutoTuneOption {
size_t blockWidth;
std::string exeMode;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_to_bfyx_bsv16_fsv16::SetDefault(const convolution_params& params,
- int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernel_bfyx_to_bfyx_f16::SetDefault(params, autoTuneIndex);
+ int autoTuneIndex) const {
+ DispatchData dispatchData = ConvolutionKernel_bfyx_to_bfyx_f16::SetDefault(params, autoTuneIndex);
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_to_bfyx_bsv16_fsv16::Validate(const Params& p, const optional_params& o) const {
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_to_fs_byx_fsv32::SetDefault(const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- runInfo.cldnnStyle.blockHeight = option.blockHeight;
- runInfo.cldnnStyle.blockWidth = option.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = option.blockHeight;
+ dispatchData.cldnnStyle.blockWidth = option.blockWidth;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
- runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
- runInfo.gws1 = CeilDiv(arg.output.Y().v, option.blockHeight);
- runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(arg.output.X().v, option.blockWidth);
+ dispatchData.gws[1] = CeilDiv(arg.output.Y().v, option.blockHeight);
+ dispatchData.gws[2] = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_bfyx_to_fs_byx_fsv32::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_bfyx_to_fs_byx_fsv32::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", kd.cldnnStyle.blockHeight));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
auto inputBlockWidth =
- getInputSize(params.stride.x, params.filterSize.x, params.dilation.x, kd.cldnnStyle.blockWidth);
+ getInputSize(params.stride.x, params.filterSize.x, params.dilation.x, dispatchData.cldnnStyle.blockWidth);
auto inputBlockHeight =
- getInputSize(params.stride.y, params.filterSize.y, params.dilation.y, kd.cldnnStyle.blockHeight);
+ getInputSize(params.stride.y, params.filterSize.y, params.dilation.y, dispatchData.cldnnStyle.blockHeight);
auto inputBlockWidthRound = RoundUp(inputBlockWidth, subGroupSize);
}
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
ConvolutionKernelBase::DispatchData ConvolutionKernel_fs_byx_fsv32::SetDefault(const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- runInfo.cldnnStyle.blockHeight = 1;
- runInfo.cldnnStyle.blockWidth = option.blockWidth;
- runInfo.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
+ dispatchData.cldnnStyle.blockHeight = 1;
+ dispatchData.cldnnStyle.blockWidth = option.blockWidth;
+ dispatchData.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
- runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
- runInfo.gws1 = arg.output.Y().v;
- runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(arg.output.X().v, option.blockWidth);
+ dispatchData.gws[1] = arg.output.Y().v;
+ dispatchData.gws[2] = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_fs_byx_fsv32::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_fs_byx_fsv32::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
auto accumulator_type = GetAccumulatorType(params);
auto activation_type = GetAccumulatorType(params);
jit.Merge(MakeTypeJitConstants(accumulator_type, "ACCUMULATOR"));
jit.Merge(MakeTypeJitConstants(activation_type, "ACTIVATION"));
- jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", kd.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("FSV", fsv));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
}
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
ConvolutionKernelBase::DispatchData ConvolutionKernel_fs_byx_fsv32_1x1::SetDefault(const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
- runInfo.efficiency = FORCE_PRIORITY_4;
+ dispatchData.efficiency = FORCE_PRIORITY_4;
- runInfo.cldnnStyle.blockHeight = option.blockHeight;
- runInfo.cldnnStyle.blockWidth = option.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = option.blockHeight;
+ dispatchData.cldnnStyle.blockWidth = option.blockWidth;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
- runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
- runInfo.gws1 = CeilDiv(arg.output.Y().v, option.blockHeight);
- runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(arg.output.X().v, option.blockWidth);
+ dispatchData.gws[1] = CeilDiv(arg.output.Y().v, option.blockHeight);
+ dispatchData.gws[2] = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_fs_byx_fsv32_1x1::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_fs_byx_fsv32_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", kd.cldnnStyle.blockHeight));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
jit.AddConstant(MakeJitConstant("FSV", fsv));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_fs_byx_fsv32_depthwise::SetDefault(const convolution_params& arg,
- int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ int autoTuneIndex) const {
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
AutoTuneOption option = GetAutoTuneOptions(arg, autoTuneIndex);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- runInfo.cldnnStyle.blockHeight = 1;
- runInfo.cldnnStyle.blockWidth = option.blockWidth;
- runInfo.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
+ dispatchData.cldnnStyle.blockHeight = 1;
+ dispatchData.cldnnStyle.blockWidth = option.blockWidth;
+ dispatchData.cldnnStyle.inputBlockWidth = getInputWidth(arg, option.blockWidth);
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
- runInfo.gws0 = CeilDiv(arg.output.X().v, option.blockWidth);
- runInfo.gws1 = arg.output.Y().v;
- runInfo.gws2 = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(arg.output.X().v, option.blockWidth);
+ dispatchData.gws[1] = arg.output.Y().v;
+ dispatchData.gws[2] = CeilDiv(arg.output.Feature().v, 32) * 16 * arg.output.Batch().v;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_fs_byx_fsv32_depthwise::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", kd.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("INPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("FSV", fsv));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
}
KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetTunedKernelsDataByIndex(const Params& params,
- const optional_params& options,
- const int autoTuneIndex) const {
+ const optional_params& options,
+ const int autoTuneIndex) const {
auto tuneOptions = GetAutoTuneOptions(params, autoTuneIndex);
return GetCommonKernelsData(params, options, tuneOptions.exeMode, autoTuneIndex);
}
}
KernelsData ConvolutionKernel_fs_byx_fsv32_depthwise::GetKernelsDataForAutoTune(const Params& params,
- const optional_params& options) const {
+ const optional_params& options) const {
if (!Validate(params, options)) {
return {};
}
}
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
return GetCommonKernelsData(params, options);
}
-JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
const auto& input = params.inputs[0];
const auto& output = params.output;
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad::SetDefault(const convolution_params& params,
int) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
const auto& weights = params.weights;
size_t otw, oth;
getOutBlock_WH(output.X().v, params.stride.x, weights.X().v, params.dilation.x, otw, oth);
- std::vector<size_t> global = {// number of tiles needed to cover output width
- CeilDiv(output.X().v, otw),
+ dispatchData.gws = { // number of tiles needed to cover output width
+ CeilDiv(output.X().v, otw),
- // number of tiles needed to cover output height
- CeilDiv(output.Y().v, oth),
+ // number of tiles needed to cover output height
+ CeilDiv(output.Y().v, oth),
- // round depth range up
- Align(weights.OFM().v, SIMD_SIZE) * params.groups * output.Batch().v};
+ // round depth range up
+ Align(weights.OFM().v, SIMD_SIZE) * params.groups * output.Batch().v };
- std::vector<size_t> local = {1, 1, SIMD_SIZE};
+ dispatchData.lws = {1, 1, SIMD_SIZE};
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- kd.cldnnStyle = {0, 0, 0, 0, 0};
- kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+ dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
+ dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};
// This kernel is quite slow for 1x1 and KHx1 kernels
// TODO: check if we need any optimized kernels in this layout
// If yes, we need to implement some customization for these cases.
- kd.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- return kd;
+ return dispatchData;
} // SetDefault
bool ConvolutionKernel_imad::Validate(const Params& params, const optional_params& options) const {
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
}
ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::AutoTuneParams ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::GetAutoTuneParams(const convolution_params& params,
- int index) const {
+ int index) const {
AutoTuneParams tune_params;
bool selected = false;
if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
}
JitConstants ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
- auto simd = kd.lws0;
- auto features_per_wi = kd.cldnnStyle.blockHeight;
- auto lwg_depth = kd.lws2;
- auto force_prefetch = kd.cldnnStyle.prefetch == 1;
+ auto simd = dispatchData.lws[0];
+ auto features_per_wi = dispatchData.cldnnStyle.blockHeight;
+ auto lwg_depth = dispatchData.lws[2];
+ auto force_prefetch = dispatchData.cldnnStyle.prefetch == 1;
mem_consts.AddConstant(MakeJitConstant("SIMD", simd));
mem_consts.AddConstant(MakeJitConstant("FEATURES_PER_WI", features_per_wi));
} // GetJitConstants
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::SetDefault(const convolution_params& params,
- int autoTuneIndex) const {
- DispatchData kd;
+ int autoTuneIndex) const {
+ DispatchData dispatchData;
auto& out = params.output;
auto autoTuneParam = GetAutoTuneParams(params, autoTuneIndex);
auto simd = autoTuneParam.simd;
auto features_per_wi = autoTuneParam.features_per_wi;
- std::vector<size_t> global = { RoundUp(out.X().v * out.Y().v, simd), CeilDiv(out.Feature().v, features_per_wi), out.Batch().v * lwg_depth };
- std::vector<size_t> local = { simd, 1, lwg_depth};
+ dispatchData.gws = { RoundUp(out.X().v * out.Y().v, simd), CeilDiv(out.Feature().v, features_per_wi), out.Batch().v * lwg_depth };
+ dispatchData.lws = { simd, 1, lwg_depth};
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.gemmStyle = { 0, 0, 0, 0, 0, 0 };
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.cldnnStyle.blockHeight = features_per_wi;
+ dispatchData.cldnnStyle.blockWidth = simd;
+ dispatchData.cldnnStyle.prefetch = autoTuneParam.force_prefetch ? 1 : 0;
- kd.gemmStyle = { 0, 0, 0, 0, 0, 0 };
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- kd.cldnnStyle.blockHeight = features_per_wi;
- kd.cldnnStyle.blockWidth = simd;
- kd.cldnnStyle.prefetch = autoTuneParam.force_prefetch ? 1 : 0;
-
- kd.efficiency = FORCE_PRIORITY_1;
-
- return kd;
+ return dispatchData;
} // SetDefault
KernelsData ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::GetTunedKernelsDataByIndex(const Params& params,
- const optional_params& options,
- int autoTuneIndex) const {
+ const optional_params& options,
+ int autoTuneIndex) const {
auto convParams = static_cast<const convolution_params&>(params);
auto tuneParams = GetAutoTuneParams(convParams, autoTuneIndex);
return GetCommonKernelsData(params, options, tuneParams.exeMode, autoTuneIndex);
}
KernelsData ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::GetKernelsDataForAutoTune(const Params& params,
- const optional_params& options) const {
+ const optional_params& options) const {
if (!Validate(params, options)) {
return {};
}
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
}
JitConstants ConvolutionKernel_imad_b_fs_yx_fsv4_dw::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
size_t filter_block_size = 4;
size_t min_blocked_leftovers = 4;
}
mem_consts.AddConstant(MakeJitConstant("FILTER_BLOCKED", filter_blocked));
- auto& work_mode = kd.cldnnStyle.prefetch;
+ auto& work_mode = dispatchData.cldnnStyle.prefetch;
bool tiled = (work_mode & mode::tiled) != 0;
bool preload_input = (work_mode & mode::preload_input) != 0;
bool preload_weights = (work_mode & mode::preload_weights) != 0;
if (tiled) {
preload_weights = true;
- simd = kd.lws0;
- tile_x = kd.cldnnStyle.blockWidth;
- tile_y = kd.cldnnStyle.blockHeight;
+ simd = dispatchData.lws[0];
+ tile_x = dispatchData.cldnnStyle.blockWidth;
+ tile_y = dispatchData.cldnnStyle.blockHeight;
input_line_size = 1;
output_block_x = 1;
} else if (preload_input) {
tile_x = 1;
- tile_y = kd.cldnnStyle.blockHeight;
- output_block_x = kd.cldnnStyle.blockWidth;
+ tile_y = dispatchData.cldnnStyle.blockHeight;
+ output_block_x = dispatchData.cldnnStyle.blockWidth;
input_line_size = (output_block_x - 1) * params.stride.x + (params.weights.X().v - 1) * params.dilation.x + 1;
} else {
tile_x = 1;
tile_y = 1;
input_line_size = 1;
- output_block_x = kd.cldnnStyle.blockWidth;
+ output_block_x = dispatchData.cldnnStyle.blockWidth;
}
mem_consts.AddConstant(MakeJitConstant("TILED", tiled));
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_b_fs_yx_fsv4_dw::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd;
+ DispatchData dispatchData;
auto& out = params.output;
auto autoTuneParam = GetAutoTuneParams(params, autoTuneIndex);
global_x = global_x * autoTuneParam.tiled_simd;
}
- std::vector<size_t> global = { global_x, global_y, CeilDiv(out.Feature().v, fsv) * out.Batch().v };
- std::vector<size_t> local = { 1, 1, 1 };
+ dispatchData.gws = { global_x, global_y, CeilDiv(out.Feature().v, fsv) * out.Batch().v };
+ dispatchData.lws = { 1, 1, 1 };
if (autoTuneParam.tiled) {
- local[0] = autoTuneParam.tiled_simd;
+ dispatchData.lws[0] = autoTuneParam.tiled_simd;
} else {
- local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
}
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.gemmStyle = { 0, 0, 0, 0, 0, 0 };
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.cldnnStyle.blockWidth = autoTuneParam.block_x;
+ dispatchData.cldnnStyle.blockHeight = autoTuneParam.block_y;
+ dispatchData.cldnnStyle.prefetch = (static_cast<size_t>(autoTuneParam.tiled) * mode::tiled)
+ | (static_cast<size_t>(autoTuneParam.preload_input) * mode::preload_input)
+ | (static_cast<size_t>(autoTuneParam.preload_weights) * mode::preload_weights);
- kd.gemmStyle = { 0, 0, 0, 0, 0, 0 };
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- kd.cldnnStyle.blockWidth = autoTuneParam.block_x;
- kd.cldnnStyle.blockHeight = autoTuneParam.block_y;
- kd.cldnnStyle.prefetch = (static_cast<size_t>(autoTuneParam.tiled) * mode::tiled)
- | (static_cast<size_t>(autoTuneParam.preload_input) * mode::preload_input)
- | (static_cast<size_t>(autoTuneParam.preload_weights) * mode::preload_weights);
-
- kd.efficiency = FORCE_PRIORITY_1;
-
- return kd;
+ return dispatchData;
} // SetDefault
KernelsData ConvolutionKernel_imad_b_fs_yx_fsv4_dw::GetTunedKernelsDataByIndex(const Params& params,
}
KernelsData ConvolutionKernel_imad_b_fs_yx_fsv4_dw::GetKernelsDataForAutoTune(const Params& params,
- const optional_params& options) const {
+ const optional_params& options) const {
if (!Validate(params, options)) {
return {};
}
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return false; }
return GetCommonKernelsData(params, options);
}
-JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"",
} // GetJitConstants
ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::SetDefault(const convolution_params& params, int) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
- std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 32 * output.Batch().v};
- std::vector<size_t> local = {1, 1, SIMD_SIZE};
+ dispatchData.gws = { output.X().v, output.Y().v, output.Feature().v / 32 * output.Batch().v };
+ dispatchData.lws = { 1, 1, SIMD_SIZE};
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
+ dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- kd.cldnnStyle = {0, 0, 0, 0, 0};
- kd.gemmStyle = {0, 0, 0, 0, 0, 0};
-
- kd.efficiency = FORCE_PRIORITY_2;
-
- return kd;
+ return dispatchData;
} // SetDefault
bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::Validate(const Params& params, const optional_params& options) const {
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
return GetCommonKernelsData(params, options);
}
-JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
- auto mem_consts = Parent::GetJitConstants(params, kd);
+JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ auto mem_consts = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
} // GetJitConstants
ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::SetDefault(const convolution_params& params, int) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
- std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 16 * output.Batch().v};
- std::vector<size_t> local = {1, 1, SIMD_SIZE};
+ dispatchData.gws = { output.X().v, output.Y().v, output.Feature().v / 16 * output.Batch().v };
+ dispatchData.lws = { 1, 1, SIMD_SIZE };
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
+ dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- kd.cldnnStyle = {0, 0, 0, 0, 0};
- kd.gemmStyle = {0, 0, 0, 0, 0, 0};
-
- kd.efficiency = FORCE_PRIORITY_2;
-
- return kd;
+ return dispatchData;
} // SetDefault
bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::Validate(const Params& params, const optional_params& options) const {
protected:
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32::SetDefault(const convolution_params& cp,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
size_t ow_group = 8;
while (ow_group > 1) {
- if (CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth) % ow_group == 0)
+ if (CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth) % ow_group == 0)
break;
ow_group--;
}
- runInfo.gws0 = Align(cp.output.Feature().v, 32) / 4;
- runInfo.gws1 = Align(CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth), ow_group) * cp.output.Y().v * cp.output.Z().v;
- runInfo.gws2 = cp.output.Batch().v;
+ dispatchData.gws[0] = Align(cp.output.Feature().v, 32) / 4;
+ dispatchData.gws[1] = Align(CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth), ow_group) * cp.output.Y().v * cp.output.Z().v;
+ dispatchData.gws[2] = cp.output.Batch().v;
- runInfo.lws0 = 8;
- runInfo.lws1 = ow_group;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = ow_group;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("OW_GROUP", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws0));
+ jit.AddConstant(MakeJitConstant("OW_GROUP", dispatchData.lws[1]));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]));
jit.AddConstant(MakeJitConstant("OSV_SIZE", 32));
jit.AddConstant(MakeJitConstant("ISV_SIZE", 32));
- jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", runInfo.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", dispatchData.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("IFM_BLOCKS", CeilDiv(params.inputs[0].Feature().v, 32)));
auto input = params.inputs[0];
auto output = params.output;
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
size_t input_line_size = params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1)*params.dilation.x + 1;
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return false; }
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::SetDefault(const convolution_params& cp,
int /*autoTuneIndex*/) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- std::vector<size_t> global = {cp.output.Feature().v, cp.output.X().v * cp.output.Y().v, cp.output.Batch().v};
+ dispatchData.gws = { cp.output.Feature().v, cp.output.X().v * cp.output.Y().v, cp.output.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, cp.engineInfo);
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- auto local = GetOptimalLocalWorkGroupSizes(global, cp.engineInfo);
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
// TODO: optimize this kernel
JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
protected:
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::goiyx;
}
}
ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::AutoTuneOption ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetAutoTuneOptions(const Params &p,
- int autoTuneIndex) const {
+ int autoTuneIndex) const {
if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
return autoTuneOptions[autoTuneIndex];
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::SetDefault(const convolution_params &cp,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
const size_t max_lws = std::max((size_t)1, cp.engineInfo.maxWorkGroupSize / sub_group_size);
- runInfo.gws0 = Align(cp.output.Feature().v, 32) / 2;
- runInfo.gws1 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
- runInfo.gws2 = cp.output.Batch().v * cp.output.Y().v * cp.output.Z().v;
+ dispatchData.gws[0] = Align(cp.output.Feature().v, 32) / 2;
+ dispatchData.gws[1] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth);
+ dispatchData.gws[2] = cp.output.Batch().v * cp.output.Y().v * cp.output.Z().v;
- runInfo.lws0 = sub_group_size;
- runInfo.lws1 = get_lws(cp, runInfo.gws1, tuneOptions.blockWidth, max_lws);
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = get_lws(cp, dispatchData.gws[1], tuneOptions.blockWidth, max_lws);
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetJitConstants(const convolution_params ¶ms,
- const DispatchData &runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData &dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LWS0", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LWS1", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LWS2", runInfo.lws2));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]));
+ jit.AddConstant(MakeJitConstant("LWS0", dispatchData.lws[0]));
+ jit.AddConstant(MakeJitConstant("LWS1", dispatchData.lws[1]));
+ jit.AddConstant(MakeJitConstant("LWS2", dispatchData.lws[2]));
jit.AddConstant(MakeJitConstant("OSV", 32));
- jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", runInfo.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", dispatchData.cldnnStyle.blockWidth));
auto input = params.inputs[0];
auto output = params.output;
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
- size_t slm_line_size = params.stride.x * (runInfo.lws1 * blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1;
- size_t slm_chunk_size = slm_line_size / runInfo.lws1;
- size_t slm_tail = slm_line_size % runInfo.lws1;
- size_t slm_line_aligned = slm_chunk_size*runInfo.lws1 + Align(slm_tail, sub_group_size);
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
+ size_t slm_line_size = params.stride.x * (dispatchData.lws[1] * blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1;
+ size_t slm_chunk_size = slm_line_size / dispatchData.lws[1];
+ size_t slm_tail = slm_line_size % dispatchData.lws[1];
+ size_t slm_line_aligned = slm_chunk_size*dispatchData.lws[1] + Align(slm_tail, sub_group_size);
size_t input_line_size = params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1;
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
- jit.AddConstant(MakeJitConstant("GROUP_SIZE", blockWidth * runInfo.lws1));
+ jit.AddConstant(MakeJitConstant("GROUP_SIZE", blockWidth * dispatchData.lws[1]));
jit.AddConstant(MakeJitConstant("SLM_LINE_SIZE", slm_line_aligned));
jit.AddConstant(MakeJitConstant("SLM_CHUNK_SIZE", slm_chunk_size));
jit.AddConstant(MakeJitConstant("SLM_TAIL", slm_tail));
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
if (p.output.GetDType() == Datatype::F16 || p.output.GetDType() == Datatype::F32 ||
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::SetDefault(const convolution_params &cp,
- int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+ int autoTuneIndex) const {
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- runInfo.gws0 = Align(cp.output.Feature().v, 32) / 2;
- runInfo.gws1 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth) * cp.output.Y().v;
- runInfo.gws2 = cp.output.Batch().v;
+ dispatchData.gws[0] = Align(cp.output.Feature().v, 32) / 2;
+ dispatchData.gws[1] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth) * cp.output.Y().v;
+ dispatchData.gws[2] = cp.output.Batch().v;
- runInfo.lws0 = 16;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetJitConstants(const convolution_params ¶ms,
- const DispatchData &runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData &dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws0));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]));
jit.AddConstant(MakeJitConstant("OSV", 32));
jit.AddConstant(MakeJitConstant("ISV", 32));
- jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", runInfo.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", dispatchData.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("IFM_BLOCKS", CeilDiv(params.inputs[0].Feature().v, 32)));
auto input = params.inputs[0];
auto output = params.output;
- auto blockWidth = runInfo.cldnnStyle.blockWidth;
+ auto blockWidth = dispatchData.cldnnStyle.blockWidth;
size_t input_line_size = std::min(params.stride.x * (blockWidth - 1) + (params.weights.X().v - 1) * params.dilation.x + 1,
input.X().v + input.X().pad.Total());
}
KernelsData ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetKernelsDataForAutoTune(const Params ¶ms,
- const optional_params &options) const {
+ const optional_params &options) const {
if (!Validate(params, options)) {
return {};
}
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2;
/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return GetTunedKernelsDataByIndex(params, options);
}
-JitConstants ConvolutionKernel_Ref::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
- JitConstants jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+JitConstants ConvolutionKernel_Ref::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
Datatype accumulator_dt;
Datatype activation_dt;
ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
// FIXME: ConvolutionKernelBase::SetDefault should probably be pure and
// not setting these at all as it's something specific to a concrete
// Just set the correct value for a particular implementation here,
// until the whole hierarchy is re-written.
const auto& out = params.output;
- std::vector<size_t> global = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
-
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
- return kd;
+ dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+ return dispatchData;
}
bool ConvolutionKernel_Ref::Validate(const Params& params, const optional_params& options) const {
FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool Validate(const Params& params, const optional_params& options) const override;
};
}
JitConstants ConvolutionKernel_Winograd_2x3_s1::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const size_t input_tile_width = winograd_input_tile_width;
const size_t input_tile_height = winograd_input_tile_height;
return jit;
}
-ConvolutionKernel_Winograd_2x3_s1::Parent::DispatchData ConvolutionKernel_Winograd_2x3_s1::SetDefault(
- const convolution_params& arg,
- int) const {
- Parent::DispatchData runInfo = Parent::SetDefault(arg);
+ConvolutionKernel_Winograd_2x3_s1::Parent::DispatchData ConvolutionKernel_Winograd_2x3_s1::SetDefault(const convolution_params& arg,
+ int) const {
+ Parent::DispatchData dispatchData = Parent::SetDefault(arg);
const size_t tile_n = winograd_tile_n; // goes in-depth
const size_t tile_m = winograd_tile_m; // goes over flattened x and y
// width by tile's width to get tiles count
const size_t nr_tiles_y = Align(arg.output.Y().v, 8) / input_tile_height;
- runInfo.gws0 = arg.output.Feature().v / tile_n;
- runInfo.gws1 = nr_tiles_x * nr_tiles_y / tile_m;
- runInfo.gws2 = input_tile_width * input_tile_height * arg.inputs[0].Batch().v;
+ dispatchData.gws[0] = arg.output.Feature().v / tile_n;
+ dispatchData.gws[1] = nr_tiles_x * nr_tiles_y / tile_m;
+ dispatchData.gws[2] = input_tile_width * input_tile_height * arg.inputs[0].Batch().v;
- runInfo.lws0 = 8;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- runInfo.efficiency = FORCE_PRIORITY_4;
+ dispatchData.efficiency = FORCE_PRIORITY_4;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_Winograd_2x3_s1::Validate(const Params& p, const optional_params& o) const {
return WeightsLayout::winograd_2x3_s1_weights;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
JitConstants ConvolutionKernel_Winograd_2x3_s1_fused::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto idepth = params.inputs[0].Feature().v;
const auto input_pad_y = params.inputs[0].Y().pad.before + params.inputs[0].Y().pad.after;
ConvolutionKernel_Winograd_2x3_s1_fused::Parent::DispatchData ConvolutionKernel_Winograd_2x3_s1_fused::SetDefault(
const convolution_params& arg,
int) const {
- Parent::DispatchData runInfo = Parent::SetDefault(arg);
+ Parent::DispatchData dispatchData = Parent::SetDefault(arg);
const auto odepth = arg.output.Feature().v;
const auto input_pad_y = arg.inputs[0].Y().pad.before + arg.inputs[0].Y().pad.after;
auto K = odepth;
auto N = 1;
- uint32_t global_step[3] = {14, 4, 16 * 8};
- uint32_t local_size[3] = {8, 2, 8};
+ size_t global_step[3] = {14, 4, 16 * 8};
+ size_t local_size[3] = {8, 2, 8};
- uint32_t zStep = local_size[2];
- runInfo.gws0 = ((uint32_t)((Q + global_step[0] - 1)) / global_step[0]) * local_size[0];
- runInfo.gws1 = ((uint32_t)((P + global_step[1] - 1)) / global_step[1]) * local_size[1];
- runInfo.gws2 = ((uint32_t)((N * K * 8 + global_step[2] - 1)) / global_step[2]) * zStep;
+ size_t zStep = local_size[2];
+ dispatchData.gws[0] = ((size_t)((Q + global_step[0] - 1)) / global_step[0]) * local_size[0];
+ dispatchData.gws[1] = ((size_t)((P + global_step[1] - 1)) / global_step[1]) * local_size[1];
+ dispatchData.gws[2] = ((size_t)((N * K * 8 + global_step[2] - 1)) / global_step[2]) * zStep;
- runInfo.lws0 = local_size[0];
- runInfo.lws1 = local_size[1];
- runInfo.lws2 = local_size[2];
+ dispatchData.lws[0] = local_size[0];
+ dispatchData.lws[1] = local_size[1];
+ dispatchData.lws[2] = local_size[2];
- runInfo.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_Winograd_2x3_s1_fused::Validate(const Params& p, const optional_params& o) const {
const optional_params& options) const {
return GetTunedKernelsDataByIndex(params, options);
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
return WeightsLayout::winograd_2x3_s1_fused_weights;
}
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
}
JitConstants ConvolutionKernel_Winograd_6x3_s1_fused::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto idepth = params.inputs[0].Feature().v;
const auto input_pad_y = params.inputs[0].Y().pad.before + params.inputs[0].Y().pad.after;
ConvolutionKernel_Winograd_6x3_s1_fused::Parent::DispatchData ConvolutionKernel_Winograd_6x3_s1_fused::SetDefault(
const convolution_params& arg,
int) const {
- Parent::DispatchData runInfo = Parent::SetDefault(arg);
+ Parent::DispatchData dispatchData = Parent::SetDefault(arg);
const auto odepth = arg.output.Feature().v;
const auto input_pad_y = arg.inputs[0].Y().pad.before + arg.inputs[0].Y().pad.after;
uint32_t global_step[3] = {14, 6, 16 * 8};
uint32_t local_size[3] = {16, 1, 8};
- runInfo.gws0 = ((uint32_t)((Q + global_step[0] - 1)) / global_step[0]) * local_size[0];
- runInfo.gws1 = ((uint32_t)((P + global_step[1] - 1)) / global_step[1]) * local_size[1];
- runInfo.gws2 = ((uint32_t)((N * K * 8 + global_step[2] - 1)) / global_step[2]) * local_size[2];
+ dispatchData.gws[0] = ((uint32_t)((Q + global_step[0] - 1)) / global_step[0]) * local_size[0];
+ dispatchData.gws[1] = ((uint32_t)((P + global_step[1] - 1)) / global_step[1]) * local_size[1];
+ dispatchData.gws[2] = ((uint32_t)((N * K * 8 + global_step[2] - 1)) / global_step[2]) * local_size[2];
- runInfo.lws0 = local_size[0];
- runInfo.lws1 = local_size[1];
- runInfo.lws2 = local_size[2];
+ dispatchData.lws[0] = local_size[0];
+ dispatchData.lws[1] = local_size[1];
+ dispatchData.lws[2] = local_size[2];
- runInfo.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_Winograd_6x3_s1_fused::Validate(const Params& p, const optional_params& o) const {
ParamsKey GetSupportedKey() const override;
protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b16::SetDefault(const convolution_params& arg,
int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
const auto filter_ofm_num = arg.weights.OFM().v * arg.weights.G().v;
const auto batch_size = arg.output.Batch().v;
const size_t ofmPerWorkItem = GetOfmPerWorkitem(arg.inputs[0].GetDType());
if (arg.inputs[0].GetDType() == Datatype::F16) {
- runInfo.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
} else {
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
}
- runInfo.lws0 = min_lws;
- runInfo.gws0 = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem);
+ dispatchData.lws[0] = min_lws;
+ dispatchData.gws[0] = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem);
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_yxfb_yxio_b16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_yxfb_yxio_b16::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto jit = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- const auto local_work_group_size = kd.lws0;
+ const auto local_work_group_size = dispatchData.lws[0];
const auto batch_size = params.output.Batch().v;
if (params.inputs[0].GetDType() == Datatype::F32) {
const size_t ofmPerWorkItem = GetOfmPerWorkitem(params.inputs[0].GetDType());
jit.AddConstants({
- MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0),
+ MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]),
MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem),
MakeJitConstant("BATCHES_PER_WORK_ITEM",
batchesPerWorkItem), // how many batches will a single work item compute
}
std::string GetKernelName(const convolution_params&) const override;
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block::SetDefault(const convolution_params& arg,
int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
// TODO: fill the proper data here (I don't know where can I locate it).
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_yxfb_yxio_b1_block::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
- cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
+ cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]));
return cldnn_jit;
}
ParamsKey GetSupportedKey() const override;
protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::yxio;
}
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::SetDefault(
const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
const auto filter_ofm_num = arg.weights.OFM().v;
const auto batch_size = arg.output.Batch().v;
- runInfo.lws0 = local_work_size;
+ dispatchData.lws[0] = local_work_size;
// We cannot return 8 because we are processing 4 spatial coordinates for batch1,
// and if we use more than 4 ofm_per_work_item we downgrade simd16 to simd8 which would break this algorithm.
// TODO: experiment with SIMD8 version of algorithm and check if it could be faster
/*if (output_feature_count % (lws * 8) == 0)
{
- run_info.ofm_per_work_item = 8;
- run_info.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(run_info.gws1) / 2.0f));
+ dispatchData.ofm_per_work_item = 8;
+ dispatchData.gws[1] = static_cast<size_t>(std::ceil(static_cast<float>(dispatchData.gws[1]) / 2.0f));
}
else*/
const size_t ofmPerWorkItem = GetOfmPerWorkitem(filter_ofm_num, local_work_size);
if (ofmPerWorkItem == 4) {
// We compute multiple spatial coordinates "x" in a single workitem that's why we must divide
- runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 4.0f));
+ dispatchData.gws[1] = static_cast<size_t>(std::ceil(static_cast<float>(dispatchData.gws[1]) / 4.0f));
} else if (ofmPerWorkItem == 2) {
- runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
+ dispatchData.gws[1] = static_cast<size_t>(std::ceil(static_cast<float>(dispatchData.gws[1]) / 8.0f));
} else {
- runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
+ dispatchData.gws[1] = static_cast<size_t>(std::ceil(static_cast<float>(dispatchData.gws[1]) / 8.0f));
}
- runInfo.gws0 = filter_ofm_num * batch_size / ofmPerWorkItem;
+ dispatchData.gws[0] = filter_ofm_num * batch_size / ofmPerWorkItem;
- return runInfo;
+ return dispatchData;
}
JitConstants ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, local_work_size);
cldnn_jit.AddConstant(MakeJitConstant("USE_VECTOR", ofmPerWorkItem));
cldnn_jit.AddConstant(MakeJitConstant(
"OFM_PER_WORK_ITEM",
ofmPerWorkItem)); // how many output feature maps for a single batch will a single work item produce
- cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
+ cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]));
return cldnn_jit;
}
return WeightsLayout::yxio;
}
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b8::SetDefault(const convolution_params& arg,
int autoTuneIndex) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
const auto filterOfmNum = arg.weights.OFM().v;
const auto batchSize = arg.output.Batch().v;
- runInfo.lws0 = batchSize == 8 ? 8 : 16;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = batchSize == 8 ? 8 : 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- size_t ofmPerWorkItem = GetOfmPerWorkitem(filterOfmNum, batchSize, runInfo.lws0);
+ size_t ofmPerWorkItem = GetOfmPerWorkitem(filterOfmNum, batchSize, dispatchData.lws[0]);
- runInfo.gws0 = filterOfmNum * batchSize / ofmPerWorkItem;
+ dispatchData.gws[0] = filterOfmNum * batchSize / ofmPerWorkItem;
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- return runInfo;
+ return dispatchData;
}
bool ConvolutionKernel_yxfb_yxio_b8::Validate(const Params& p, const optional_params& o) const {
}
JitConstants ConvolutionKernel_yxfb_yxio_b8::GetJitConstants(const convolution_params& params,
- const DispatchData& kd) const {
- JitConstants jits = ConvolutionKernelBase::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ JitConstants jits = ConvolutionKernelBase::GetJitConstants(params, dispatchData);
- size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, params.output.Batch().v, kd.lws0);
+ size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, params.output.Batch().v, dispatchData.lws[0]);
jits.AddConstant(MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem));
- jits.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
+ jits.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]));
return jits;
}
ParamsKey GetSupportedKey() const override;
protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::yxio;
}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
DeformableConvolutionKernel_bfyx_conv::DispatchData DeformableConvolutionKernel_bfyx_conv::SetDefault(const convolution_params& params,
int autoTuneIndex) const {
- DispatchData kd = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
+ DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params, autoTuneIndex);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x * y, 16);
- kd.gws1 = Align(f, 16);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x * y, 16);
+ dispatchData.gws[1] = Align(f, 16);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = 16;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 16;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
JitConstants DeformableConvolutionKernel_bfyx_conv::GetJitConstants(const convolution_params& params,
- const DispatchData& /*kd*/) const {
+ const DispatchData& /*dispatchData*/) const {
JitConstants jit = WeightBiasKernelBase::GetJitConstants(params);
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", 16));
jit.AddConstant(MakeJitConstant("INPUT_CHANNELS", params.inputs[0].Feature().v / params.weights.X().v / params.weights.Y().v));
protected:
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
ParamsKey GetSupportedKey() const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
return WeightsLayout::os_is_yx_isv16_osv16;
}
CommonDispatchData DeformableConvolutionKernel_bfyx_interp::SetDefault(const convolution_params& params) const {
- CommonDispatchData kd;
+ CommonDispatchData dispatchData;
const auto& out = params.output;
auto b = out.Batch().v;
auto kernel_size = params.kernelSize.x * params.kernelSize.y;
- kd.gws0 = Align(x * y, 16);
- kd.gws1 = params.deformable_groups * b;
- kd.gws2 = kernel_size;
+ dispatchData.gws[0] = Align(x * y, 16);
+ dispatchData.gws[1] = params.deformable_groups * b;
+ dispatchData.gws[2] = kernel_size;
- kd.lws0 = 16;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
- CommonDispatchData runInfo = SetDefault(newParams);
+ CommonDispatchData dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, DEFAULT,
false, false, static_cast<int>(newParams.inputs.size()));
return {kd};
}
CTCGreedyDecoderKernelBase::DispatchData CTCGreedyDecoderKernelBase::SetDefault(const ctc_greedy_decoder_params& params) const {
- DispatchData kd;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ DispatchData dispatchData;
- std::vector<size_t> global = { 1, 1, 1 };
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { 1, 1, 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData CTCGreedyDecoderKernelBase::GetCommonKernelsData(const Params& params,
- const optional_params& options,
- float estimated_time) const {
+ const optional_params& options,
+ float estimated_time) const {
assert(params.GetType() == KernelType::CTC_GREEDY_DECODER);
if (!Validate(params, options))
const ctc_greedy_decoder_params& orgParams = static_cast<const ctc_greedy_decoder_params&>(params);
- DispatchData runInfo;
-
- runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<ctc_greedy_decoder_params>(params);
- auto cldnn_jit = GetJitConstants(orgParams, runInfo);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
using DispatchData = CommonDispatchData;
protected:
- virtual JitConstants GetJitConstants(const ctc_greedy_decoder_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const ctc_greedy_decoder_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const ctc_greedy_decoder_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
};
}
CumSumKernelBase::DispatchData CumSumKernelBase::SetDefault(const cum_sum_params& params) const {
- DispatchData runInfo;
- std::vector<size_t> global = {params.output.Batch().v,
- params.output.Feature().v * params.output.W().v,
- params.output.Z().v * params.output.Y().v * params.output.X().v};
+ DispatchData dispatchData;
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v * params.output.W().v,
+ params.output.Z().v * params.output.Y().v * params.output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
KernelsData CumSumKernelBase::GetCommonKernelsData(const Params& params,
return {};
}
- auto runInfo = SetDefault(newParams);
+ auto dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
- auto cldnn_jit = GetJitConstants(newParams, runInfo);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = estimatedTime;
int32_t GetCumSumAxisIndex(const cum_sum_params& params) const;
size_t GetRealAxisIndex(const cum_sum_params& params) const;
ParamsKey GetSupportedKey() const override;
- virtual JitConstants GetJitConstants(const cum_sum_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const cum_sum_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
bool Validate(const Params&, const optional_params&) const override;
static constexpr size_t simd = 16;
static constexpr size_t BLOCK_SIZE = 16;
-JitConstants CumSumKernelPartialSum::GetJitConstants(const cum_sum_params& params, DispatchData kd) const {
- auto jits = CumSumKernelBase::GetJitConstants(params, kd);
+JitConstants CumSumKernelPartialSum::GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const {
+ auto jits = CumSumKernelBase::GetJitConstants(params, dispatchData);
auto activation_dt = GetActivationType(params);
jits.Merge(MakeTypeJitConstants(activation_dt, "PARTIAL"));
jits.AddConstant(MakeJitConstant("SIMD", simd));
- jits.AddConstant(MakeJitConstant("LWS", kd.lws0));
+ jits.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0]));
jits.AddConstant(MakeJitConstant("BLOCK_SIZE", BLOCK_SIZE));
- jits.AddConstant(MakeJitConstant("SUM_ITEMS_NUM", kd.sum_items_num));
+ jits.AddConstant(MakeJitConstant("SUM_ITEMS_NUM", dispatchData.sum_items_num));
return jits;
}
KernelData kd = KernelData::Default<cum_sum_params>(params, kernels_num);
const cum_sum_params& newParams = *static_cast<cum_sum_params*>(kd.params.get());
- auto runInfo = SetDefaultForMulti(newParams);
+ auto dispatchData = SetDefaultForMulti(newParams);
{
// partial sum
- auto cldnn_jit = GetJitConstants(newParams, runInfo.stage_1);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData.stage_1);
cldnn_jit.AddConstant(MakeJitConstant("CUM_SUM_PARTIAL_SUM", 1));
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo.stage_1, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData.stage_1, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments.clear(); // Clear original output argument
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
kernel.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
{
// Final
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
- auto cldnn_jit = GetJitConstants(newParams, runInfo.stage_final);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData.stage_final);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[1];
- FillCLKernelData(kernel, runInfo.stage_final, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData.stage_final, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments.clear(); // Clear original output argument
kernel.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
}
CumSumKernelPartialSum::MultiDispatchData CumSumKernelPartialSum::SetDefaultForMulti(const cum_sum_params& params) const {
- MultiDispatchData md;
+ MultiDispatchData dispatchData;
std::vector<size_t> dims = {params.output.Batch().v,
params.output.Feature().v,
params.output.W().v,
}
}
- md.stage_1.gws0 = Align(gws[0], BLOCK_SIZE);
- md.stage_1.gws1 = gws[1];
- md.stage_1.gws2 = gws[2];
- md.stage_1.lws0 = BLOCK_SIZE;
- md.stage_1.lws1 = 1;
- md.stage_1.lws2 = 1;
- md.stage_1.sum_items_num = items_num;
-
- md.stage_final.gws0 = gws[0];
- md.stage_final.gws1 = gws[1];
- md.stage_final.gws2 = gws[2];
- md.stage_final.lws0 = 1;
- md.stage_final.lws1 = 1;
- md.stage_final.lws2 = 1;
- md.stage_final.sum_items_num = Align(items_num, BLOCK_SIZE);
-
- return md;
+ dispatchData.stage_1.gws[0] = Align(gws[0], BLOCK_SIZE);
+ dispatchData.stage_1.gws[1] = gws[1];
+ dispatchData.stage_1.gws[2] = gws[2];
+ dispatchData.stage_1.lws[0] = BLOCK_SIZE;
+ dispatchData.stage_1.lws[1] = 1;
+ dispatchData.stage_1.lws[2] = 1;
+ dispatchData.stage_1.sum_items_num = items_num;
+
+ dispatchData.stage_final.gws = gws;
+ dispatchData.stage_final.lws = { 1, 1, 1 };
+ dispatchData.stage_final.sum_items_num = Align(items_num, BLOCK_SIZE);
+
+ return dispatchData;
}
KernelsData CumSumKernelPartialSum::GetKernelsData(const Params& params, const optional_params& options) const {
DispatchData stage_final;
};
- JitConstants GetJitConstants(const cum_sum_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const override;
KernelsData GetMultiStageKernelsData(const Params& params, const optional_params&, float estimated_time) const;
MultiDispatchData SetDefaultForMulti(const cum_sum_params& params) const;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
#include <vector>
namespace kernel_selector {
-JitConstants CumSumKernelRef::GetJitConstants(const cum_sum_params& params, DispatchData kd) const {
- auto jits = CumSumKernelBase::GetJitConstants(params, kd);
+JitConstants CumSumKernelRef::GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const {
+ auto jits = CumSumKernelBase::GetJitConstants(params, dispatchData);
jits.AddConstant(MakeJitConstant("AXIS_LAYOUT_INDEX", GetCumSumAxisIndex(params)));
CumSumKernelRef() : CumSumKernelBase("cum_sum_ref") {}
virtual ~CumSumKernelRef() = default;
protected:
- JitConstants GetJitConstants(const cum_sum_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const override;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector
}
DeconvolutionKernelBase::DispatchData DeconvolutionKernel_b_fs_zyx_fsv16::SetDefault(const deconvolution_params& params) const {
- DispatchData kd = DeconvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = DeconvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
if (ver_bsv16_fsv16) {
if (params.depthwise_separable_opt) {
- kd.gws0 = x * y * z;
- kd.gws1 = f;
- kd.gws2 = b / 16;
+ dispatchData.gws[0] = x * y * z;
+ dispatchData.gws[1] = f;
+ dispatchData.gws[2] = b / 16;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
} else {
- kd.gws0 = 64;
- while (kd.gws0 > 16) {
- if (f % kd.gws0 == 0) break;
- kd.gws0 /= 2;
+ dispatchData.gws[0] = 64;
+ while (dispatchData.gws[0] > 16) {
+ if (f % dispatchData.gws[0] == 0)
+ break;
+ dispatchData.gws[0] /= 2;
}
- kd.gws1 = x * y * z;
- kd.gws2 = CeilDiv(b, 16) * (f / kd.gws0) * params.groups;
+ dispatchData.gws[1] = x * y * z;
+ dispatchData.gws[2] = CeilDiv(b, 16) * (f / dispatchData.gws[0]) * params.groups;
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
} else {
size_t x_block_size = 16;
}
x_block_size = std::max(x_block_size, (size_t)8);
if (params.depthwise_separable_opt) {
- kd.gws0 = CeilDiv(x, x_block_size) * y * z;
- kd.gws1 = f;
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x, x_block_size) * y * z;
+ dispatchData.gws[1] = f;
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
} else {
- kd.gws0 = 64;
- while (kd.gws0 > 16) {
- if (f % kd.gws0 == 0) break;
- kd.gws0 /= 2;
+ dispatchData.gws[0] = 64;
+ while (dispatchData.gws[0] > 16) {
+ if (f % dispatchData.gws[0] == 0)
+ break;
+ dispatchData.gws[0] /= 2;
}
- kd.gws1 = CeilDiv(x, x_block_size) * y * z;
- kd.gws2 = b * (f / kd.gws0);
+ dispatchData.gws[1] = CeilDiv(x, x_block_size) * y * z;
+ dispatchData.gws[2] = b * (f / dispatchData.gws[0]);
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
}
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
bool DeconvolutionKernel_b_fs_zyx_fsv16::Validate(const Params& p, const optional_params& o) const {
jit.AddConstant(MakeJitConstant("IW_FULL", params.output.X().LogicalDimPadded()));
- DispatchData runInfo = SetDefault(params);
- jit.AddConstant(MakeJitConstant("LWS_0", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LWS_1", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LWS_2", runInfo.lws2));
+ DispatchData dispatchData = SetDefault(params);
+ jit.AddConstant(MakeJitConstant("LWS_0", dispatchData.lws[0]));
+ jit.AddConstant(MakeJitConstant("LWS_1", dispatchData.lws[1]));
+ jit.AddConstant(MakeJitConstant("LWS_2", dispatchData.lws[2]));
if (!params.fused_ops.empty()) {
auto fused_dt = GetActivationType(params);
}
DeconvolutionKernelBase::DispatchData DeconvolutionKernel_b_fs_zyx_fsv16_dw::SetDefault(const deconvolution_params& params) const {
- DispatchData kd = DeconvolutionKernelBase::SetDefault(params);
+ DispatchData dispatchData = DeconvolutionKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x, GetDispatchParams(params).block_size_x) * y * z;
- kd.gws1 = Align(f, feature_block_size);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x, GetDispatchParams(params).block_size_x) * y * z;
+ dispatchData.gws[1] = Align(f, feature_block_size);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = sub_group_size;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
bool DeconvolutionKernel_b_fs_zyx_fsv16_dw::Validate(const Params& p, const optional_params& o) const {
auto batch_size = params.output.Batch().v;
auto output_features = params.output.Feature().v;
- DispatchData kd;
+ DispatchData dispatchData;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
size_t gws0 = output_features * batch_size;
size_t lws0 = std::min(gws0, static_cast<size_t>(32));
while (gws0 % lws0) {
lws0--;
}
- kd.gws0 = gws0;
- kd.gws1 = params.output.X().v;
- kd.gws2 = params.output.Y().v * params.output.Z().v;
- kd.lws0 = lws0;
- kd.lws1 = 1;
- kd.lws2 = 1;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+
+ dispatchData.gws[0] = gws0;
+ dispatchData.gws[1] = params.output.X().v;
+ dispatchData.gws[2] = params.output.Y().v * params.output.Z().v;
+
+ dispatchData.lws[0] = lws0;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
+
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ return dispatchData;
}
KernelsData DeconvolutionKernelBase::GetKernelsData(const Params& params, const optional_params& options) const {
}
const deconvolution_params& orgParams = static_cast<const deconvolution_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<deconvolution_params>(params);
deconvolution_params& newParams = *static_cast<deconvolution_params*>(kd.params.get());
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
GetFusedPrimitiveInputsCount(params));
kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0});
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
return {kd};
}
}
CommonDispatchData DeconvolutionKernel_bfyx_opt::SetDefault(const deconvolution_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
auto wg_size = 16;
- kd.gws0 = Align(params.output.X().v, wg_size * params.stride.x);
- kd.gws1 = params.output.Y().v;
- kd.gws2 = params.output.Batch().v * params.output.Feature().v;
- kd.lws0 = wg_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
- kd.efficiency = FORCE_PRIORITY_6;
- return kd;
+ dispatchData.gws[0] = Align(params.output.X().v, wg_size * params.stride.x);
+ dispatchData.gws[1] = params.output.Y().v;
+ dispatchData.gws[2] = params.output.Batch().v * params.output.Feature().v;
+
+ dispatchData.lws[0] = wg_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
+
+ dispatchData.efficiency = FORCE_PRIORITY_6;
+ return dispatchData;
}
JitConstants DeconvolutionKernel_bfyx_opt::GetJitConstants(const deconvolution_params& params) const {
}
DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_along_f_tile_bfx::SetDefault(const deconvolution_params& params) const {
- auto dispatch = Parent::SetDefault(params);
+ DispatchData dispatchData = Parent::SetDefault(params);
auto tile_x = GetTileX(params);
auto tile_ofm = GetTileOFM(params);
auto tile_b = GetTileB(params);
- std::vector<size_t> global = {
+ dispatchData.gws = {
CeilDiv(params.output.X().v, tile_x) * params.output.Y().v * params.output.Z().v,
Align(CeilDiv(params.output.Feature().v, tile_ofm), simd),
CeilDiv(params.output.Batch().v, tile_b)
};
- std::vector<size_t> local = { 1, simd, 1 };
-
- dispatch.gws0 = global[0];
- dispatch.gws1 = global[1];
- dispatch.gws2 = global[2];
-
- dispatch.lws0 = local[0];
- dispatch.lws1 = local[1];
- dispatch.lws2 = local[2];
+ dispatchData.lws = { 1, simd, 1 };
// Currently most optimized for fsv16 formats
if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16) {
- dispatch.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
} else {
- dispatch.efficiency = FORCE_PRIORITY_8;
+ dispatchData.efficiency = FORCE_PRIORITY_8;
}
- return dispatch;
+ return dispatchData;
}
JitConstants DeconvolutionKernel_imad_along_f_tile_bfx::GetJitConstants(const deconvolution_params& params) const {
}
DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_ref::SetDefault(const deconvolution_params& params) const {
- auto dispatch = Parent::SetDefault(params);
+ DispatchData dispatchData = Parent::SetDefault(params);
- std::vector<size_t> global = {
+ dispatchData.gws = {
params.output.Feature().v,
params.output.X().v * params.output.Y().v * params.output.Z().v,
params.output.Batch().v
};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- dispatch.gws0 = global[0];
- dispatch.gws1 = global[1];
- dispatch.gws2 = global[2];
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- dispatch.lws0 = local[0];
- dispatch.lws1 = local[1];
- dispatch.lws2 = local[2];
-
- dispatch.efficiency = FORCE_PRIORITY_9;
-
- return dispatch;
+ return dispatchData;
}
JitConstants DeconvolutionKernel_imad_ref::GetJitConstants(const deconvolution_params& params) const {
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData DeconvolutionKernelRef::SetDefault(const deconvolution_params& params) const {
- CommonDispatchData runInfo = DeconvolutionKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = DeconvolutionKernelBase::SetDefault(params);
if (params.output.Feature().v * params.output.Batch().v <= 16) {
const auto& out = params.output;
- runInfo.gws0 = Align(out.X().v, 32);
- runInfo.gws1 = out.Y().v * out.Z().v;
- runInfo.gws2 = out.Feature().v * out.Batch().v;
+ dispatchData.gws[0] = Align(out.X().v, 32);
+ dispatchData.gws[1] = out.Y().v * out.Z().v;
+ dispatchData.gws[2] = out.Feature().v * out.Batch().v;
- runInfo.lws0 = 32;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- return runInfo;
+ return dispatchData;
}
JitConstants DeconvolutionKernelRef::GetJitConstants(const deconvolution_params& params) const {
}
CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = { params.output.Batch().v,
- params.output.Feature().v,
- params.output.Z().v * params.output.Y().v * params.output.X().v };
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Z().v * params.output.Y().v * params.output.X().v };
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
return {};
}
- auto runInfo = SetDefault(newParams);
+ auto dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params));
kd.estimatedTime = estimatedTime;
}
CommonDispatchData DepthToSpaceKernelBlock2Opt::SetDefault(const depth_to_space_params& params) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = { Align(params.inputs[0].X().v / 2, 16),
- params.inputs[0].Y().v,
- 1};
+ dispatchData.gws = { Align(params.inputs[0].X().v / 2, 16),
+ params.inputs[0].Y().v,
+ 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants DepthToSpaceKernelBlock2Opt::GetJitConstants(const depth_to_space_params& params) const {
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return jit;
}
-DetectionOutputKernelBase::DispatchData DetectionOutputKernelBase::SetDefault(
- const detection_output_params& params) const {
- DispatchData kd;
+DetectionOutputKernelBase::DispatchData DetectionOutputKernelBase::SetDefault(const detection_output_params& /*params*/) const {
+ DispatchData dispatchData;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
- kd.gws0 = 0;
- kd.gws1 = 0;
- kd.gws2 = 0;
- kd.lws0 = 0;
- kd.lws1 = 0;
- kd.lws2 = 0;
- return kd;
+ dispatchData.gws[0] = 0;
+ dispatchData.gws[1] = 0;
+ dispatchData.gws[2] = 0;
+
+ dispatchData.lws[0] = 0;
+ dispatchData.lws[1] = 0;
+ dispatchData.lws[2] = 0;
+
+ return dispatchData;
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData DetectionOutputKernel::SetDefault(const detection_output_params& params) const {
- CommonDispatchData runInfo = DetectionOutputKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
// Number of all work items is set to total number of bounding boxes -
// one bounding box is procerssed by one work item
bboxesNum = work_group_size * params.inputs[0].Batch().v;
- runInfo.gws0 = Align(bboxesNum, work_group_size);
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = Align(bboxesNum, work_group_size);
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = work_group_size;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = work_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
KernelsData DetectionOutputKernel::GetKernelsData(const Params& params, const optional_params& options) const {
KernelData kd = KernelData::Default<detection_output_params>(params);
const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
- DispatchData runInfo = SetDefault(detectOutParams);
+ DispatchData dispatchData = SetDefault(detectOutParams);
auto cldnnJit = GetJitConstants(detectOutParams);
auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, options);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entryPoint);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entryPoint);
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2});
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData DetectionOutputKernel_sort::SetDefault(const detection_output_params& params) const {
- CommonDispatchData runInfo = DetectionOutputKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
unsigned class_num = params.detectOutParams.num_classes;
if (params.detectOutParams.share_location && params.detectOutParams.background_label_id == 0) {
work_group_size = (work_group_size + work_group_size % 2) / (work_group_size / 256 + 1);
}
- runInfo.gws0 = Align(bboxesNum, work_group_size);
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = Align(bboxesNum, work_group_size);
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = work_group_size;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = work_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
KernelsData DetectionOutputKernel_sort::GetKernelsData(const Params& params, const optional_params& options) const {
KernelData kd = KernelData::Default<detection_output_params>(params);
const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
- DispatchData runInfo = SetDefault(detectOutParams);
+ DispatchData dispatchData = SetDefault(detectOutParams);
auto cldnnJit = GetJitConstants(detectOutParams);
auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, options);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entryPoint);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entryPoint);
kd.estimatedTime = FORCE_PRIORITY_8;
}
EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv16::SetDefault(const eltwise_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
- kd.gws0 = Align(params.output.Feature().v, 16);
- kd.gws1 = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v;
- kd.gws2 = params.output.Batch().v;
+ dispatchData.gws[0] = Align(params.output.Feature().v, 16);
+ dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v;
+ dispatchData.gws[2] = params.output.Batch().v;
- kd.lws0 = 16;
- kd.lws1 = 16;
- while (kd.lws1 > 1) {
- if (kd.gws1 % kd.lws1 == 0)
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 16;
+ while (dispatchData.lws[1] > 1) {
+ if (dispatchData.gws[1] % dispatchData.lws[1] == 0)
break;
- kd.lws1--;
+ dispatchData.lws[1]--;
}
- kd.lws2 = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_1;
- return kd;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
+ return dispatchData;
}
KernelsData EltwiseKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, const optional_params& options) const {
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto& kernel = kd.kernels[0];
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(),
false,
GetFusedPrimitiveInputsCount(params));
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
return {kd};
}
}
EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
if (params.layoutBased || params.int8_quantization || params.broadcast) {
- auto global = GetTensorFriendlyWorkGroups(params.output);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.gws = GetTensorFriendlyWorkGroups(params.output);
} else if (CheckInputsOutputNoPitchSameDims(params)) {
- kd.gws0 = params.output.LogicalSize();
- kd.gws1 = 1;
- kd.gws2 = 1;
+ dispatchData.gws[0] = params.output.LogicalSize();
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
} else {
const auto& out = params.output;
gws.push_back(1U);
}
- kd.gws0 = gws[0];
+ dispatchData.gws[0] = gws[0];
if (n_dims == 6) {
- kd.gws1 = gws[1] * gws[2] * gws[3]; // y*z*w
- kd.gws2 = gws[4] * gws[5];
+ dispatchData.gws[1] = gws[1] * gws[2] * gws[3]; // y*z*w
+ dispatchData.gws[2] = gws[4] * gws[5];
} else if (n_dims == 5) {
- kd.gws1 = gws[1] * gws[2]; // y*z
- kd.gws2 = gws[3] * gws[4];
+ dispatchData.gws[1] = gws[1] * gws[2]; // y*z
+ dispatchData.gws[2] = gws[3] * gws[4];
} else {
- kd.gws1 = gws[1];
- kd.gws2 = gws[2] * gws[3];
+ dispatchData.gws[1] = gws[1];
+ dispatchData.gws[2] = gws[2] * gws[3];
}
}
- auto local = GetOptimalLocalWorkGroupSizes({kd.gws0, kd.gws1, kd.gws2}, params.engineInfo);
+ auto local = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
const size_t optimal_lws_values[] = {256, 224, 192, 160, 128, 96, 64, 32, 16};
if ((params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 ||
params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16) &&
- params.output.Feature().v % 16 == 0 && kd.gws1 % 16 == 0) {
- kd.lws0 = 1;
+ params.output.Feature().v % 16 == 0 && dispatchData.gws[1] % 16 == 0) {
+ dispatchData.lws[0] = 1;
for (auto lws : optimal_lws_values) {
- if (kd.gws1 % lws == 0) {
- kd.lws1 = lws;
+ if (dispatchData.gws[1] % lws == 0) {
+ dispatchData.lws[1] = lws;
break;
}
}
- kd.lws2 = 1;
+ dispatchData.lws[2] = 1;
} else if (params.output.GetLayout() == DataLayout::fs_b_yx_fsv32) {
- kd.gws2 = Align(kd.gws2, 32);
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 32;
+ dispatchData.gws[2] = Align(dispatchData.gws[2], 32);
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 32;
} else if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv32 && params.output.Feature().v % 32 == 0) {
if (params.layoutBased || params.int8_quantization || params.broadcast) {
- kd.lws0 = 1;
- kd.lws1 = 32;
- kd.lws2 = 1;
- } else if (kd.gws0 == params.output.LogicalSize()) {
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 32;
+ dispatchData.lws[2] = 1;
+ } else if (dispatchData.gws[0] == params.output.LogicalSize()) {
+ dispatchData.lws = local;
} else {
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 32;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 32;
}
} else {
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.lws[0] = local[0];
+ dispatchData.lws[1] = local[1];
+ dispatchData.lws[2] = local[2];
}
- return kd;
+ return dispatchData;
}
KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options) const {
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto& kernel = kd.kernels[0];
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(),
namespace kernel_selector {
-namespace {
-std::shared_ptr<JitConstant> GetJit_GetIndexForDataLayout(std::string jitName,
- std::string prefix,
- DataLayout dataLayout) {
- std::string jitValue;
- switch (dataLayout) {
- case DataLayout::byxf:
- jitValue += "GET_DATA_INDEX(";
- break;
- case DataLayout::fs_b_yx_fsv32:
- jitValue += "GET_DATA_FS_B_YX_FSV32_INDEX(";
- break;
- default:
- throw std::runtime_error("incorrect data_layout");
- }
- jitValue += prefix + ",b,f,y,x)";
-
- return MakeJitConstant(jitName, jitValue);
-}
-} // namespace
// TODO: [blocked_formats] does fp32 work well with kernel?
ParamsKey EltwiseKernel_mixed_byxf_and_fs_b_yx_fsv32::GetSupportedKey() const {
ParamsKey k;
}
CommonDispatchData EmbeddingBagKernelRef::SetDefault(const embedding_bag_params& params) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = { params.output.Batch().v,
- params.output.Feature().v,
- params.output.Y().v * params.output.X().v };
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Y().v * params.output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
KernelsData EmbeddingBagKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
return {};
}
- auto runInfo = SetDefault(newParams);
+ auto dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
}
ExtractImagePatchesKernelBase::DispatchData ExtractImagePatchesKernelBase::SetDefault(const extract_image_patches_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
- std::vector<size_t> global = { params.output.Batch().v,
- params.output.Feature().v,
- params.output.Y().v * params.output.X().v };
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Y().v * params.output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData ExtractImagePatchesKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params = static_cast<const extract_image_patches_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData kd = KernelData::Default<extract_image_patches_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = estimated_time;
virtual ~FullyConnectedBlockKernelBase() {}
protected:
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
// how many batches will a single work item compute
virtual size_t GetBatchesPerWorkItem(const fully_connected_params& params) const;
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnectedKernelBase::DispatchData FullyConnectedKernelBase::SetDefault(const fully_connected_params& params,
int) const {
DispatchData dispatchData;
- dispatchData.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
// Determine global work sizes.
- dispatchData.gws0 = params.output.LogicalSize();
- dispatchData.gws1 = dispatchData.gws2 = 1;
+ dispatchData.gws = { params.output.LogicalSize(), 1, 1 };
// Find largest positive local work size that is divider for global work size.
- dispatchData.lws0 = std::min(std::max(dispatchData.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (dispatchData.gws0 % dispatchData.lws0 != 0) {
- --dispatchData.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- dispatchData.lws1 = dispatchData.lws2 = 1;
+ dispatchData.lws[1] = dispatchData.lws[2] = 1;
return dispatchData;
}
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
- const DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
- auto cldnn_jit = GetJitConstants(newParams, runInfo);
+ const DispatchData dispatchData = SetDefault(newParams, autoTuneIndex);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
uint32_t fused_deps_total = 0;
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
const int autoTuneIndex = -1) const;
protected:
- virtual JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const;
virtual DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const;
KernelsData GetCommonKernelsData(const Params ¶ms,
const optional_params &options,
int autoTuneIndex = -1) const;
// Fused ops
- virtual JitConstants GetFusedPrimitivesJitConstants(const fully_connected_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetFusedPrimitivesJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const;
Datatype GetActivationType(const fully_connected_params& params) const;
// --Fused ops
FullyConnected_bf_io_GEMM::DispatchData FullyConnected_bf_io_GEMM::SetDefault(const fully_connected_params& params,
int autoTuneIndex) const {
- auto runInfo = Parent::SetDefault(params, autoTuneIndex);
+ auto dispatchData = Parent::SetDefault(params, autoTuneIndex);
const uint32_t localWorkSizeX = 64;
const uint32_t globalWorkSizeX = localWorkSizeX;
- std::vector<size_t> global = {globalWorkSizeX, params.output.Feature().v, params.output.Batch().v};
- std::vector<size_t> local = {localWorkSizeX, 1, 1};
+ dispatchData.gws = { globalWorkSizeX, params.output.Feature().v, 1 };
+ dispatchData.lws = { localWorkSizeX, 1, 1 };
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = 1;
+ dispatchData.efficiency = FORCE_PRIORITY_6;
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = 1;
-
- runInfo.efficiency = FORCE_PRIORITY_6;
-
- return runInfo;
+ return dispatchData;
}
JitConstants FullyConnected_bf_io_GEMM::GetJitConstants(const fully_connected_params& params,
- const DispatchData& kd) const {
- auto jit = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
if (params.inputs[0].GetDType() == Datatype::F16) {
jit.AddConstant(MakeJitConstant("__fc_f16", ""));
protected:
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
FullyConnected_bf_io_input_spatial::DispatchData FullyConnected_bf_io_input_spatial::SetDefault(
const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedKernelBase::SetDefault(arg);
- kd.gws0 = Align(arg.output.LogicalSize() / arg.inputs[0].Batch().v, 16);
- kd.gws1 = arg.inputs[0].Batch().v;
- kd.gws2 = 1;
- kd.lws0 = 16;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.gws[0] = Align(arg.output.LogicalSize() / arg.inputs[0].Batch().v, 16);
+ dispatchData.gws[1] = arg.inputs[0].Batch().v;
+ dispatchData.gws[2] = 1;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
+
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
const auto& input = arg.inputs[0];
const auto& output = arg.output;
if (input.Batch().v == 1 && output.Batch().v == 1) {
if ((input.LogicalSize() / output.Batch().v >= 4096) && (output.Feature().v >= 4096)) {
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
}
}
- return kd;
+ return dispatchData;
}
bool FullyConnected_bf_io_input_spatial::Validate(const Params& p, const optional_params& o) const {
return k;
}
-JitConstants FullyConnected_bf_io_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnected_bf_io_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, AGE_BASED));
}
-
+
if (dtype == Datatype::F32) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 16, 2, AGE_BASED))
selector.Case([&](const fully_connected_params&) -> tune_params {
tune_params result(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, DEFAULT);
-
+
while (batch % result.tile_b != 0)
result.tile_b--;
-
+
result.dispatch_bsv = 16;
while (batch % (result.tile_b * result.dispatch_bsv) != 0)
result.dispatch_bsv--;
if (result.tile_b >= 8)
result.exec_options = AGE_BASED;
-
+
return result;
});
FullyConnected_bf_tiled::DispatchData
FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int autoTuneIndex) const {
- auto runInfo = Parent::SetDefault(params);
+ auto dispatchData = Parent::SetDefault(params);
auto tparams = GetAutoTuneParams(params, autoTuneIndex);
size_t feature_threads = CeilDiv(params.output.Feature().v, tparams.tile_ofm * simd);
size_t batch_threads = params.output.Batch().v / tparams.tile_b;
- runInfo.gws0 = feature_threads * batch_threads * simd;
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = feature_threads * batch_threads * simd;
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = simd;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = simd;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- runInfo.tile_m = tparams.tile_b;
- runInfo.tile_n = tparams.tile_ofm;
- runInfo.tile_mk = tparams.tile_ifm;
- runInfo.tile_nk = tparams.tile_k;
- runInfo.tile_ms = tparams.dispatch_bsv;
- runInfo.tile_ns = tparams.dispatch_fsv;
+ dispatchData.tile_m = tparams.tile_b;
+ dispatchData.tile_n = tparams.tile_ofm;
+ dispatchData.tile_mk = tparams.tile_ifm;
+ dispatchData.tile_nk = tparams.tile_k;
+ dispatchData.tile_ms = tparams.dispatch_bsv;
+ dispatchData.tile_ns = tparams.dispatch_fsv;
- return runInfo;
+ return dispatchData;
}
-JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("SIMD", simd));
- jit.AddConstant(MakeJitConstant("TILE_B", kd.tile_m));
- jit.AddConstant(MakeJitConstant("TILE_OFM", kd.tile_n));
- jit.AddConstant(MakeJitConstant("TILE_IFM", kd.tile_mk));
- jit.AddConstant(MakeJitConstant("TILE_K", kd.tile_nk));
- jit.AddConstant(MakeJitConstant("TILE_K_OFM", kd.tile_nk * kd.tile_n));
- jit.AddConstant(MakeJitConstant("DISPATCH_BSV", kd.tile_ms));
- jit.AddConstant(MakeJitConstant("DISPATCH_FSV", kd.tile_ns));
+ jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m));
+ jit.AddConstant(MakeJitConstant("TILE_OFM", dispatchData.tile_n));
+ jit.AddConstant(MakeJitConstant("TILE_IFM", dispatchData.tile_mk));
+ jit.AddConstant(MakeJitConstant("TILE_K", dispatchData.tile_nk));
+ jit.AddConstant(MakeJitConstant("TILE_K_OFM", dispatchData.tile_nk * dispatchData.tile_n));
+ jit.AddConstant(MakeJitConstant("DISPATCH_BSV", dispatchData.tile_ms));
+ jit.AddConstant(MakeJitConstant("DISPATCH_FSV", dispatchData.tile_ns));
- jit.Merge(MakeConstantLoopUnrollJitConstants(kd.tile_m));
+ jit.Merge(MakeConstantLoopUnrollJitConstants(dispatchData.tile_m));
bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.output.GetFirstElementOffset() % 2 != 0;
jit.AddConstant(MakeJitConstant("REALIGN_FP16_OFFSET", realign_fp16_offset));
if (!params.fused_ops.empty()) {
auto boundary_check = BoundaryCheck::DISABLED;
- if (params.output.Feature().v % (kd.tile_n * simd) != 0)
+ if (params.output.Feature().v % (dispatchData.tile_n * simd) != 0)
boundary_check = BoundaryCheck::ENABLED;
FusedOpsConfiguration conf = { "",
{"(out_b + bi)", "out_f", "0", "0"},
"activated[bi]",
activation_dt,
- kd.tile_n,
+ dispatchData.tile_n,
LoadType::LT_ALIGNED_READ,
boundary_check,
IndexType::TENSOR_COORD,
FusedOpType::SCALE,
FusedOpType::QUANTIZE };
}
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& params, const optional_params& options) const override;
tune_params GetAutoTuneParams(const fully_connected_params& params, int idx = -1) const;
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_bfyx_Ref::DispatchData FullyConnected_bfyx_Ref::SetDefault(const fully_connected_params& params,
int) const {
- auto runInfo = Parent::SetDefault(params);
+ auto dispatchData = Parent::SetDefault(params);
- std::vector<size_t> global = {params.output.Feature().v, params.output.Batch().v};
- std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { params.output.Feature().v, params.output.Batch().v, 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = 1;
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = 1;
-
- return runInfo;
+ return dispatchData;
}
JitConstants FullyConnected_bfyx_Ref::GetJitConstants(const fully_connected_params& params,
- const FullyConnectedKernelBase::DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ const FullyConnectedKernelBase::DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
Datatype accumulator_dt;
Datatype activation_dt;
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_bs_f_bsv16_af8::DispatchData FullyConnected_bs_f_bsv16_af8::SetDefault(const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedBlockKernelBase::SetDefault(arg);
size_t groups_per_batches = GetLocalGroupsSize(arg);
- kd.gws0 = Align(arg.output.LogicalSize() / (GetBatchesPerWorkItem(arg) * groups_per_batches), 16);
- kd.gws1 = groups_per_batches;
- kd.lws0 = 16;
- kd.lws1 = 1;
+ dispatchData.gws[0] = Align(arg.output.LogicalSize() / (GetBatchesPerWorkItem(arg) * groups_per_batches), 16);
+ dispatchData.gws[1] = groups_per_batches;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
- return kd;
+ return dispatchData;
}
static bool check_input_layout(const DataTensor& t) {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
JitConstants FullyConnected_bs_f_bsv16_b1::GetJitConstants(
const fully_connected_params& params,
- const FullyConnectedKernelBase::DispatchData& run_info) const {
- auto& d = static_cast<const DispatchData&>(run_info);
- auto cldnn_jit = FullyConnectedKernelBase::GetJitConstants(params, run_info);
+ const FullyConnectedKernelBase::DispatchData& dispatchData) const {
+ auto& d = static_cast<const DispatchData&>(dispatchData);
+ auto cldnn_jit = FullyConnectedKernelBase::GetJitConstants(params, dispatchData);
cldnn_jit.AddConstants({
- MakeJitConstant("SUB_GROUP_SIZE", run_info.lws0),
- MakeJitConstant("WORK_ITEMS_PER_BATCH", run_info.gws1),
+ MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]),
+ MakeJitConstant("WORK_ITEMS_PER_BATCH", dispatchData.gws[1]),
MakeJitConstant("UNIT_BYTE_SIZE", d.unit_byte_size),
MakeJitConstant("CHUNK_TYPE", d.chunk_type),
FullyConnected_bs_f_bsv16_b1::DispatchData FullyConnected_bs_f_bsv16_b1::SetDefault(const fully_connected_params& arg,
int) const {
- DispatchData run_info = FullyConnectedKernelBase::SetDefault(arg);
+ DispatchData dispatchData = FullyConnectedKernelBase::SetDefault(arg);
// Properties of chunk and unit.
const char* chunk_type = "uint";
- const uint32_t unit_byte_size = run_info.fp16UnitUsed ? sizeof(short) : sizeof(float);
+ const uint32_t unit_byte_size = BytesPerElement(arg.inputs[0].GetDType());
constexpr uint32_t chunk_byte_size = sizeof(uint32_t);
constexpr uint32_t sub_group_size = 16;
const uint32_t units_per_chunk = chunk_byte_size / unit_byte_size;
const auto response_size = arg.output.Feature().v;
auto rg_count = CeilDiv(response_size, responses_per_sg_exec);
- run_info.lws0 = sub_group_size;
+ dispatchData.lws[0] = sub_group_size;
// Number of work items needed to process all response groups.
- run_info.gws0 = rg_count * sub_group_size;
- run_info.lws1 = run_info.lws2 = 1;
- run_info.gws1 = run_info.gws2 = 1;
+ dispatchData.gws[0] = rg_count * sub_group_size;
+ dispatchData.lws[1] = dispatchData.lws[2] = 1;
+ dispatchData.gws[1] = dispatchData.gws[2] = 1;
- run_info.unit_byte_size = unit_byte_size;
- run_info.chunk_type = chunk_type;
- run_info.chunk_byte_size = chunk_byte_size;
- run_info.units_per_chunk = units_per_chunk;
- run_info.bytes_per_sg_read = sub_group_size * chunk_byte_size;
- run_info.units_per_sg_read = units_per_sg_read;
- run_info.responses_per_sg_exec = responses_per_sg_exec;
- run_info.in_chunk_prefetch_size = 2;
- run_info.filter_chunk_prefetch_size = responses_per_sg_exec;
+ dispatchData.unit_byte_size = unit_byte_size;
+ dispatchData.chunk_type = chunk_type;
+ dispatchData.chunk_byte_size = chunk_byte_size;
+ dispatchData.units_per_chunk = units_per_chunk;
+ dispatchData.bytes_per_sg_read = sub_group_size * chunk_byte_size;
+ dispatchData.units_per_sg_read = units_per_sg_read;
+ dispatchData.responses_per_sg_exec = responses_per_sg_exec;
+ dispatchData.in_chunk_prefetch_size = 2;
+ dispatchData.filter_chunk_prefetch_size = responses_per_sg_exec;
- return run_info;
+ return dispatchData;
}
KernelsData FullyConnected_bs_f_bsv16_b1::GetKernelsData(const Params& params, const optional_params& optParams) const {
protected:
JitConstants GetJitConstants(const fully_connected_params& params,
- const FullyConnectedKernelBase::DispatchData& kd) const override;
+ const FullyConnectedKernelBase::DispatchData& dispatchData) const override;
DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_bs_f_bsv8_af8::DispatchData FullyConnected_bs_f_bsv8_af8::SetDefault(const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedBlockKernelBase::SetDefault(arg);
size_t groups_per_batches = GetLocalGroupsSize(arg);
- kd.gws0 =
+ dispatchData.gws[0] =
Align(arg.output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches),
8);
- kd.gws1 = groups_per_batches;
- kd.lws0 = 8;
- kd.lws1 = 1;
+ dispatchData.gws[1] = groups_per_batches;
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = 1;
- return kd;
+ return dispatchData;
}
static bool check_input_layout(const DataTensor& t) {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_fb_io_b8_f8::DispatchData FullyConnected_fb_io_b8_f8::SetDefault(const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedBlockKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedBlockKernelBase::SetDefault(arg);
const auto& output = arg.output;
size_t groups_per_batches = GetLocalGroupsSize(arg);
- kd.gws0 =
+ dispatchData.gws[0] =
Align(output.LogicalSize() / (GetNeuronsPerWorkItem(arg) * GetBatchesPerWorkItem(arg) * groups_per_batches), 8);
- kd.gws1 = groups_per_batches;
- kd.lws0 = 8;
- kd.lws1 = 1;
+ dispatchData.gws[1] = groups_per_batches;
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = 1;
- return kd;
+ return dispatchData;
}
bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params& o) const {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_fb_io_block::DispatchData FullyConnected_fb_io_block::SetDefault(const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedKernelBase::SetDefault(arg);
const auto& output = arg.output;
auto batch_size = output.Batch().v;
// for at least one input data set from batch.
auto rg_count = CeilDiv(response_size, units_per_sg_read);
- kd.lws0 = sub_group_size;
+ dispatchData.lws[0] = sub_group_size;
// Number of work items needed to process all response groups.
- kd.gws0 = rg_count * sub_group_size;
- kd.lws1 = 1;
- kd.gws1 = batch_size / units_per_sg_read;
-
- kd.unit_byte_size = unit_byte_size;
- kd.chunk_type = chunk_type;
- kd.chunk_byte_size = chunk_byte_size;
- kd.units_per_chunk = units_per_chunk;
- kd.bytes_per_sg_read = sub_group_size * chunk_byte_size;
- kd.units_per_sg_read = units_per_sg_read;
- kd.rg_count = (uint32_t)rg_count;
- kd.last_rg_size = response_size % units_per_sg_read;
- return kd;
+ dispatchData.gws[0] = rg_count * sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.gws[1] = batch_size / units_per_sg_read;
+
+ dispatchData.unit_byte_size = unit_byte_size;
+ dispatchData.chunk_type = chunk_type;
+ dispatchData.chunk_byte_size = chunk_byte_size;
+ dispatchData.units_per_chunk = units_per_chunk;
+ dispatchData.bytes_per_sg_read = sub_group_size * chunk_byte_size;
+ dispatchData.units_per_sg_read = units_per_sg_read;
+ dispatchData.rg_count = (uint32_t)rg_count;
+ dispatchData.last_rg_size = response_size % units_per_sg_read;
+ return dispatchData;
}
JitConstants FullyConnected_fb_io_block::GetJitConstants(const fully_connected_params& params,
- const FullyConnectedKernelBase::DispatchData& run_info) const {
- auto cldnn_jit = FullyConnectedKernelBase::GetJitConstants(params, run_info);
+ const FullyConnectedKernelBase::DispatchData& dispatchData) const {
+ auto cldnn_jit = FullyConnectedKernelBase::GetJitConstants(params, dispatchData);
cldnn_jit.AddConstants({
- MakeJitConstant("SUB_GROUP_SIZE", run_info.lws0),
- MakeJitConstant("WORK_ITEMS_PER_BATCH", run_info.gws1),
- MakeJitConstant("UNIT_BYTE_SIZE", run_info.unit_byte_size),
- MakeJitConstant("CHUNK_TYPE", run_info.chunk_type),
- MakeJitConstant("CHUNK_BYTE_SIZE", run_info.chunk_byte_size),
- MakeJitConstant("UNITS_PER_CHUNK", run_info.units_per_chunk),
- MakeJitConstant("BYTES_PER_SG_READ", run_info.bytes_per_sg_read),
- MakeJitConstant("UNITS_PER_SG_READ", run_info.units_per_sg_read),
- MakeJitConstant("RG_COUNT", run_info.rg_count),
- MakeJitConstant("LAST_RG_SIZE", run_info.last_rg_size),
+ MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]),
+ MakeJitConstant("WORK_ITEMS_PER_BATCH", dispatchData.gws[1]),
+ MakeJitConstant("UNIT_BYTE_SIZE", dispatchData.unit_byte_size),
+ MakeJitConstant("CHUNK_TYPE", dispatchData.chunk_type),
+ MakeJitConstant("CHUNK_BYTE_SIZE", dispatchData.chunk_byte_size),
+ MakeJitConstant("UNITS_PER_CHUNK", dispatchData.units_per_chunk),
+ MakeJitConstant("BYTES_PER_SG_READ", dispatchData.bytes_per_sg_read),
+ MakeJitConstant("UNITS_PER_SG_READ", dispatchData.units_per_sg_read),
+ MakeJitConstant("RG_COUNT", dispatchData.rg_count),
+ MakeJitConstant("LAST_RG_SIZE", dispatchData.last_rg_size),
});
return cldnn_jit;
}
protected:
bool Validate(const Params& p, const optional_params& o) const override;
JitConstants GetJitConstants(const fully_connected_params& params,
- const FullyConnectedKernelBase::DispatchData& kd) const override;
+ const FullyConnectedKernelBase::DispatchData& dispatchData) const override;
DispatchData SetDefault(const fully_connected_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
return k;
}
-JitConstants FullyConnected_fb_io_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnected_fb_io_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
FullyConnected_fb_oi_b8_ref::DispatchData FullyConnected_fb_oi_b8_ref::SetDefault(const fully_connected_params& arg,
int) const {
- auto kd = FullyConnectedKernelBase::SetDefault(arg);
+ auto dispatchData = FullyConnectedKernelBase::SetDefault(arg);
const auto& output = arg.output;
- kd.gws0 = output.Batch().v;
- kd.gws1 = output.LogicalSize() / kd.gws0;
- kd.lws0 = 8;
- kd.lws1 = 1;
+ dispatchData.gws[0] = output.Batch().v;
+ dispatchData.gws[1] = output.LogicalSize() / dispatchData.gws[0];
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = 1;
- return kd;
+ return dispatchData;
}
bool FullyConnected_fb_oi_b8_ref::Validate(const Params& p, const optional_params& o) const {
}
-JitConstants FullyConnected_fb_oi_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnected_fb_oi_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
FullyConnected_fs_byx_fsv32::Parent::DispatchData FullyConnected_fs_byx_fsv32::SetDefault(
const fully_connected_params& params,
int autoTuneIndex) const {
- auto runInfo = Parent::SetDefault(params, autoTuneIndex);
+ auto dispatchData = Parent::SetDefault(params, autoTuneIndex);
auto blockSizeB = std::min(outputBlockSizeB, params.output.Batch().v);
auto blockNumB = CeilDiv(params.output.Batch().v, blockSizeB);
auto wgHeight = std::min(preferredWGHeight, blockNumB);
- runInfo.gws0 = CeilDiv(params.output.Feature().v, outputBlockSizeF);
- runInfo.gws1 = RoundUp(blockNumB, wgHeight);
- runInfo.gws2 = subGroupSize;
+ dispatchData.gws[0] = CeilDiv(params.output.Feature().v, outputBlockSizeF);
+ dispatchData.gws[1] = RoundUp(blockNumB, wgHeight);
+ dispatchData.gws[2] = subGroupSize;
- runInfo.lws0 = 1;
- runInfo.lws1 = wgHeight;
- runInfo.lws2 = subGroupSize;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = wgHeight;
+ dispatchData.lws[2] = subGroupSize;
- runInfo.efficiency = FORCE_PRIORITY_5;
+ dispatchData.efficiency = FORCE_PRIORITY_5;
- return runInfo;
+ return dispatchData;
}
JitConstants FullyConnected_fs_byx_fsv32::GetJitConstants(const fully_connected_params& params,
- const DispatchData& kd) const {
- auto jit = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
auto blockSizeB = std::min(outputBlockSizeB, params.output.Batch().v);
auto blockNumB = CeilDiv(params.output.Batch().v, blockSizeB);
protected:
ParamsKey GetSupportedKey() const override;
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
int) const {
const int simdSize = 16;
- auto runInfo = Parent::SetDefault(params);
+ auto dispatchData = Parent::SetDefault(params);
- runInfo.gws0 = RoundUp(params.output.Feature().v, simdSize);
- runInfo.gws1 = params.output.Batch().v;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = RoundUp(params.output.Feature().v, simdSize);
+ dispatchData.gws[1] = params.output.Batch().v;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = simdSize;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = simdSize;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
} // SetDefault
bool FullyConnectedKernelIMAD::Validate(const Params& params, const optional_params& options) const {
return true;
} // Validate
-JitConstants FullyConnectedKernelIMAD::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- auto jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnectedKernelIMAD::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
protected:
bool Validate(const Params& params, const optional_params& options) const override;
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params,
int) const {
FullyConnectedTuningData tuning_data = SetTuningParams(params);
- auto runInfo = Parent::SetDefault(params);
+ auto dispatchData = Parent::SetDefault(params);
const auto& output = params.output;
- std::vector<size_t> global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
- std::vector<size_t> local = { tuning_data.work_group_size, 1, 1 };
+ dispatchData.gws = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
+ dispatchData.lws = { tuning_data.work_group_size, 1, 1 };
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
FullyConnectedTuningData tuning_data = SetTuningParams(params);
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
auto& input = params.inputs[0];
auto& weights = params.weights;
};
protected:
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
return k;
}
-JitConstants FullyConnected_yxfb_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants FullyConnected_yxfb_ref::GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
FusedOpsConfiguration conf = { "", {"b", "f", "y", "x"}, "result", input_dt, 1 };
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
}
JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_eltwise_params& params,
- const DispatchData& kd) const {
+ const DispatchData& dispatchData) const {
JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
const auto& padding = params.conv.padding;
const auto& input = params.inputs[0];
std::vector<uint32_t> unrollLoopParams{params.conv.filterSize.x,
params.conv.filterSize.y,
params.conv.filterSize.z,
- (uint32_t)kd.gemmStyle.globalWorkSizeDX,
- (uint32_t)kd.gemmStyle.globalWorkSizeDY,
- (uint32_t)kd.gemmStyle.globalWorkSizeDZ,
- (uint32_t)kd.gemmStyle.subBlockDimM,
- (uint32_t)kd.gemmStyle.subBlockDimK,
- (uint32_t)kd.gemmStyle.subBlockDimN};
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDX,
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDY,
+ (uint32_t)dispatchData.gemmStyle.globalWorkSizeDZ,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimM,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimK,
+ (uint32_t)dispatchData.gemmStyle.subBlockDimN};
auto loopCount = *std::max_element(unrollLoopParams.begin(), unrollLoopParams.end());
return mem_consts;
}
-bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& kd) {
- if (kd.gws0 == 0 || kd.gws1 == 0 || kd.gws2 == 0 || kd.lws0 == 0 || kd.lws1 == 0 || kd.lws2 == 0) {
+bool fused_conv_eltwise_kernel_base::CheckWorkGroups(const fused_conv_eltwise_kernel_base::DispatchData& dispatchData) {
+ if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
return false;
- }
- if ((kd.gws0 % kd.lws0) != 0 || (kd.gws1 % kd.lws1) != 0 || (kd.gws2 % kd.lws2) != 0) {
- return false;
+ for (size_t i = 0; i < dispatchData.gws.size(); i++) {
+ if (dispatchData.gws[i] == 0 || dispatchData.lws[i] == 0)
+ return false;
+ if ((dispatchData.gws[i] % dispatchData.lws[i]) != 0)
+ return false;
}
return true;
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::SetDefault(
const fused_conv_eltwise_params& params,
int) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& out = params.output;
- kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
- std::vector<size_t> global;
+
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf ||
params.output.GetLayout() == DataLayout::bfzyx || params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
params.output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
- global = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+ dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
} else {
- global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
+ dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
}
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- kd.cldnnStyle.blockWidth = 1;
- kd.cldnnStyle.blockHeight = 1;
- kd.cldnnStyle.prefetch = 0;
- kd.cldnnStyle.inputBlockArraySize = 0;
- kd.cldnnStyle.inputBlockWidth = 0;
-
- kd.gemmStyle.globalWorkSizeDX = 1;
- kd.gemmStyle.globalWorkSizeDY = 1;
- kd.gemmStyle.globalWorkSizeDZ = 1;
- kd.gemmStyle.subBlockDimK = 1;
- kd.gemmStyle.subBlockDimM = 0;
- kd.gemmStyle.subBlockDimN = 0;
- kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- return kd;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+
+ dispatchData.cldnnStyle.blockWidth = 1;
+ dispatchData.cldnnStyle.blockHeight = 1;
+ dispatchData.cldnnStyle.prefetch = 0;
+ dispatchData.cldnnStyle.inputBlockArraySize = 0;
+ dispatchData.cldnnStyle.inputBlockWidth = 0;
+
+ dispatchData.gemmStyle.globalWorkSizeDX = 1;
+ dispatchData.gemmStyle.globalWorkSizeDY = 1;
+ dispatchData.gemmStyle.globalWorkSizeDZ = 1;
+ dispatchData.gemmStyle.subBlockDimK = 1;
+ dispatchData.gemmStyle.subBlockDimM = 0;
+ dispatchData.gemmStyle.subBlockDimN = 0;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ return dispatchData;
}
KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& params,
if (NeedPaddedInput()) {
kd.reorderInput = CovolutionUpdateInputParams(newParams);
}
- DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
+ DispatchData dispatchData = SetDefault(newParams, autoTuneIndex);
- if (!CheckWorkGroups(runInfo)) {
+ if (!CheckWorkGroups(dispatchData)) {
// Internal Error - wrong calculation of global/local work group sizes
return {};
}
}
auto finalKernelName = GetKernelName(newParams);
- auto cldnnJit = GetJitConstants(newParams, runInfo);
+ auto cldnnJit = GetJitConstants(newParams, dispatchData);
auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
finalKernelName,
jit,
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
}
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
kd.autoTuneIndex = autoTuneIndex;
return {kd};
virtual std::string GetKernelName(const fused_conv_eltwise_params&) const { return kernelName; }
virtual bool NeedPaddedInput() const { return false; }
bool Validate(const Params& p, const optional_params& o) const override;
- virtual JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& dispatchData) const;
virtual DispatchData SetDefault(const fused_conv_eltwise_params& params, int autoTuneIndex = -1) const;
static bool CheckWorkGroups(const DispatchData&);
static bool CheckPitchForSplitOnly(const fused_conv_eltwise_params& params);
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_1x1_opt::SetDefault(
const fused_conv_eltwise_params& arg,
int) const {
- DispatchData runInfo = Parent::SetDefault(arg);
+ DispatchData dispatchData = Parent::SetDefault(arg);
constexpr size_t sub_group_size = 8;
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
auto block = get_out_block_size(arg);
- runInfo.gws0 = arg.output.X().v / block.out_width;
- runInfo.gws1 = arg.output.Y().v / block.out_height;
- runInfo.gws2 = 2 * (arg.output.Feature().v * arg.output.Batch().v) /
- block.out_depth; // process 8 output channels per Workitem
+ dispatchData.gws[0] = arg.output.X().v / block.out_width;
+ dispatchData.gws[1] = arg.output.Y().v / block.out_height;
+ dispatchData.gws[2] = 2 * (arg.output.Feature().v * arg.output.Batch().v) /
+ block.out_depth; // process 8 output channels per Workitem
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 2 * sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 2 * sub_group_size;
- return runInfo;
+ return dispatchData;
}
JitConstants fused_conv_eltwise_kernel_bfyx_1x1_opt::GetJitConstants(const fused_conv_eltwise_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
auto block = get_out_block_size(params);
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override;
std::string GetKernelName(const fused_conv_eltwise_params& params) const override;
bool NeedPaddedInput() const override { return true; }
- JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_iyxo::SetDefault(
const fused_conv_eltwise_params& cp,
int) const {
- DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);
+ DispatchData dispatchData = fused_conv_eltwise_kernel_base::SetDefault(cp);
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
- runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4 / 2;
- runInfo.gws1 = cp.output.Y().v / 2;
- runInfo.gws2 = sub_group_size;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, sub_group_size) / 4 / 2;
+ dispatchData.gws[1] = cp.output.Y().v / 2;
+ dispatchData.gws[2] = sub_group_size;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool fused_conv_eltwise_kernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
}
JitConstants fused_conv_eltwise_kernel_bfyx_iyxo::GetJitConstants(const fused_conv_eltwise_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
return jit;
}
WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params&) const override {
return WeightsLayout::iyxo;
}
- JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
option.blockWidth = 4;
option.blockHeight = 3;
option.prefetch = 5;
- // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
}
// if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::SetDefault(
const fused_conv_eltwise_params& cp,
int autoTuneIndex) const {
- DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);
+ DispatchData dispatchData = fused_conv_eltwise_kernel_base::SetDefault(cp);
const auto of_maps = cp.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
- runInfo.efficiency = FORCE_PRIORITY_3;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
+ dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
+ dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
+ dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;
- auto input_block_dims = get_bfyx_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
- runInfo.cldnnStyle.blockHeight,
+ auto input_block_dims = get_bfyx_req_input_block_dims(dispatchData.cldnnStyle.blockWidth,
+ dispatchData.cldnnStyle.blockHeight,
cp.conv.filterSize,
cp.conv.stride,
cp.conv.dilation,
sub_group_size,
- runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
+ cp.output.GetDType() == Datatype::F16 ? sub_group_size : sub_group_size / 2,
sub_group_size);
- runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
- runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
+ dispatchData.cldnnStyle.inputBlockArraySize = input_block_dims.first;
+ dispatchData.cldnnStyle.inputBlockWidth = input_block_dims.second;
- runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
- runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
- runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth);
+ dispatchData.gws[1] = CeilDiv(cp.output.Y().v, dispatchData.cldnnStyle.blockHeight);
+ dispatchData.gws[2] = of_threads_per_batch * cp.output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = sub_group_size;
- return runInfo;
+ return dispatchData;
}
bool fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetJitConstants(const fused_conv_eltwise_params& params,
- const DispatchData& runInfo) const {
+ const DispatchData& dispatchData) const {
const auto of_maps = params.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
size_t leftovers = of_threads_per_batch - of_maps;
- auto jit = Parent::GetJitConstants(params, runInfo);
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
+ jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
+ jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", dispatchData.cldnnStyle.inputBlockArraySize));
+ jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
+ jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch));
if (leftovers) {
jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
protected:
WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override;
- JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& dispatchData) const override;
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
std::vector<AutoTuneOption> autoTuneOptions = {};
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_yxfb_yxio_b16::SetDefault(
const fused_conv_eltwise_params& arg,
int) const {
- DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg);
+ DispatchData dispatchData = fused_conv_eltwise_kernel_base::SetDefault(arg);
const auto filter_ofm_num = arg.weights.OFM().v;
const auto batch_size = arg.output.Batch().v;
const size_t ofmPerWorkItem = GetOfmPerWorkitem(arg.inputs[0].GetDType());
if (arg.inputs[0].GetDType() == Datatype::F16) {
- runInfo.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
} else {
- runInfo.efficiency = FORCE_PRIORITY_9;
+ dispatchData.efficiency = FORCE_PRIORITY_9;
}
- runInfo.lws0 = min_lws;
- runInfo.gws0 = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem);
+ dispatchData.lws[0] = min_lws;
+ dispatchData.gws[0] = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem);
- return runInfo;
+ return dispatchData;
}
bool fused_conv_eltwise_kernel_yxfb_yxio_b16::Validate(const Params& p, const optional_params& o) const {
}
JitConstants fused_conv_eltwise_kernel_yxfb_yxio_b16::GetJitConstants(const fused_conv_eltwise_params& params,
- const DispatchData& kd) const {
- auto jit = Parent::GetJitConstants(params, kd);
+ const DispatchData& dispatchData) const {
+ auto jit = Parent::GetJitConstants(params, dispatchData);
- const auto local_work_group_size = kd.lws0;
+ const auto local_work_group_size = dispatchData.lws[0];
const auto batch_size = params.output.Batch().v;
if (params.inputs[0].GetDType() == Datatype::F32) {
const size_t ofmPerWorkItem = GetOfmPerWorkitem(params.inputs[0].GetDType());
jit.AddConstants({
- MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0),
+ MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]),
MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem),
MakeJitConstant("BATCHES_PER_WORK_ITEM",
batchesPerWorkItem), // how many batches will a single work item compute
}
std::string GetKernelName(const fused_conv_eltwise_params&) const override;
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
} else if (size == 6) {
idx_order = {"b", "f", "w", "z", "y", "x"};
}
-
+
return idx_order;
}
for (size_t i = dictionary_dims_num; i < idx_order.size(); i++)
idx_order[i] = zeroVal;
-
+
// Fix size to inputs[0] dims size
for (size_t i = 0; i < params.output.GetDims().size() - params.inputs[0].GetDims().size(); i++)
idx_order.pop_back();
}
CommonDispatchData GatherKernelRef::SetDefault(const gather_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
const auto& output = params.output;
- std::vector<size_t> global;
- std::vector<size_t> local;
-
if (output.GetLayout() == DataLayout::bfyx) {
- global = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
} else if (output.GetLayout() == DataLayout::bfzyx) {
- global = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
} else {
- global = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
}
- local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return runInfo;
+ return dispatchData;
}
JitConstants GatherKernelRef::GetJitConstants(const gather_params& params) const {
KernelData kd = KernelData::Default<gather_params>(params);
gather_params& newParams = *static_cast<gather_params*>(kd.params.get());
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2, GetFusedPrimitiveInputsCount(params));
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2, GetFusedPrimitiveInputsCount(params));
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
- std::vector<size_t> global{
- params.output.Y().v, // beam
- params.output.Feature().v, // batch
- 1
- };
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ DispatchData dispatchData;
/*
b -> time
f -> batch
y -> beam
*/
- DispatchData data;
- data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
- data.gws0 = global[0];
- data.gws1 = global[1];
- data.gws2 = global[2];
- data.lws0 = local[0];
- data.lws1 = local[1];
- data.lws2 = local[2];
- return data;
+ dispatchData.gws = { params.output.Y().v, // beam
+ params.output.Feature().v, // batch
+ 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+ return dispatchData;
}
KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
- const optional_params& options,
- float estimated_time) const {
+ const optional_params& options,
+ float estimated_time) const {
assert(params.GetType() == KernelType::GATHER_TREE);
const auto& gt_params = static_cast<const gather_tree_params&>(params);
- auto run_info = SetDefault(gt_params);
+ auto dispatchData = SetDefault(gt_params);
auto kernel_data = KernelData::Default<gather_tree_params>(params);
auto cldnn_jit = GetJitConstants(gt_params);
auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
FillCLKernelData(kernel_data.kernels[0],
- run_info,
- params.engineInfo,
- kernelName,
- jit,
- entry_point,
- DEFAULT,
- false,
- false,
- static_cast<int>(gt_params.inputs.size()));
+ dispatchData,
+ params.engineInfo,
+ kernelName,
+ jit,
+ entry_point,
+ DEFAULT,
+ false,
+ false,
+ static_cast<int>(gt_params.inputs.size()));
kernel_data.estimatedTime = estimated_time;
return { kernel_data };
}
/*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
GemmKernelBase::DispatchData GemmKernelBase::SetDefault(const gemm_params& params) const {
const auto& output = params.output;
- DispatchData kd;
-
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ DispatchData dispatchData;
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
- std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
-
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.gws = { output.X().v, output.Y().v, total_batches };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return kd;
+ return dispatchData;
}
KernelsData GemmKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params = static_cast<const gemm_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<gemm_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel,
- run_info,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
virtual DispatchData SetDefault(const gemm_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
// Fused ops
- virtual JitConstants GetFusedPrimitivesJitConstants(const gemm_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetFusedPrimitivesJitConstants(const gemm_params& params, const DispatchData& dispatchData) const;
Datatype GetActivationType(const gemm_params& params) const;
// --Fused ops
const auto& output = params.output;
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
- DispatchData kd;
+ DispatchData dispatchData;
GemmTuningData td = SetTuningParams(params);
- std::vector<size_t> global = { Align(output.X().v, td.simd_size),
- Align(output.Y().v, td.simd_size * td.tile_num) / (td.simd_size * td.tile_num),
- total_batches };
+ dispatchData.gws = { Align(output.X().v, td.simd_size),
+ Align(output.Y().v, td.simd_size * td.tile_num) / (td.simd_size * td.tile_num),
+ total_batches };
+ dispatchData.lws = { td.simd_size, 1, 1 };
- std::vector<size_t> local = { td.simd_size, 1, 1 };
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
GemmKernelMMADint8::GemmTuningData GemmKernelMMADint8::InitGemmTuningData(const gemm_params& params) const {
const auto& prim_params = static_cast<const gemm_params&>(params);
- auto run_info = GemmKernelMMADint8::SetDefault(prim_params);
+ auto dispatchData = GemmKernelMMADint8::SetDefault(prim_params);
KernelData k_data = KernelData::Default<gemm_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel,
- run_info,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
const auto& output = params.output;
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
- DispatchData kd;
+ DispatchData dispatchData;
GemmTuningData td = SetTuningParams(params);
- std::vector<size_t> global = { td.size_n / td.pack_size, output.Y().v / td.simd_size, total_batches };
- std::vector<size_t> local = { td.slm_tile_size / td.pack_size, td.slm_tile_size / td.simd_size, 1 };
+ dispatchData.gws = { td.size_n / td.pack_size, output.Y().v / td.simd_size, total_batches };
+ dispatchData.lws = { td.slm_tile_size / td.pack_size, td.slm_tile_size / td.simd_size, 1 };
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
GemmKernelMMADslmInt8::GemmTuningData GemmKernelMMADslmInt8::InitGemmTuningData(const gemm_params& params) const {
const auto& prim_params = static_cast<const gemm_params&>(params);
- auto run_info = GemmKernelMMADslmInt8::SetDefault(prim_params);
+ auto dispatchData = GemmKernelMMADslmInt8::SetDefault(prim_params);
KernelData k_data = KernelData::Default<gemm_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel,
- run_info,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& params) const {
const auto& output = params.output;
- DispatchData kd;
+ DispatchData dispatchData;
GemmTuningData td = SetTuningParams(params);
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
- kd.gws0 = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
- kd.gws1 = Align(global[1], td.tile_m_size) / td.tile_m_size;
- kd.gws2 = global[2];
+ dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
+ dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
+ dispatchData.gws[2] = global[2];
- kd.lws0 = td.simd_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = td.simd_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gemm_params& params) const {
GRNKernelBase::DispatchData GRNKernelBase::SetDefault(const grn_params& params) const {
const auto& output = params.output;
- DispatchData kd;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ DispatchData dispatchData;
+ dispatchData.gws = { output.Batch().v, output.Y().v, output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- std::vector<size_t> global = { output.Batch().v, output.Y().v, output.X().v };
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData GRNKernelBase::GetCommonKernelsData(const Params& params,
const grn_params& orgParams = static_cast<const grn_params&>(params);
- DispatchData runInfo;
-
- runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<grn_params>(params);
- auto cldnn_jit = GetJitConstants(orgParams, runInfo);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
using DispatchData = CommonDispatchData;
protected:
- virtual JitConstants GetJitConstants(const grn_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const grn_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const grn_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
};
}
CommonDispatchData LRNKernelAcrossChannelMultipleFeatures::SetDefault(const lrn_params& params) const {
- CommonDispatchData runInfo = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
const auto& input = params.inputs[0];
unsigned int ofm_per_simd = GetOfmPerSimd(params);
const auto& out = params.output;
const unsigned int alignment = out.X().v > 16 ? 32 : 16;
- runInfo.gws0 = Align(out.X().v, alignment);
- runInfo.gws1 = out.Y().v;
- runInfo.gws2 = (out.Feature().v * out.Batch().v) / ofm_per_simd;
+ dispatchData.gws[0] = Align(out.X().v, alignment);
+ dispatchData.gws[1] = out.Y().v;
+ dispatchData.gws[2] = (out.Feature().v * out.Batch().v) / ofm_per_simd;
- runInfo.lws0 = alignment;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = alignment;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
} else if (input.GetLayout() == DataLayout::yxfb) {
- runInfo.gws0 /= ofm_per_simd;
- runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (runInfo.gws0 % runInfo.lws0 != 0) {
- --runInfo.lws0;
+ dispatchData.gws[0] /= ofm_per_simd;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
}
- runInfo.efficiency = FORCE_PRIORITY_6;
+ dispatchData.efficiency = FORCE_PRIORITY_6;
- return runInfo;
+ return dispatchData;
}
bool LRNKernelAcrossChannelMultipleFeatures::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants LRNKernelAcrossChannelMultipleFeatures::GetJitConstants(const lrn_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants LRNKernelAcrossChannelMultipleFeatures::GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto& input = params.inputs[0];
const auto& input_dt = params.inputs[0].GetDType();
const auto& output = params.output;
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
}
CommonDispatchData LRNKernelAcrossChannelMultipleFeaturesFSV16::SetDefault(const lrn_params& params) const {
- CommonDispatchData runInfo = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
const auto& out = params.output;
const unsigned int alignment = 16;
- std::vector<size_t> global = {Align(out.Feature().v, alignment),
- out.X().v,
- out.Y().v * out.Batch().v};
+ dispatchData.gws = { Align(out.Feature().v, alignment),
+ out.X().v,
+ out.Y().v * out.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.efficiency = FORCE_PRIORITY_6;
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- runInfo.efficiency = FORCE_PRIORITY_6;
-
- return runInfo;
+ return dispatchData;
}
-JitConstants LRNKernelAcrossChannelMultipleFeaturesFSV16::GetJitConstants(const lrn_params& params, const DispatchData& kd) const {
- JitConstants jit = LRNKernelBase::GetJitConstants(params, kd);
+JitConstants LRNKernelAcrossChannelMultipleFeaturesFSV16::GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = LRNKernelBase::GetJitConstants(params, dispatchData);
const auto& input_dt = params.inputs[0].GetDType();
if (!params.fused_ops.empty()) {
private:
DispatchData SetDefault(const lrn_params& params) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
}
CommonDispatchData LRNKernelAcrossChannel_b8::SetDefault(const lrn_params& params) const {
- CommonDispatchData run_info = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
- run_info.gws0 /= 8;
- run_info.lws0 = 8; // gws0 is dividable by 64, so after correction it will be dividable by 8.
+ dispatchData.gws[0] /= 8;
+    dispatchData.lws[0] = 8; // gws[0] is divisible by 64, so after correction it will be divisible by 8.
- return run_info;
+ return dispatchData;
}
bool LRNKernelAcrossChannel_b8::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants LRNKernelAcrossChannel_b8::GetJitConstants(const lrn_params& params, const DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+JitConstants LRNKernelAcrossChannel_b8::GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto& input_dt = params.inputs[0].GetDType();
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
}
CommonDispatchData LRNKernelAcrossChannelRef::SetDefault(const lrn_params& params) const {
- CommonDispatchData runInfo = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
if (params.inputs[0].GetLayout() == DataLayout::bfyx) {
const auto& out = params.output;
- runInfo.gws0 = Align(out.X().v, 32);
- runInfo.gws1 = out.Y().v;
- runInfo.gws2 = out.Feature().v * out.Batch().v;
+ dispatchData.gws[0] = Align(out.X().v, 32);
+ dispatchData.gws[1] = out.Y().v;
+ dispatchData.gws[2] = out.Feature().v * out.Batch().v;
- runInfo.lws0 = 32;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- return runInfo;
+ return dispatchData;
}
JitConstants LRNKernelAcrossChannelRef::GetJitConstants(const lrn_params& params,
- const LRNKernelBase::DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ const LRNKernelBase::DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto& input_dt = params.inputs[0].GetDType();
if (!params.fused_ops.empty()) {
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return true;
}
-JitConstants LRNKernelBase::GetJitConstants(const lrn_params& params, const LRNKernelBase::DispatchData& kd) const {
+JitConstants LRNKernelBase::GetJitConstants(const lrn_params& params, const LRNKernelBase::DispatchData& /*dispatchData*/) const {
JitConstants mem_consts = MakeBaseParamsJitConstants(params);
const auto padding = (params.localSize - 1) / 2;
auto alpha_div_by_size_abs_sqrt = std::sqrt(std::abs(alpha_div_by_size));
mem_consts.AddConstants({
- MakeJitConstant("ALPHA_AFTER_FACTORED", kd.fp16UnitUsed ? alpha_sign : alpha),
- MakeJitConstant("ALPHA_DIV_BY_SIZE", kd.fp16UnitUsed ? alpha_sign : alpha_div_by_size),
- MakeJitConstant("ALPHA_VAL_FACTOR", kd.fp16UnitUsed ? alpha_abs_sqrt : 1.0f),
- MakeJitConstant("ALPHA_VAL_FACTOR_DIV_BY_SIZE", kd.fp16UnitUsed ? alpha_div_by_size_abs_sqrt : 1.0f),
+ MakeJitConstant("ALPHA_AFTER_FACTORED", params.inputs[0].GetDType() == Datatype::F16 ? alpha_sign : alpha),
+ MakeJitConstant("ALPHA_DIV_BY_SIZE", params.inputs[0].GetDType() == Datatype::F16 ? alpha_sign : alpha_div_by_size),
+ MakeJitConstant("ALPHA_VAL_FACTOR", params.inputs[0].GetDType() == Datatype::F16 ? alpha_abs_sqrt : 1.0f),
+ MakeJitConstant("ALPHA_VAL_FACTOR_DIV_BY_SIZE", params.inputs[0].GetDType() == Datatype::F16 ? alpha_div_by_size_abs_sqrt : 1.0f),
});
return mem_consts;
LRNKernelBase::DispatchData LRNKernelBase::SetDefault(const lrn_params& params) const {
const auto& output = params.output;
- DispatchData kd;
+ DispatchData dispatchData;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
// Determine global work sizes.
- kd.gws0 = output.Batch().v * output.Feature().v; // B, F
- kd.gws1 = output.X().v; // X
- kd.gws2 = output.Y().v; // Y
+ dispatchData.gws[0] = output.Batch().v * output.Feature().v; // B, F
+ dispatchData.gws[1] = output.X().v; // X
+ dispatchData.gws[2] = output.Y().v; // Y
// Find largest positive local work size that is divider for global work size.
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData LRNKernelBase::GetCommonKernelsData(const Params& params,
const lrn_params& orgParams = static_cast<const lrn_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<lrn_params>(params);
- auto cldnnJit = GetJitConstants(orgParams, runInfo);
+ auto cldnnJit = GetJitConstants(orgParams, dispatchData);
auto entryPoint = GetEntryPoint(kernelName, orgParams.layerID, options);
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
auto fused_deps_total = GetFusedPrimitiveInputsCount(params);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- virtual JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const;
+ virtual JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const;
virtual DispatchData SetDefault(const lrn_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
};
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return k;
}
-JitConstants LRNKernelRef::GetJitConstants(const lrn_params& params, const LRNKernelRef::Parent::DispatchData& kd) const {
+JitConstants LRNKernelRef::GetJitConstants(const lrn_params& params, const LRNKernelRef::Parent::DispatchData& dispatchData) const {
const uint32_t round_norm_size = (params.localSize / 2) * 2 + 1;
uint32_t numElement = round_norm_size * round_norm_size;
const auto& input_dt = params.inputs[0].GetDType();
const float num_element_div = 1.f / static_cast<float>(numElement);
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstants({
MakeJitConstant("NUM_ELEMENTS_DIV", num_element_div),
MakeJitConstant("GWS_BATCH", 2),
}
LRNKernelRef::Parent::DispatchData LRNKernelRef::SetDefault(const lrn_params& params) const {
- DispatchData kd = Parent::SetDefault(params);
+ DispatchData dispatchData = Parent::SetDefault(params);
const auto& out = params.output;
- std::vector<size_t> global = {out.X().v * out.Y().v, out.Feature().v, out.Batch().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { out.X().v * out.Y().v, out.Feature().v, out.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData LRNKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
/*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return k;
}
-JitConstants LRNKernelWithinChannelByxfOpt::GetJitConstants(
- const lrn_params& params,
- const LRNKernelBase::DispatchData& kd) const {
+JitConstants LRNKernelWithinChannelByxfOpt::GetJitConstants(const lrn_params& params,
+ const LRNKernelBase::DispatchData& dispatchData) const {
const uint32_t round_norm_size = (params.localSize / 2) * 2 + 1;
uint32_t numElement = round_norm_size * round_norm_size;
const auto& input_dt = params.inputs[0].GetDType();
const float num_element_div = 1.f / static_cast<float>(numElement);
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
jit.AddConstants({
MakeJitConstant("NUM_ELEMENTS_DIV", num_element_div),
MakeJitConstant("GWS_BATCH", 2),
LRNKernelWithinChannelByxfOpt::Parent::DispatchData LRNKernelWithinChannelByxfOpt::SetDefault(
const lrn_params& params) const {
- DispatchData kd = Parent::SetDefault(params);
+ DispatchData dispatchData = Parent::SetDefault(params);
const auto& out = params.output;
- std::vector<size_t> global = {out.X().v * out.Y().v, CeilDiv(out.Feature().v, 8), out.Batch().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, 8), out.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
bool LRNKernelWithinChannelByxfOpt::Validate(const Params& p, const optional_params& o) const {
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData LRNKernelWithinChannel::SetDefault(const lrn_params& params) const {
- CommonDispatchData runInfo = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
- runInfo.gws0 = 128 * 128;
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = 128 * 128;
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = 128;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 128;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
JitConstants LRNKernelWithinChannel::GetJitConstants(const lrn_params& params,
- const LRNKernelWithinChannel::Parent::DispatchData& kd) const {
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ const LRNKernelWithinChannel::Parent::DispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
const auto& input_dt = params.inputs[0].GetDType();
if (!params.fused_ops.empty()) {
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData LRNKernelWithinChannelOpt::SetDefault(const lrn_params& params) const {
- CommonDispatchData runInfo = LRNKernelBase::SetDefault(params);
+ CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
const auto totalSize = params.inputs[0].LogicalSize();
const unsigned work_group_size = (totalSize < 128) ? 32 : 128;
- runInfo.gws0 = Align(params.inputs[0].LogicalSize(), work_group_size);
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = Align(params.inputs[0].LogicalSize(), work_group_size);
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = work_group_size;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = work_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return runInfo;
+ return dispatchData;
}
bool LRNKernelWithinChannelOpt::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants LRNKernelWithinChannelOpt::GetJitConstants(const lrn_params& params, const LRNKernelWithinChannelOpt::Parent::DispatchData& kd) const {
+JitConstants LRNKernelWithinChannelOpt::GetJitConstants(const lrn_params& params, const LRNKernelWithinChannelOpt::Parent::DispatchData& dispatchData) const {
const auto& input_dt = params.inputs[0].GetDType();
- JitConstants jit = Parent::GetJitConstants(params, kd);
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
FusedOpsConfiguration conf = {"", {"batch_id", "feature_id", "y", "x"}, "lrn_result", input_dt, 1};
FusedOpType::ACTIVATION };
}
bool Validate(const Params& params, const optional_params& options) const override;
- JitConstants GetJitConstants(const lrn_params& params, const DispatchData& kd) const override;
+ JitConstants GetJitConstants(const lrn_params& params, const DispatchData& dispatchData) const override;
};
} // namespace kernel_selector
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return {};
}
- DispatchData run_info;
+ DispatchData dispatchData;
KernelData kd = KernelData::Default<lstm_dynamic_input_params>(params);
lstm_dynamic_input_params& dlstm_params = *static_cast<lstm_dynamic_input_params*>(kd.params.get());
const auto& out = dlstm_params.output;
auto hidden_size = out.X().v;
- std::vector<size_t> global = { hidden_size / simd_size, out.Batch().v * out.Y().v, out.Feature().v };
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- run_info.gws0 = global[0];
- run_info.gws1 = global[1];
- run_info.gws2 = global[2];
-
- run_info.lws0 = local[0];
- run_info.lws1 = local[1];
- run_info.lws2 = local[2];
-
- run_info.fp16UnitUsed = dlstm_params.inputs[0].GetDType() == Datatype::F16;
+ dispatchData.gws = { hidden_size / simd_size, out.Batch().v * out.Y().v, out.Feature().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
bool succeed = UpdateWeightsParams(dlstm_params,
options,
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- kernel.workGroups.global = { run_info.gws0, run_info.gws1, run_info.gws2 };
- kernel.workGroups.local = { run_info.lws0, run_info.lws1, run_info.lws2 };
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
SetKernelArguments(dlstm_params, kernel);
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
LSTM_DynamicInputKernelBase::DispatchData LSTM_DynamicInputKernelBase::SetDefault(
const lstm_dynamic_input_params& params) {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& out = params.output;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
// 4 * hidden, batch * dir, seq_len
- std::vector<size_t> global = {out.X().v, out.Batch().v * out.Y().v, out.Feature().v};
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { out.X().v, out.Batch().v * out.Y().v, out.Feature().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
void kernel_selector::LSTM_DynamicInputKernelBase::SetKernelArguments(const lstm_dynamic_input_params& params, clKernelData& kernel) const {
const lstm_dynamic_input_params& orgParams = static_cast<const lstm_dynamic_input_params&>(params);
- auto run_info = SetDefault(orgParams);
+ auto dispatchData = SetDefault(orgParams);
KernelData k_data = KernelData::Default<lstm_dynamic_input_params>(params, 1);
auto cldnn_jit = GetJitConstants(orgParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
- kernel.workGroups.global = {run_info.gws0, run_info.gws1, run_info.gws2};
+ kernel.workGroups.global = dispatchData.gws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
SetKernelArguments(orgParams, kernel);
/*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
LSTM_DynamicTimeloopKernelBase::DispatchData LSTM_DynamicTimeloopKernelBase::SetDefault(
const lstm_dynamic_timeloop_params& params) {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& out = params.output;
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
auto out_x_size = out.X().v;
auto gws0 = out_x_size > 256 ? 256 : out_x_size;
- std::vector<size_t> global = {gws0, out.Batch().v, static_cast<size_t>(params.direction)};
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { gws0, out.Batch().v, static_cast<size_t>(params.direction) };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
void kernel_selector::LSTM_DynamicTimeloopKernelBase::SetKernelArguments(const lstm_dynamic_timeloop_params& params, clKernelData& kernel) const {
const lstm_dynamic_timeloop_params& org_params = static_cast<const lstm_dynamic_timeloop_params&>(params);
- auto run_info = SetDefault(org_params);
+ auto dispatchData = SetDefault(org_params);
KernelData k_data = KernelData::Default<lstm_dynamic_timeloop_params>(params, 1);
auto cldnn_jit = GetJitConstants(org_params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
- kernel.workGroups.global = {run_info.gws0, run_info.gws1, run_info.gws2};
- kernel.workGroups.local = {run_info.lws0, run_info.lws1, run_info.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
SetKernelArguments(org_params, kernel);
k_data.estimatedTime = estimated_time;
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
MaxUnpoolingKernelBase::DispatchData MaxUnpoolingKernelBase::SetDefault(const max_unpooling_params& params) const {
const auto& input = params.inputs[0];
- DispatchData kd;
+ DispatchData dispatchData;
if (input.GetLayout() == DataLayout::bfyx || input.GetLayout() == DataLayout::byxf) {
// Determine global work sizes.
- kd.gws2 = input.Batch().v * input.Feature().v; // B, F
- kd.gws0 = Align(input.X().v, 32); // X
- kd.gws1 = input.Y().v; // Y
+ dispatchData.gws[2] = input.Batch().v * input.Feature().v; // B, F
+ dispatchData.gws[0] = Align(input.X().v, 32); // X
+ dispatchData.gws[1] = input.Y().v; // Y
- kd.lws0 = 32;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
} else {
// Determine global work sizes.
- kd.gws0 = input.Batch().v * input.Feature().v; // B, F
- kd.gws1 = input.X().v; // X
- kd.gws2 = input.Y().v; // Y
+ dispatchData.gws[0] = input.Batch().v * input.Feature().v; // B, F
+ dispatchData.gws[1] = input.X().v; // X
+ dispatchData.gws[2] = input.Y().v; // Y
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- return kd;
+ return dispatchData;
}
KernelsData MaxUnpoolingKernelBase::GetCommonKernelsData(const Params& params,
const max_unpooling_params& orgParams = static_cast<const max_unpooling_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<max_unpooling_params>(params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
kd.estimatedTime = estimatedTime;
}
MVNKernelBase::DispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefault(const mvn_params& params) const {
- auto kd = Parent::SetDefault(params);
+ auto dispatchData = Parent::SetDefault(params);
auto items_num = params.output.X().v * params.output.Y().v * params.output.Z().v;
auto max_wg = params.engineInfo.maxWorkGroupSize;
auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
- kd.gws0 = lws;
- kd.gws1 = CeilDiv(params.output.Feature().v, fsv);
- kd.gws2 = params.output.Batch().v;
+ dispatchData.gws[0] = lws;
+ dispatchData.gws[1] = CeilDiv(params.output.Feature().v, fsv);
+ dispatchData.gws[2] = params.output.Batch().v;
- kd.lws0 = lws;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = lws;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.itemsNum = 1;
+ dispatchData.itemsNum = 1;
- return kd;
+ return dispatchData;
}
-JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData kd) const {
- auto jits = Parent::GetJitConstants(params, kd);
+JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const {
+ auto jits = Parent::GetJitConstants(params, dispatchData);
auto activation_dt = GetActivationType(params);
jits.Merge(MakeTypeJitConstants(activation_dt, "MEAN"));
jits.AddConstant(MakeJitConstant("SIMD", simd));
- jits.AddConstant(MakeJitConstant("LWS", kd.lws0));
- jits.AddConstant(MakeJitConstant("GWS", kd.gws0));
- jits.AddConstant(MakeJitConstant("ITEM_GROUPS", kd.itemsNum));
+ jits.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0]));
+ jits.AddConstant(MakeJitConstant("GWS", dispatchData.gws[0]));
+ jits.AddConstant(MakeJitConstant("ITEM_GROUPS", dispatchData.itemsNum));
if (!params.fused_ops.empty()) {
std::vector<std::string> idx_order;
MVNKernel_b_fs_yx_fsv16_imad::MultiDispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefaultForMulti(
const mvn_params& params) const {
- MultiDispatchData md;
+ MultiDispatchData dispatchData;
auto items_num = params.output.X().v * params.output.Y().v * params.output.Z().v;
auto max_wg = params.engineInfo.maxWorkGroupSize;
// TODO Check if larger number of work-groups does not provide benefit
size_t item_groups = pref_work_groups;
- md.item_groups = item_groups;
+ dispatchData.item_groups = item_groups;
size_t stage1_lws = lws;
- md.stage_1.gws0 = stage1_lws * item_groups;
- md.stage_1.gws1 = CeilDiv(params.output.Feature().v, fsv);
- md.stage_1.gws2 = params.output.Batch().v;
+ dispatchData.stage_1.gws[0] = stage1_lws * item_groups;
+ dispatchData.stage_1.gws[1] = CeilDiv(params.output.Feature().v, fsv);
+ dispatchData.stage_1.gws[2] = params.output.Batch().v;
- md.stage_1.lws0 = stage1_lws;
- md.stage_1.lws1 = 1;
- md.stage_1.lws2 = 1;
+ dispatchData.stage_1.lws[0] = stage1_lws;
+ dispatchData.stage_1.lws[1] = 1;
+ dispatchData.stage_1.lws[2] = 1;
- md.stage_1.itemsNum = item_groups;
+ dispatchData.stage_1.itemsNum = item_groups;
size_t stage2_lws = std::max(std::min(item_groups, max_lws) / simd, (size_t)1) * simd;
- md.stage_2.gws0 = stage2_lws;
- md.stage_2.gws1 = CeilDiv(params.output.Feature().v, fsv);
- md.stage_2.gws2 = params.output.Batch().v;
+ dispatchData.stage_2.gws[0] = stage2_lws;
+ dispatchData.stage_2.gws[1] = CeilDiv(params.output.Feature().v, fsv);
+ dispatchData.stage_2.gws[2] = params.output.Batch().v;
- md.stage_2.lws0 = stage2_lws;
- md.stage_2.lws1 = 1;
- md.stage_2.lws2 = 1;
+ dispatchData.stage_2.lws[0] = stage2_lws;
+ dispatchData.stage_2.lws[1] = 1;
+ dispatchData.stage_2.lws[2] = 1;
- md.stage_2.itemsNum = item_groups;
+ dispatchData.stage_2.itemsNum = item_groups;
- md.stage_final.gws0 = std::max(items_num / simd, (size_t)1) * simd;
- md.stage_final.gws1 = CeilDiv(params.output.Feature().v, fsv);
- md.stage_final.gws2 = params.output.Batch().v;
+ dispatchData.stage_final.gws[0] = std::max(items_num / simd, (size_t)1) * simd;
+ dispatchData.stage_final.gws[1] = CeilDiv(params.output.Feature().v, fsv);
+ dispatchData.stage_final.gws[2] = params.output.Batch().v;
- md.stage_final.lws0 = simd;
- md.stage_final.lws1 = 1;
- md.stage_final.lws2 = 1;
+ dispatchData.stage_final.lws[0] = simd;
+ dispatchData.stage_final.lws[1] = 1;
+ dispatchData.stage_final.lws[2] = 1;
- md.stage_final.itemsNum = 1;
+ dispatchData.stage_final.itemsNum = 1;
- return md;
+ return dispatchData;
}
KernelsData MVNKernel_b_fs_yx_fsv16_imad::GetMultiStageKernelsData(const mvn_params& params,
constexpr size_t intermidiate_bytes = 4;
const mvn_params& orgParams = static_cast<const mvn_params&>(params);
- auto runInfo = SetDefaultForMulti(orgParams);
+ auto dispatchData = SetDefaultForMulti(orgParams);
size_t kernels_num = params.mvnNormalizeVariance ? 5 : 3;
KernelData kd = KernelData::Default<mvn_params>(params, kernels_num);
auto finalKernelName = GetKernelName(orgParams);
{
// Mean first stage
- auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData.stage_1);
cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_1", 1));
auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo.stage_1,
+ dispatchData.stage_1,
params.engineInfo,
finalKernelName,
jit,
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
kernel.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
- runInfo.item_groups * intermidiate_bytes);
+ dispatchData.item_groups * intermidiate_bytes);
}
{
// Mean second stage
- auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData.stage_2);
cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_2", 1));
auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[1];
FillCLKernelData(kernel,
- runInfo.stage_2,
+ dispatchData.stage_2,
params.engineInfo,
finalKernelName,
jit,
}
if (params.mvnNormalizeVariance) {
// Variance first stage
- auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData.stage_1);
cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_1", 1));
auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[2];
FillCLKernelData(kernel,
- runInfo.stage_1,
+ dispatchData.stage_1,
params.engineInfo,
finalKernelName,
jit,
}
if (params.mvnNormalizeVariance) {
// Variance second stage
- auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData.stage_2);
cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_2", 1));
auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[3];
FillCLKernelData(kernel,
- runInfo.stage_2,
+ dispatchData.stage_2,
params.engineInfo,
finalKernelName,
jit,
intermidiate_bytes);
}
{ // Final
- auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_final);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData.stage_final);
cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN", 1));
cldnn_jit.AddConstant(MakeJitConstant("PRECALC_MEAN", 1));
cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance));
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[kernels_num - 1];
FillCLKernelData(kernel,
- runInfo.stage_final,
+ dispatchData.stage_final,
params.engineInfo,
finalKernelName,
jit,
bool Validate(const Params&, const optional_params&) const override;
DispatchData SetDefault(const mvn_params& params) const override;
- JitConstants GetJitConstants(const mvn_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const mvn_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {
FusedOpType::ACTIVATION,
MVNKernelBase::DispatchData MVNKernelBase::SetDefault(const mvn_params& params) const {
const auto& output = params.output;
- DispatchData kd;
-
- std::vector<size_t> global(3);
-
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
+ DispatchData dispatchData;
if (params.mvnMode == MVNMode::WITHIN_CHANNELS) {
- global = {output.Batch().v, output.Feature().v, 1};
+ dispatchData.gws = {output.Batch().v, output.Feature().v, 1};
} else {
- global = {output.Batch().v, 1, 1};
+ dispatchData.gws = {output.Batch().v, 1, 1};
}
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData MVNKernelBase::GetCommonKernelsData(const Params& params,
const mvn_params& orgParams = static_cast<const mvn_params&>(params);
- DispatchData runInfo;
-
- runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<mvn_params>(params);
auto finalKernelName = GetKernelName(orgParams);
- auto cldnn_jit = GetJitConstants(orgParams, runInfo);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
finalKernelName,
jit,
protected:
bool Validate(const Params&, const optional_params&) const override;
- virtual JitConstants GetJitConstants(const mvn_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const mvn_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const mvn_params& params) const;
virtual std::string GetKernelName(const mvn_params&) const { return kernelName; }
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
MVNKernelBfyxOpt::Parent::DispatchData MVNKernelBfyxOpt::SetDefault(const mvn_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.inputs[0];
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
if (params.mvnMode == MVNMode::WITHIN_CHANNELS) {
- kd.dataSetSize = input.X().v * input.Y().v * input.Z().v;
- kd.dataSetsCount = input.Batch().v * input.Feature().v;
+ dispatchData.dataSetSize = input.X().v * input.Y().v * input.Z().v;
+ dispatchData.dataSetsCount = input.Batch().v * input.Feature().v;
} else {
- kd.dataSetSize = input.X().v * input.Y().v * input.Z().v * input.Feature().v;
- kd.dataSetsCount = input.Batch().v;
+ dispatchData.dataSetSize = input.X().v * input.Y().v * input.Z().v * input.Feature().v;
+ dispatchData.dataSetsCount = input.Batch().v;
}
// start with 1 thread per data set
- kd.gws0 = 1;
- kd.gws1 = kd.dataSetsCount;
- kd.gws2 = 1;
- kd.itemsNum = kd.dataSetSize;
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = dispatchData.dataSetsCount;
+ dispatchData.gws[2] = 1;
+ dispatchData.itemsNum = dispatchData.dataSetSize;
// We have two units of data per work item in current implementation.
- auto local_mem_per_wi = 2 * (kd.fp16UnitUsed ? sizeof(short) : sizeof(float));
+ auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType());
// Combining device execution and local memory restrictions to compute maximum possible LWS.
auto max_lws = std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi);
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
// reads.
- while ((kd.itemsNum > 32 || kd.lws0 < kd.itemsNum) && (2 * kd.lws0 <= max_lws)) {
- kd.lws0 *= 2;
- kd.itemsNum /= 2;
+ while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
+ dispatchData.lws[0] *= 2;
+ dispatchData.itemsNum /= 2;
}
- kd.gws0 = kd.lws0;
- kd.leftovers = kd.dataSetSize % kd.lws0;
+ dispatchData.gws[0] = dispatchData.lws[0];
+ dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
- return kd;
+ return dispatchData;
}
-JitConstants MVNKernelBfyxOpt::GetJitConstants(const mvn_params& params, MVNKernelBase::DispatchData kd) const {
- auto jit = MVNKernelBase::GetJitConstants(params, kd);
+JitConstants MVNKernelBfyxOpt::GetJitConstants(const mvn_params& params, MVNKernelBase::DispatchData dispatchData) const {
+ auto jit = MVNKernelBase::GetJitConstants(params, dispatchData);
jit.AddConstants({
- MakeJitConstant("ITEMS_NUM", kd.itemsNum),
- MakeJitConstant("LWS", kd.lws0),
- MakeJitConstant("GWS", kd.gws0),
- MakeJitConstant("DATA_SETS_COUNT", kd.dataSetsCount),
- MakeJitConstant("DATA_SET_SIZE", kd.dataSetSize),
- MakeJitConstant("LEFTOVERS", kd.leftovers),
+ MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
+ MakeJitConstant("LWS", dispatchData.lws[0]),
+ MakeJitConstant("GWS", dispatchData.gws[0]),
+ MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
+ MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
+ MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
});
auto activation_dt = GetActivationType(params);
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
};
}
DispatchData SetDefault(const mvn_params& params) const override;
- JitConstants GetJitConstants(const mvn_params& params, MVNKernelBase::DispatchData kd) const override;
+ JitConstants GetJitConstants(const mvn_params& params, MVNKernelBase::DispatchData dispatchData) const override;
};
} // namespace kernel_selector
return k;
}
-JitConstants MVNKernelRef::GetJitConstants(const mvn_params& params, DispatchData kd) const {
- auto jits = Parent::GetJitConstants(params, kd);
+JitConstants MVNKernelRef::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const {
+ auto jits = Parent::GetJitConstants(params, dispatchData);
auto activation_dt = GetActivationType(params);
jits.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
ParamsKey GetSupportedKey() const override;
protected:
- JitConstants GetJitConstants(const mvn_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const mvn_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {
FusedOpType::ACTIVATION,
NormalizeKernelBase::DispatchData NormalizeKernelBase::SetDefault(const normalize_params& params) const {
const auto& output = params.output;
- DispatchData kd;
-
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
- std::vector<size_t> global(3);
-
+ DispatchData dispatchData;
if (params.normMode == NormalizeMode::WITHIN_SPATIAL) {
- global = {output.X().v, output.Y().v, output.Batch().v};
+ dispatchData.gws = {output.X().v, output.Y().v, output.Batch().v};
} else {
- global = {output.Batch().v, 1, 1};
+ dispatchData.gws = {output.Batch().v, 1, 1};
}
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData NormalizeKernelBase::GetCommonKernelsData(const Params& params,
const normalize_params& orgParams = static_cast<const normalize_params&>(params);
- DispatchData runInfo;
-
- runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<normalize_params>(params);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
OneHotKernelBase::DispatchData OneHotKernelBase::SetDefault(const one_hot_params& params) {
const auto& input = params.inputs[0];
- DispatchData kd;
-
- kd.fp16UnitUsed = input.GetDType() == Datatype::F16;
-
- std::vector<size_t> global{input.Batch().v, input.Feature().v, input.Y().v * input.X().v};
+ DispatchData dispatchData;
if (params.output.GetDims().size() == 5) {
- global[0] = input.Batch().v;
- global[1] = input.Feature().v * input.Z().v;
- global[2] = input.Y().v * input.X().v;
+ dispatchData.gws = { input.Batch().v, input.Feature().v * input.Z().v, input.Y().v * input.X().v };
+ } else {
+ dispatchData.gws = { input.Batch().v, input.Feature().v, input.Y().v * input.X().v };
}
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return kd;
+ return dispatchData;
}
KernelsData OneHotKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params =
static_cast<const one_hot_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<one_hot_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
- FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
k_data.estimatedTime = estimated_time;
return {k_data};
}
-JitConstants PoolingKernelBase::GetJitConstants(const pooling_params& pp, PoolingKernelBase::DispatchData kd) const {
+JitConstants PoolingKernelBase::GetJitConstants(const pooling_params& pp, PoolingKernelBase::DispatchData dispatchData) const {
JitConstants mem_consts = MakeBaseParamsJitConstants(pp);
mem_consts.AddConstants({
MakeJitConstant(toString(pp.divMode) + "_KERNEL_DIVIDER", 1),
});
- if (kd.needsBoundary) {
+ if (dispatchData.needsBoundary) {
mem_consts.AddConstant(MakeJitConstant("CHECK_BOUNDRY", 1));
}
PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_params& params) const {
const auto& output = params.output;
- DispatchData kd;
-
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ DispatchData dispatchData;
if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 ||
output.GetLayout() == DataLayout::byxf ||
output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
// Determine global work sizes.
- kd.gws0 = Align(output.X().v, 32); // X
- kd.gws1 = output.Y().v * output.Z().v; // Y, Z
- kd.gws2 = output.Batch().v * output.Feature().v; // B, F
+ dispatchData.gws[0] = Align(output.X().v, 32); // X
+ dispatchData.gws[1] = output.Y().v * output.Z().v; // Y, Z
+ dispatchData.gws[2] = output.Batch().v * output.Feature().v; // B, F
// Find largest positive local work size that is divider for global work size.
- kd.lws0 = 32;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
} else if (output.GetLayout() == DataLayout::b_fs_yx_fsv32 || output.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
- kd.gws0 = 32;
- kd.gws1 = output.Y().v * output.X().v * output.Z().v;
- kd.gws2 = output.Batch().v * CeilDiv(output.Feature().v, 32);
+ dispatchData.gws[0] = 32;
+ dispatchData.gws[1] = output.Y().v * output.X().v * output.Z().v;
+ dispatchData.gws[2] = output.Batch().v * CeilDiv(output.Feature().v, 32);
- kd.lws0 = 32;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
} else {
// Determine global work sizes.
- kd.gws0 = output.Batch().v * output.Feature().v; // B, F
- kd.gws1 = output.X().v; // X
- kd.gws2 = output.Y().v * output.Z().v; // Y * Z
+ dispatchData.gws[0] = output.Batch().v * output.Feature().v; // B, F
+ dispatchData.gws[1] = output.X().v; // X
+ dispatchData.gws[2] = output.Y().v * output.Z().v; // Y * Z
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- kd.needsBoundary = NeedsBoundaryCheck(params);
+ dispatchData.needsBoundary = NeedsBoundaryCheck(params);
- return kd;
+ return dispatchData;
}
KernelsData PoolingKernelBase::GetCommonKernelsData(const Params& params,
const pooling_params& orgParams = static_cast<const pooling_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<pooling_params>(params);
- auto cldnn_jit = GetJitConstants(orgParams, runInfo);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 1,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 1,
GetFusedPrimitiveInputsCount(params));
if (orgParams.poolType == PoolType::MAX_WITH_ARGMAX)
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
protected:
bool Validate(const Params&, const optional_params&) const override;
- virtual JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const pooling_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
Datatype GetAccumulatorType(const pooling_params& p) const;
}
PoolingKernelBase::DispatchData PoolingKernel_b_fs_yx_fsv16::SetDefault(const pooling_params& params) const {
- DispatchData kd = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
const size_t alignment = GetSimdSize(params);
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = CeilDiv(x, x_block_size) * y;
- kd.gws1 = Align(f, alignment);
- kd.gws2 = b;
+ dispatchData.gws[0] = CeilDiv(x, x_block_size) * y;
+ dispatchData.gws[1] = Align(f, alignment);
+ dispatchData.gws[2] = b;
- kd.lws0 = 1;
- kd.lws1 = alignment;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = alignment;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_2;
+ dispatchData.efficiency = FORCE_PRIORITY_2;
- return kd;
+ return dispatchData;
}
-JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params& params, DispatchData runInfo) const {
+JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
const size_t alignment = GetSimdSize(params);
size_t x_block_size = GetBlockSize(params);
auto input = params.inputs[0];
auto output = params.output;
- auto jit = PoolingKernelBase::GetJitConstants(params, runInfo);
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
size_t input_line_size = params.poolStride.x * (x_block_size - 1) + params.poolSize.x;
protected:
bool Validate(const Params&, const optional_params&) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
}
PoolingKernelBase::DispatchData PoolingKerneGPU_b_fs_yx_fsv4::SetDefault(const pooling_params& params) const {
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws0 = params.output.X().v; // X
- runInfo.gws1 = params.output.Y().v; // Y
+ dispatchData.gws[0] = params.output.X().v; // X
+ dispatchData.gws[1] = params.output.Y().v; // Y
// we got b_fs_yx_fsv4 format, we process 4 features per workitem
- runInfo.gws2 = CeilDiv(params.output.Feature().v, 4) * params.output.Batch().v;
+ dispatchData.gws[2] = CeilDiv(params.output.Feature().v, 4) * params.output.Batch().v;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes({ runInfo.gws0, runInfo.gws1, runInfo.gws2 }, params.engineInfo);
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
-JitConstants PoolingKerneGPU_b_fs_yx_fsv4::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKerneGPU_b_fs_yx_fsv4::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
const size_t in_x_pitch = 4;
const size_t in_y_pitch = 4 * params.inputs[0].X().LogicalDimPadded();
}
protected:
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
};
} // namespace kernel_selector
}
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
auto x = out.X().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
- runInfo.gws0 = x;
- runInfo.gws1 = y * z;
+ dispatchData.gws[0] = x;
+ dispatchData.gws[1] = y * z;
// we got b_fs_yx_fsv16 format, we process 16 features per workitem
- runInfo.gws2 = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
+ dispatchData.gws[2] = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes({ runInfo.gws0, runInfo.gws1, runInfo.gws2 }, params.engineInfo);
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
-JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
const size_t in_x_pitch = FEATURE_SLICE_SIZE;
const size_t in_y_pitch = FEATURE_SLICE_SIZE * params.inputs[0].X().LogicalDimPadded();
}
protected:
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
};
} // namespace kernel_selector
PoolingKernelBase::DispatchData PoolingKernelGPUBfyxBlockOpt::SetDefault(const pooling_params& params) const {
const auto& output = params.output;
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws1 = CeilDiv(output.Y().v, params.poolSize.y);
+ dispatchData.gws[1] = CeilDiv(output.Y().v, params.poolSize.y);
- return runInfo;
+ return dispatchData;
}
-JitConstants PoolingKernelGPUBfyxBlockOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPUBfyxBlockOpt::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.AddConstant(
MakeJitConstant("BLOCK_SIZE_Y", params.poolSize.y + params.poolSize.y * params.poolStride.y - 1));
protected:
bool Validate(const Params&, const optional_params&) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
}
PoolingKernelBase::DispatchData Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::SetDefault(const pooling_params& params) const {
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws0 = params.output.Feature().v/16;
- runInfo.gws1 = params.output.X().v * params.output.Y().v;
- runInfo.gws2 = params.output.Batch().v;
+ dispatchData.gws[0] = params.output.Feature().v/16;
+ dispatchData.gws[1] = params.output.X().v * params.output.Y().v;
+ dispatchData.gws[2] = params.output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = SIMD_SIZE;
- runInfo.efficiency = FORCE_PRIORITY_1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = SIMD_SIZE;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return runInfo;
+ return dispatchData;
}
-JitConstants Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
if (!params.fused_ops.empty()) {
auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
}
protected:
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
};
} // namespace kernel_selector
}
PoolingKernelBase::DispatchData PoolingKernel_bsv16_fsv16::SetDefault(const pooling_params& params) const {
- DispatchData kd = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
auto f = out.Feature().v;
auto b = out.Batch().v;
- kd.gws0 = Align(f, feature_block_size);
- kd.gws1 = x * y * z;
- kd.gws2 = CeilDiv(b, batch_block_size);
+ dispatchData.gws[0] = Align(f, feature_block_size);
+ dispatchData.gws[1] = x * y * z;
+ dispatchData.gws[2] = CeilDiv(b, batch_block_size);
- kd.lws0 = sub_group_size;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = sub_group_size;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- kd.efficiency = FORCE_PRIORITY_1;
+ dispatchData.efficiency = FORCE_PRIORITY_1;
- return kd;
+ return dispatchData;
}
bool PoolingKernel_bsv16_fsv16::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants PoolingKernel_bsv16_fsv16::GetJitConstants(const pooling_params& params, DispatchData runInfo) const {
+JitConstants PoolingKernel_bsv16_fsv16::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
auto input = params.inputs[0];
auto output = params.output;
- auto jit = PoolingKernelBase::GetJitConstants(params, runInfo);
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("OC_BLOCK", feature_block_size));
jit.AddConstant(MakeJitConstant("MB_BLOCK", batch_block_size));
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
PoolingKernelBase::DispatchData PoolingKernelGPUByxfOpt::SetDefault(const pooling_params& params) const {
const auto& output = params.output;
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws2 = output.Batch().v * (CeilDiv(output.Feature().v, 8));
+ dispatchData.gws[2] = output.Batch().v * (CeilDiv(output.Feature().v, 8));
- return runInfo;
+ return dispatchData;
}
-JitConstants PoolingKernelGPUByxfOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPUByxfOpt::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
protected:
bool Validate(const Params&, const optional_params&) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
PoolingKernelBase::DispatchData PoolingKernelGPUByxfPaddingOpt::SetDefault(const pooling_params& params) const {
const auto& output = params.output;
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws2 = output.Batch().v * (CeilDiv(output.Feature().v, 8));
+ dispatchData.gws[2] = output.Batch().v * (CeilDiv(output.Feature().v, 8));
- return runInfo;
+ return dispatchData;
}
-JitConstants PoolingKernelGPUByxfPaddingOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPUByxfPaddingOpt::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
protected:
bool Validate(const Params&, const optional_params&) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
}
PoolingKernelBase::DispatchData PoolingKerneGPU_fs_b_yx_fsv32::SetDefault(const pooling_params& params) const {
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
+ DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
- runInfo.gws0 = params.output.X().v; // X output blocks
- runInfo.gws1 = params.output.Y().v; // Y output clocks
+ dispatchData.gws[0] = params.output.X().v; // X output blocks
+    dispatchData.gws[1] = params.output.Y().v;  // Y output blocks
// in fs_b_yx_fsv32 format we will process 2 features per work item, so reads/writes are done in full writes for
// fp16
- runInfo.gws2 = RoundUp(params.output.Feature().v, 32) * params.output.Batch().v / 2;
+ dispatchData.gws[2] = RoundUp(params.output.Feature().v, 32) * params.output.Batch().v / 2;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
- return runInfo;
+ return dispatchData;
}
bool PoolingKerneGPU_fs_b_yx_fsv32::Validate(const Params& p, const optional_params& o) const {
return true;
}
-JitConstants PoolingKerneGPU_fs_b_yx_fsv32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKerneGPU_fs_b_yx_fsv32::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
auto pp = static_cast<const pooling_params&>(params);
// Heurestic needed for very big pool size.
protected:
bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
FusedOpType::SCALE,
return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
}
-JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ JitConstants jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
bool Validate(const Params&, const optional_params&) const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
FusedOpType::SCALE,
return k;
}
-JitConstants PoolingKernelGPURef::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+JitConstants PoolingKernelGPURef::GetJitConstants(const pooling_params& params, DispatchData dispatchData) const {
+ auto jit = PoolingKernelBase::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
}
protected:
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const pooling_params& params, DispatchData dispatchData) const override;
};
} // namespace kernel_selector
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
PyramidROIAlignKernelBase::DispatchData PyramidROIAlignKernelBase::SetDefault(const PyramidROIAlign_params& params) const {
- DispatchData kd;
-
- kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
- std::vector<size_t> global;
- global = {1, 1, 1};
-
- const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ DispatchData dispatchData;
+ dispatchData.gws = {1, 1, 1};
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+ return dispatchData;
}
KernelsData PyramidROIAlignKernelBase::GetCommonKernelsData(const Params& params,
const auto& prim_params =
static_cast<const PyramidROIAlign_params&>(params);
- auto run_info = SetDefault(prim_params);
+ auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<PyramidROIAlign_params>(params);
auto cldnn_jit = GetJitConstants(prim_params);
auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options);
auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel,
- run_info,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
PyramidROIAlignKernelBase::DispatchData PyramidROIAlignKernelRef::SetDefault(const PyramidROIAlign_params& params) const {
- auto dispatch = PyramidROIAlignKernelBase::SetDefault(params);
+ auto dispatchData = PyramidROIAlignKernelBase::SetDefault(params);
- std::vector<size_t> global = {
+ dispatchData.gws = {
params.output.X().v * params.output.Y().v,
params.output.Feature().v,
params.output.Batch().v };
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- dispatch.gws0 = global[0];
- dispatch.gws1 = global[1];
- dispatch.gws2 = global[2];
-
- dispatch.lws0 = local[0];
- dispatch.lws1 = local[1];
- dispatch.lws2 = local[2];
-
- return dispatch;
+ return dispatchData;
}
KernelsData PyramidROIAlignKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
return true;
}
-JitConstants QuantizeKernelBase::GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const {
+JitConstants QuantizeKernelBase::GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const {
JitConstants jit = MakeBaseParamsJitConstants(params);
if (params.packed_binary_output) {
jit.AddConstant(MakeJitConstant("LEVELS", static_cast<float>(params.levels)));
- jit.AddConstant(MakeJitConstant("LWS_0", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LWS_1", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LWS_2", runInfo.lws2));
+ jit.AddConstant(MakeJitConstant("LWS_0", dispatchData.lws[0]));
+ jit.AddConstant(MakeJitConstant("LWS_1", dispatchData.lws[1]));
+ jit.AddConstant(MakeJitConstant("LWS_2", dispatchData.lws[2]));
return jit;
}
return {};
}
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
- auto cldnn_jit = GetJitConstants(newParams, runInfo);
+ auto cldnn_jit = GetJitConstants(newParams, dispatchData);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
kernel.arguments = GetArgsDesc(static_cast<int>(newParams.inputs.size()), false, false);
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
protected:
- virtual JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const;
+ virtual JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const;
virtual CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const = 0;
};
} // namespace kernel_selector
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-#include <iostream>
#include "quantize_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
}
CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
auto output = params.output;
if (output.GetLayout() == DataLayout::b_fs_yx_fsv16 && !params.packed_binary_output) {
- runInfo.gws0 = output.Batch().v;
- runInfo.gws1 = Align(output.Feature().v, sub_group_size);
- runInfo.gws2 = output.Y().v * output.X().v * output.Z().v;
+ dispatchData.gws[0] = output.Batch().v;
+ dispatchData.gws[1] = Align(output.Feature().v, sub_group_size);
+ dispatchData.gws[2] = output.Y().v * output.X().v * output.Z().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = sub_group_size;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
} else {
- runInfo.gws0 = output.Batch().v;
- runInfo.gws1 = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v;
- runInfo.gws2 = Align(output.X().v * output.Y().v * output.Z().v, 16);
+ dispatchData.gws[0] = output.Batch().v;
+ dispatchData.gws[1] = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v;
+ dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v, 16);
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 16;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 16;
}
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
- return runInfo;
+ return dispatchData;
}
-JitConstants QuantizeKernelRef::GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+JitConstants QuantizeKernelRef::GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && !params.packed_binary_output) {
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
}
QuantizeKernelRef() : QuantizeKernelBase("quantize_gpu_ref") {}
virtual ~QuantizeKernelRef() {}
- JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const override;
+ JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const override;
bool Validate(const Params& p, const optional_params& o) const override;
ParamsKey GetSupportedKey() const override;
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
auto output = params.output;
if (output.GetLayout() == DataLayout::b_fs_yx_fsv16) {
- runInfo.gws0 = output.Y().v * output.X().v;
- runInfo.gws1 = Align(output.Feature().v, sub_group_size);
- runInfo.gws2 = output.Batch().v;
+ dispatchData.gws[0] = output.Y().v * output.X().v;
+ dispatchData.gws[1] = Align(output.Feature().v, sub_group_size);
+ dispatchData.gws[2] = output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = sub_group_size;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
} else {
- auto global = GetTensorFriendlyWorkGroups(output);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
+ dispatchData.gws = GetTensorFriendlyWorkGroups(output);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
}
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
- return runInfo;
+ return dispatchData;
}
-JitConstants QuantizeKernelScaleShift::GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const {
- JitConstants jit = Parent::GetJitConstants(params, runInfo);
+JitConstants QuantizeKernelScaleShift::GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const {
+ JitConstants jit = Parent::GetJitConstants(params, dispatchData);
if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16) {
jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
QuantizeKernelScaleShift() : QuantizeKernelBase("quantize_gpu_scale_shift_opt") {}
virtual ~QuantizeKernelScaleShift() {}
- JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& runInfo) const override;
+ JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const override;
bool Validate(const Params& p, const optional_params& o) const override;
ParamsKey GetSupportedKey() const override;
}
CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
auto in_dims = calc_in_dims(params);
- std::vector<size_t> global = {16,
- CeilDiv(in_dims[3].v, calc_read_offset(params)) * in_dims[2].v, // X, Y
- CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v}; // F, B
+ dispatchData.gws = { 16,
+ CeilDiv(in_dims[3].v, calc_read_offset(params)) * in_dims[2].v, // X, Y
+ CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v }; // F, B
+ dispatchData.lws = { SIMD, 1, 1 };
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = SIMD;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
-
- return runInfo;
+ return dispatchData;
}
JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& params) const {
}
const reduce_params& params = static_cast<const reduce_params&>(p);
- DispatchData runInfo = SetDefault(params, options);
+ DispatchData dispatchData = SetDefault(params, options);
KernelData kd = KernelData::Default<reduce_params>(params);
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel,
- runInfo,
+ dispatchData,
params.engineInfo,
kernelName,
jit,
}
CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = {params.output.X().v * params.output.Y().v,
- params.output.Z().v * params.output.W().v,
- params.output.Batch().v * params.output.Feature().v};
+ dispatchData.gws = { params.output.X().v * params.output.Y().v,
+ params.output.Z().v * params.output.W().v,
+ params.output.Batch().v * params.output.Feature().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants ReduceKernelRef::GetJitConstants(const reduce_params& params) const {
/*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
RegionYoloKernelRef::DispatchData SetDefault(const region_yolo_params& params) {
- RegionYoloKernelRef::DispatchData kd;
-
- kd.fp16UnitUsed = (params.inputs[0].GetDType() == Datatype::F16);
+ RegionYoloKernelRef::DispatchData dispatchData;
const auto& input = params.inputs[0];
- std::vector<size_t> global;
if (input.GetLayout() == DataLayout::bfyx) {
- global = {input.X().v * input.Y().v, 1, 1};
+ dispatchData.gws = {input.X().v * input.Y().v, 1, 1};
} else {
- global = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v};
+ dispatchData.gws = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v};
}
- // Determine global work sizes.
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return kd;
+ return dispatchData;
}
KernelsData RegionYoloKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::REGION_YOLO);
const region_yolo_params& orgParams = static_cast<const region_yolo_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<region_yolo_params>(params);
auto cldnn_jit = GetJitConstants(orgParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = FORCE_PRIORITY_9;
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ReorderFromWinograd2x3Kernel::DispatchData ReorderFromWinograd2x3Kernel::SetDefault(
const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
constexpr auto output_tile_width = 2; // by definition of F(2,3)
const auto& input = params.inputs[0];
const auto& output = params.output;
- kd.gws0 = static_cast<size_t>(output.Feature().v * output.Batch().v);
- kd.gws1 = static_cast<size_t>(output.X().v / output_tile_width);
- kd.gws2 = static_cast<size_t>(output.Y().v);
+ dispatchData.gws[0] = static_cast<size_t>(output.Feature().v * output.Batch().v);
+ dispatchData.gws[1] = static_cast<size_t>(output.X().v / output_tile_width);
+ dispatchData.gws[2] = static_cast<size_t>(output.Y().v);
- kd.lws0 = input.Feature().v > 32 ? 32 : static_cast<size_t>(input.Feature().v);
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = input.Feature().v > 32 ? 32 : static_cast<size_t>(input.Feature().v);
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData ReorderFromWinograd2x3Kernel::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_weights_params& params) const {
const auto& out = params.output;
- DispatchData kd;
+ DispatchData dispatchData;
- std::vector<size_t> global(3);
+ dispatchData.gws = { out.G().v * out.OFM().v, out.IFM().v, out.X().v * out.Y().v * out.Z().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- global = {out.G().v * out.OFM().v, out.IFM().v, out.X().v * out.Y().v * out.Z().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
auto& input = params.inputs[0];
DataTensor input_tensor = input;
input_tensor = DataTensor(input_sizes, input.GetDType(), DataLayout::image_2d_rgba);
}
- auto global = GetTensorFriendlyWorkGroups(input_tensor);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.gws = GetTensorFriendlyWorkGroups(input_tensor);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
if (params.inputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32) {
std::vector<size_t> sizes = { 32, 16, 8, 4 };
for (auto& s : sizes) {
- if (kd.gws2 % s == 0) {
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = s;
+ if (dispatchData.gws[2] % s == 0) {
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = s;
break;
}
}
}
if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 && params.inputs[0].Feature().v % 16 == 0) {
- kd.lws0 = 1;
- kd.lws1 = 16;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 16;
+ dispatchData.lws[2] = 1;
}
- return kd;
+ return dispatchData;
}
KernelsData ReorderKernelBase::GetCommonKernelsData(const reorder_weights_params& params, const optional_params& options, float estimated_time) const {
KernelData kd = KernelData::Default<reorder_weights_params>(params);
reorder_weights_params& newParams = *static_cast<reorder_weights_params*>(kd.params.get());
- DispatchData runInfo;
+ DispatchData dispatchData;
- runInfo = SetDefault(newParams);
+ dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments = GetArgsDesc(1, false, false);
KernelData kd = KernelData::Default<reorder_params>(params);
reorder_params& newParams = *static_cast<reorder_params*>(kd.params.get());
- DispatchData runInfo;
-
- runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments = GetArgsDesc(1, false, false);
if (newParams.mode == MeanSubtractMode::IN_BUFFER) {
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ReorderKernelBinary::DispatchData ReorderKernelBinary::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.inputs[0];
- std::vector<size_t> global{input.Batch().v, CeilDiv(input.Feature().v, 32), input.Y().v * input.X().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { input.Batch().v, CeilDiv(input.Feature().v, 32), input.Y().v * input.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData ReorderKernelBinary::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ReorderKernelFastBatch1::DispatchData ReorderKernelFastBatch1::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
unsigned int gws = (unsigned int)output.LogicalSize();
- kd.gws0 = Align(gws, 32);
- kd.gws1 = 1;
- kd.gws2 = 1;
+ dispatchData.gws[0] = Align(gws, 32);
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- kd.lws0 = 32;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData ReorderKernelFastBatch1::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ReorderKernelBase::DispatchData ReorderKernel_fs_b_yx_fsv32_to_bfyx::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
auto x_aligned = Align(params.output.X().v, x_block_align);
- kd.gws0 = params.output.Batch().v;
- kd.gws1 = Align(params.output.Feature().v, fsv);
- kd.gws2 = params.output.Y().v * x_aligned / GetOptimalSize(x_aligned, optimal_x_sizes);
+ dispatchData.gws[0] = params.output.Batch().v;
+ dispatchData.gws[1] = Align(params.output.Feature().v, fsv);
+ dispatchData.gws[2] = params.output.Y().v * x_aligned / GetOptimalSize(x_aligned, optimal_x_sizes);
- kd.lws0 = 1;
- kd.lws1 = GetOptimalSize(kd.gws1, optimal_feature_sizes);
- kd.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = GetOptimalSize(dispatchData.gws[1], optimal_feature_sizes);
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData ReorderKernel_fs_b_yx_fsv32_to_bfyx::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ReorderKernelBase::DispatchData ReorderKernel_to_yxfb_batched::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.inputs[0];
unsigned int gws = (unsigned int)input.LogicalSize();
- kd.gws0 = Align(gws, 8 * input.Batch().v) / input.Batch().v;
- kd.gws1 = 1;
- kd.gws2 = 1;
+ dispatchData.gws[0] = Align(gws, 8 * input.Batch().v) / input.Batch().v;
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- kd.lws0 = 8;
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = 8;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData ReorderKernel_to_yxfb_batched::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ReorderToWinograd2x3Kernel::DispatchData ReorderToWinograd2x3Kernel::SetDefault(const reorder_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.inputs[0];
const auto& output = params.output;
- kd.gws0 = static_cast<size_t>(input.Feature().v * input.Batch().v);
- kd.gws1 = static_cast<size_t>(params.winograd_nr_tiles_x);
- kd.gws2 = static_cast<size_t>(output.Y().v);
+ dispatchData.gws[0] = static_cast<size_t>(input.Feature().v * input.Batch().v);
+ dispatchData.gws[1] = static_cast<size_t>(params.winograd_nr_tiles_x);
+ dispatchData.gws[2] = static_cast<size_t>(output.Y().v);
- kd.lws0 = input.Feature().v > 32 ? 32 : static_cast<size_t>(input.Feature().v);
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[0] = input.Feature().v > 32 ? 32 : static_cast<size_t>(input.Feature().v);
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
KernelsData ReorderToWinograd2x3Kernel::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
const reorder_weights_params& params) const {
const auto& out = params.output;
- DispatchData kd;
+ DispatchData dispatchData;
- std::vector<size_t> global = {out.OFM().v, CeilDiv(out.IFM().v, 32), out.X().v * out.Y().v};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { out.OFM().v, CeilDiv(out.IFM().v, 32), out.X().v * out.Y().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData ReorderWeightsBinaryKernel::GetKernelsData(const Params& params, const optional_params& options) const {
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return k;
}
-ReorderWeightsImage_fyx_b_Kernel::DispatchData ReorderWeightsImage_fyx_b_Kernel::SetDefault(
- const reorder_weights_params& params) const {
+ReorderWeightsImage_fyx_b_Kernel::DispatchData ReorderWeightsImage_fyx_b_Kernel::SetDefault(const reorder_weights_params& params) const {
const auto& out = params.output;
- DispatchData kd;
+ DispatchData dispatchData;
- std::vector<size_t> global(3);
+ dispatchData.gws = { out.OFM().v, Align(out.X().v * out.Y().v * out.IFM().v, 4) / 4, 1 };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- global = {out.OFM().v, Align(out.X().v * out.Y().v * out.IFM().v, 4) / 4, 1};
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData ReorderWeightsImage_fyx_b_Kernel::GetKernelsData(const Params& params,
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ReorderWeightsImageWinograd6x3Kernel::DispatchData ReorderWeightsImageWinograd6x3Kernel::SetDefault(
const reorder_weights_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.input;
- kd.gws0 = 1;
- kd.gws1 = 3;
- kd.gws2 = static_cast<size_t>(input.IFM().v * input.OFM().v);
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = 3;
+ dispatchData.gws[2] = static_cast<size_t>(input.IFM().v * input.OFM().v);
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 32;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 32;
- return kd;
+ return dispatchData;
}
KernelsData ReorderWeightsImageWinograd6x3Kernel::GetKernelsData(const Params& params,
ReorderWeightsOpt::DispatchData ReorderWeightsOpt::SetDefault(
const reorder_weights_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& output = params.output;
const auto output_layout = output.GetLayout();
const auto ifm_block = (osv_first) ? ifm_block_supported ? GetOptimalSize(output.IFM().v, preferred_sizes) : 1
: subgroup_size;
- std::vector<size_t> global;
if (osv_first) {
- global = {output.G().v * (output.IFM().v / ifm_block), output.Z().v * output.Y().v * output.X().v, Align(output.OFM().v, ofm_block)};
+ dispatchData.gws = { output.G().v * (output.IFM().v / ifm_block),
+ output.Z().v * output.Y().v * output.X().v,
+ Align(output.OFM().v, ofm_block) };
} else {
- global = {output.G().v * (output.OFM().v / ofm_block), output.Z().v * output.Y().v * output.X().v, Align(output.IFM().v, ifm_block)};
+ dispatchData.gws = { output.G().v * (output.OFM().v / ofm_block),
+ output.Z().v * output.Y().v * output.X().v,
+ Align(output.IFM().v, ifm_block) };
}
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
+ dispatchData.lws = { 1, 1, 16 };
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 16;
-
- return kd;
+ return dispatchData;
}
JitConstants ReorderWeightsOpt::GetJitConstants(const reorder_weights_params& params) const {
const auto& p = static_cast<const reorder_weights_params&>(params);
const auto& input = p.input;
const auto& output = p.output;
-
+
if (input.GroupedLayout() != output.GroupedLayout()) {
return false;
}
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ReorderWeightsWinograd2x3Kernel::DispatchData ReorderWeightsWinograd2x3Kernel::SetDefault(
const reorder_weights_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.input;
- kd.gws0 = 1;
- kd.gws1 = 3;
- kd.gws2 = static_cast<size_t>(input.IFM().v * input.OFM().v);
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = 3;
+ dispatchData.gws[2] = static_cast<size_t>(input.IFM().v * input.OFM().v);
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 32;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 32;
- return kd;
+ return dispatchData;
}
KernelsData ReorderWeightsWinograd2x3Kernel::GetKernelsData(const Params& params,
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
ReorderWeightsWinograd6x3Kernel::DispatchData ReorderWeightsWinograd6x3Kernel::SetDefault(
const reorder_weights_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& input = params.input;
- kd.gws0 = 1;
- kd.gws1 = 3;
- kd.gws2 = static_cast<size_t>(input.IFM().v * input.OFM().v);
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = 3;
+ dispatchData.gws[2] = static_cast<size_t>(input.IFM().v * input.OFM().v);
- kd.lws0 = 1;
- kd.lws1 = 1;
- kd.lws2 = 32;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 32;
- return kd;
+ return dispatchData;
}
KernelsData ReorderWeightsWinograd6x3Kernel::GetKernelsData(const Params& params,
/*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return jit;
}
ReorgYoloKernelRef::DispatchData SetDefault(const reorg_yolo_params& params) {
- ReorgYoloKernelRef::DispatchData kd;
-
- kd.fp16UnitUsed = (params.inputs[0].GetDType() == Datatype::F16);
+ ReorgYoloKernelRef::DispatchData dispatchData;
const auto& input = params.inputs[0];
- std::vector<size_t> global;
if (input.GetLayout() == DataLayout::bfyx) {
- global = {input.X().v, input.Y().v, input.Feature().v};
+ dispatchData.gws = {input.X().v, input.Y().v, input.Feature().v};
} else {
- global = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v};
+ dispatchData.gws = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v};
}
- // Determine global work sizes.
- kd.gws0 = global[0];
- kd.gws1 = global[1];
- kd.gws2 = global[2];
-
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return kd;
+ return dispatchData;
}
KernelsData ReorgYoloKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::REORG_YOLO);
const reorg_yolo_params& orgParams = static_cast<const reorg_yolo_params&>(params);
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<reorg_yolo_params>(params);
auto cldnn_jit = GetJitConstants(orgParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = FORCE_PRIORITY_9;
}
ResampleKernelBase::DispatchData ResampleKernelBase::SetDefault(const kernel_selector::resample_params &arg) const {
- DispatchData runInfo;
- std::vector<size_t> global;
- std::vector<size_t> local;
+ DispatchData dispatchData;
const auto& out = arg.output;
if (arg.resampleType == ResampleType::NEAREST_NEIGHBOR)
- global = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+ dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
else if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX)
- global = {Align(out.X().v, 32), out.Y().v, out.Batch().v};
+ dispatchData.gws = { Align(out.X().v, 32), out.Y().v, out.Batch().v };
else if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP)
- global = {out.X().v * out.Y().v, CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)), out.Batch().v * out.Z().v};
+ dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)), out.Batch().v * out.Z().v };
else
- global = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+ dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
- local = GetOptimalLocalWorkGroupSizes(global, arg.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) {
- local[0] = 32;
- local[1] = 1;
- local[2] = 1;
+ dispatchData.lws[0] = 32;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
+ dispatchData.efficiency = FORCE_PRIORITY_7;
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- runInfo.efficiency = FORCE_PRIORITY_7;
- runInfo.fp16UnitUsed = out.GetDType() == Datatype::F16;
-
- return runInfo;
+ return dispatchData;
}
bool ResampleKernelBase::Validate(const Params& p, const optional_params& o) const {
KernelData kd = KernelData::Default<resample_params>(params);
resample_params& newParams = *static_cast<resample_params*>(kd.params.get());
- auto runInfo = SetDefault(newParams);
+ auto dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params));
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
return {kd};
}
}
ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_selector::resample_params &arg) const {
- DispatchData runInfo;
+ DispatchData dispatchData;
const auto& out = arg.output;
- runInfo.gws0 = CeilDiv(out.X().v, GetOptimalBlockSize(arg)) * out.Y().v;
- runInfo.gws1 = Align(out.Feature().v, sub_group_size);
- runInfo.gws2 = arg.output.Batch().v;
+ dispatchData.gws[0] = CeilDiv(out.X().v, GetOptimalBlockSize(arg)) * out.Y().v;
+ dispatchData.gws[1] = Align(out.Feature().v, sub_group_size);
+ dispatchData.gws[2] = arg.output.Batch().v;
- runInfo.lws0 = 1;
- runInfo.lws1 = sub_group_size;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = sub_group_size;
+ dispatchData.lws[2] = 1;
- runInfo.efficiency = FORCE_PRIORITY_3;
- runInfo.fp16UnitUsed = out.GetDType() == Datatype::F16;
+ dispatchData.efficiency = FORCE_PRIORITY_3;
- return runInfo;
+ return dispatchData;
}
bool ResampleKernelOpt::Validate(const Params& p, const optional_params& o) const {
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
ResampleKernelBase::DispatchData ResampleKernelRef::SetDefault(const resample_params& arg) const {
- auto dispatch = Parent::SetDefault(arg);
+ auto dispatchData = Parent::SetDefault(arg);
if (use_packing(arg)) {
auto pack = packing_factor(arg);
- std::vector<size_t> global;
- std::vector<size_t> local;
-
- global = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v };
- local = GetOptimalLocalWorkGroupSizes(global, arg.engineInfo);
-
- dispatch.gws0 = global[0];
- dispatch.gws1 = global[1];
- dispatch.gws2 = global[2];
-
- dispatch.lws0 = local[0];
- dispatch.lws1 = local[1];
- dispatch.lws2 = local[2];
+ dispatchData.gws = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
}
- return dispatch;
+ return dispatchData;
}
} // namespace kernel_selector
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
CommonDispatchData ReverseSequenceKernelRef::SetDefault(const reverse_sequence_params& params,
const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = {params.output.Batch().v,
- params.output.Feature().v,
- params.output.Y().v * params.output.X().v};
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Y().v * params.output.X().v };
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants ReverseSequenceKernelRef::GetJitConstants(const reverse_sequence_params& params) const {
assert(params.GetType() == KernelType::REVERSE_SEQUENCE);
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
namespace kernel_selector {
static ROIPoolingKernelBase::DispatchData SetDefault(const roi_pooling_params& params) {
- ROIPoolingKernelBase::DispatchData kd;
-
- kd.fp16UnitUsed = (params.inputs[0].GetDType() == Datatype::F16);
+ ROIPoolingKernelBase::DispatchData dispatchData;
// Determine global work sizes.
- kd.gws0 = params.output.LogicalSize();
- kd.gws1 = 1;
- kd.gws2 = 1;
+ dispatchData.gws[0] = params.output.LogicalSize();
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
// Find largest positive local work size that is divider for global work size.
- kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
- while (kd.gws0 % kd.lws0 != 0) {
- --kd.lws0;
+ dispatchData.lws[0] = std::min(std::max(dispatchData.gws[0], static_cast<size_t>(1)), static_cast<size_t>(32));
+ while (dispatchData.gws[0] % dispatchData.lws[0] != 0) {
+ --dispatchData.lws[0];
}
- kd.lws1 = 1;
- kd.lws2 = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- return kd;
+ return dispatchData;
}
JitConstants ROIPoolingKernelBase::GetJitConstants(const roi_pooling_params& rp) const {
return {};
}
- DispatchData runInfo = SetDefault(orgParams);
+ DispatchData dispatchData = SetDefault(orgParams);
KernelData kd = KernelData::Default<roi_pooling_params>(params);
auto cldnn_jit = GetJitConstants(orgParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
if (orgParams.mode == PoolType::DEFORMABLE_BILINEAR && !orgParams.no_trans)
kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2});
std::string order_str = order[0];
for (size_t i = 1; i < order.size(); i++)
order_str += ", " + order[i];
-
+
return order_str;
}
std::string FYX_indices_size = "(INPUT1_FEATURE_NUM * INPUT1_SIZE_Y * INPUT1_SIZE_X)";
std::string YX_indices_size = "(INPUT1_SIZE_Y * INPUT1_SIZE_X)";
std::string X_indices_size = "(INPUT1_SIZE_X)";
-
+
// Shift indices of ScatterUpdate updates input related to Indices dims
for (size_t i = default_order.size() - 1; i > (axis + indices_non_empty_dims - 1); i--)
default_order[i] = default_order[i - indices_non_empty_dims + 1];
}
CommonDispatchData ScatterUpdateKernelRef::SetDefault(const scatter_update_params& params, const optional_params&, bool is_second) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
const auto& output = params.output;
- std::vector<size_t> global(3);
const size_t indices_size = params.inputs[1].LogicalSize();
switch (params.inputs[0].GetLayout()) {
case DataLayout::bfyx:
- global = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
if (is_second) {
if (params.axis == ScatterUpdateAxis::BATCH)
- global[2] = indices_size * output.Feature().v;
+ dispatchData.gws[2] = indices_size * output.Feature().v;
else if (params.axis == ScatterUpdateAxis::FEATURE)
- global[2] = indices_size * output.Batch().v;
+ dispatchData.gws[2] = indices_size * output.Batch().v;
else if (params.axis == ScatterUpdateAxis::Y)
- global[1] = indices_size;
+ dispatchData.gws[1] = indices_size;
else
- global[0] = indices_size;
+ dispatchData.gws[0] = indices_size;
}
break;
case DataLayout::bfzyx:
- global = {output.X().v * output.Y().v, output.Z().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v * output.Y().v, output.Z().v, output.Feature().v * output.Batch().v};
if (is_second) {
if (params.axis == ScatterUpdateAxis::BATCH)
- global[2] = indices_size * output.Feature().v;
+ dispatchData.gws[2] = indices_size * output.Feature().v;
else if (params.axis == ScatterUpdateAxis::FEATURE)
- global[2] = indices_size * output.Batch().v;
+ dispatchData.gws[2] = indices_size * output.Batch().v;
else if (params.axis == ScatterUpdateAxis::Z)
- global[1] = indices_size;
+ dispatchData.gws[1] = indices_size;
else if (params.axis == ScatterUpdateAxis::Y)
- global[0] = indices_size * output.X().v;
+ dispatchData.gws[0] = indices_size * output.X().v;
else
- global[0] = indices_size * output.Y().v;
+ dispatchData.gws[0] = indices_size * output.Y().v;
}
break;
case DataLayout::bfwzyx:
- global = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
+ dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
if (is_second) {
if (params.axis == ScatterUpdateAxis::BATCH)
- global[2] = indices_size * output.Feature().v;
+ dispatchData.gws[2] = indices_size * output.Feature().v;
else if (params.axis == ScatterUpdateAxis::FEATURE)
- global[2] = indices_size * output.Batch().v;
+ dispatchData.gws[2] = indices_size * output.Batch().v;
else if (params.axis == ScatterUpdateAxis::Z)
- global[1] = indices_size * output.W().v;
+ dispatchData.gws[1] = indices_size * output.W().v;
else if (params.axis == ScatterUpdateAxis::W)
- global[1] = indices_size * output.Z().v;
+ dispatchData.gws[1] = indices_size * output.Z().v;
else if (params.axis == ScatterUpdateAxis::Y)
- global[0] = indices_size * output.X().v;
+ dispatchData.gws[0] = indices_size * output.X().v;
else
- global[0] = indices_size * output.Y().v;
+ dispatchData.gws[0] = indices_size * output.Y().v;
}
break;
default: break;
}
-
- std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- return runInfo;
+ return dispatchData;
}
static std::string GetOutputIndexOnAxis(const scatter_update_params& params, size_t axis) {
const scatter_update_params& orgParams = static_cast<const scatter_update_params&>(params);
const size_t indices_size = orgParams.inputs[1].LogicalSize();
int start_with_iteration = 0;
-
+
// if dim of output along axis is equal to logical size of indices, we miss copying kernel
if (orgParams.inputs[0].Extract(orgParams.inputs[0].GetLayout(), Tensor::DataChannelName(orgParams.axis), orgParams.inputs[0].GetDims()).v == indices_size) {
start_with_iteration = 1;
auto cldnn_jit = GetJitConstants(newParams);
for (int i = start_with_iteration; i < 2; i++) {
- auto runInfo = SetDefault(newParams, options, (i == 1));
+ auto dispatchData = SetDefault(newParams, options, (i == 1));
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
if (i == 1){
clKernelData& kernel = kd.kernels[i - start_with_iteration];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 3, GetFusedPrimitiveInputsCount(params));
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 3, GetFusedPrimitiveInputsCount(params));
}
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
+
return {kd};
}
} // namespace kernel_selector
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
SelectKernelBase::DispatchData SelectKernelBase::SetDefault(const select_params& params) const {
- DispatchData kd;
+ DispatchData dispatchData;
const auto& out = params.output;
gws.push_back(1U);
}
- kd.gws0 = gws[0];
- kd.gws1 = gws[1];
- kd.gws2 = gws[2] * gws[3];
+ dispatchData.gws[0] = gws[0];
+ dispatchData.gws[1] = gws[1];
+ dispatchData.gws[2] = gws[2] * gws[3];
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes({kd.gws0, kd.gws1, kd.gws2}, params.engineInfo);
- kd.lws0 = local[0];
- kd.lws1 = local[1];
- kd.lws2 = local[2];
-
- return kd;
+ return dispatchData;
}
KernelsData SelectKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options) const {
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
- DispatchData runInfo = SetDefault(newParams);
+ DispatchData dispatchData = SetDefault(newParams);
auto& kernel = kd.kernels[0];
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false);
CommonDispatchData ShuffleChannelsKernelRef::SetDefault(const shuffle_channels_params& params,
const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = {params.output.Batch().v,
- params.output.Feature().v,
- params.output.Y().v * params.output.X().v};
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Y().v * params.output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants ShuffleChannelsKernelRef::GetJitConstants(const shuffle_channels_params& params) const {
assert(params.GetType() == KernelType::SHUFFLE_CHANNELS);
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
}
-JitConstants SoftmaxItemsClassKernelBase::GetJitConstants(const softmax_params& params, DispatchData kd) const {
- auto jit = SoftmaxKernelBase::GetJitConstants(params, kd);
+JitConstants SoftmaxItemsClassKernelBase::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
+ auto jit = SoftmaxKernelBase::GetJitConstants(params, dispatchData);
switch (params.dim) {
case SoftmaxDim::X:
virtual ~SoftmaxItemsClassKernelBase() {}
protected:
- JitConstants GetJitConstants(const softmax_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const softmax_params& params, DispatchData dispatchData) const override;
static ParamsKey GetDefaultSupportedKey();
static std::vector<size_t> GetSoftmaxDimGlobalSizes(SoftmaxDim dim, const DataTensor& output);
};
namespace kernel_selector {
JitConstants SoftmaxKernelBase::GetJitConstants(const softmax_params& params,
- SoftmaxKernelBase::DispatchData kd) const {
+ SoftmaxKernelBase::DispatchData dispatchData) const {
JitConstants mem_consts = MakeBaseParamsJitConstants(params);
mem_consts.AddConstants({MakeJitConstant("ALONG_" + toString(params.dim), "")});
mem_consts.AddConstants({
- MakeJitConstant("ITEMS_NUM", kd.itemsNum),
- MakeJitConstant("LWS", kd.lws0),
- MakeJitConstant("GWS", kd.gws0),
- MakeJitConstant("DATA_SETS_COUNT", kd.dataSetsCount),
- MakeJitConstant("DATA_SET_SIZE", kd.dataSetSize),
- MakeJitConstant("LEFTOVERS", kd.leftovers),
+ MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
+ MakeJitConstant("LWS", dispatchData.lws[0]),
+ MakeJitConstant("GWS", dispatchData.gws[0]),
+ MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
+ MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
+ MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
});
return mem_consts;
}
-SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_params& params,
+SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_params&,
const optional_params&) const {
- DispatchData runInfo;
+ DispatchData dispatchData;
- runInfo.gws0 = 1;
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
- runInfo.leftovers = 0;
- runInfo.itemsNum = 0;
- runInfo.normIndex = 0;
- runInfo.dataSetsCount = 0;
- runInfo.dataSetSize = 0;
+ dispatchData.leftovers = 0;
+ dispatchData.itemsNum = 0;
+ dispatchData.normIndex = 0;
+ dispatchData.dataSetsCount = 0;
+ dispatchData.dataSetSize = 0;
- return runInfo;
+ return dispatchData;
}
bool SoftmaxKernelBase::Validate(const Params& p, const optional_params& o) const {
const softmax_params& orgParams = static_cast<const softmax_params&>(params);
KernelData kd = KernelData::Default<softmax_params>(params);
- auto runInfo = SetDefault(orgParams, options);
- auto cldnn_jit = GetJitConstants(orgParams, runInfo);
+ auto dispatchData = SetDefault(orgParams, options);
+ auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
- kd.estimatedTime = runInfo.efficiency;
+ kd.estimatedTime = dispatchData.efficiency;
return {kd};
}
const optional_params& options) const {
const auto& input = params.inputs[0];
- DispatchData kd = Parent::SetDefault(params, options);
+ DispatchData dispatchData = Parent::SetDefault(params, options);
auto flatten_input = input.FlattenFeatureAndSpatials();
- kd.dataSetSize = flatten_input.Feature().v;
- kd.dataSetsCount = input.Batch().v;
+ dispatchData.dataSetSize = flatten_input.Feature().v;
+ dispatchData.dataSetsCount = input.Batch().v;
- return kd;
+ return dispatchData;
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
protected:
virtual bool Validate(const Params&, const optional_params&) const;
- virtual JitConstants GetJitConstants(const softmax_params& params, DispatchData kd) const;
+ virtual JitConstants GetJitConstants(const softmax_params& params, DispatchData dispatchData) const;
virtual DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params& optParams) const;
};
bool Validate(const Params&, const optional_params&) const override;
DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softmax_params& params,
const optional_params& optParams) const {
- auto kd = Parent::SetDefault(params, optParams);
+ auto dispatchData = Parent::SetDefault(params, optParams);
// start with 1 thread per data set
- kd.gws0 = 1;
- kd.gws1 = kd.dataSetsCount;
- kd.itemsNum = kd.dataSetSize;
+ dispatchData.gws[0] = 1;
+ dispatchData.gws[1] = dispatchData.dataSetsCount;
+ dispatchData.itemsNum = dispatchData.dataSetSize;
- kd.normIndex = 0;
+ dispatchData.normIndex = 0;
// We have two units of data per work item in current implementation.
- auto local_mem_per_wi = 2 * (kd.fp16UnitUsed ? sizeof(short) : sizeof(float));
+ auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType());
// Combining device execution and local memory restrictions to compute maximum possible LWS.
auto max_lws = std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi);
- kd.lws0 = 1;
+ dispatchData.lws[0] = 1;
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
// reads.
- while ((kd.itemsNum > 32 || kd.lws0 < kd.itemsNum) && (2 * kd.lws0 <= max_lws)) {
- kd.lws0 *= 2;
- kd.itemsNum /= 2;
+ while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
+ dispatchData.lws[0] *= 2;
+ dispatchData.itemsNum /= 2;
}
- assert((kd.itemsNum + 1) * kd.lws0 >= kd.dataSetSize && "More than 'lws0' items per batch remains! Lws too small?");
+ assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
- kd.gws0 = kd.lws0;
- kd.leftovers = kd.dataSetSize % kd.lws0;
+ dispatchData.gws[0] = dispatchData.lws[0];
+ dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
- assert(kd.itemsNum > 0 && kd.lws0 && kd.gws0 > 0);
+ assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
- kd.efficiency = FORCE_PRIORITY_6;
- return kd;
+ dispatchData.efficiency = FORCE_PRIORITY_6;
+ return dispatchData;
}
KernelsData SoftmaxKernel_bf::GetKernelsData(const Params& params, const optional_params& optionalParams) const {
SoftmaxKernel_fb::Parent::DispatchData SoftmaxKernel_fb::SetDefault(const softmax_params& params,
const optional_params& optParams) const {
- auto kd = Parent::SetDefault(params, optParams);
+ auto dispatchData = Parent::SetDefault(params, optParams);
// start with 1 thread per data set
- kd.gws0 = kd.dataSetsCount;
- kd.gws1 = 1;
- kd.itemsNum = kd.dataSetSize;
+ dispatchData.gws[0] = dispatchData.dataSetsCount;
+ dispatchData.gws[1] = 1;
+ dispatchData.itemsNum = dispatchData.dataSetSize;
- kd.normIndex = 1;
+ dispatchData.normIndex = 1;
// We have two units of data per work item in current implementation.
- auto local_mem_per_wi = 2 * (kd.fp16UnitUsed ? sizeof(short) : sizeof(float));
+ auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType());
// Combining device execution and local memory restrictions to compute maximum possible LWS.
auto max_lws = static_cast<std::size_t>(
std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi));
- kd.lws0 = std::min(kd.dataSetsCount, max_lws);
+ dispatchData.lws[0] = std::min(dispatchData.dataSetsCount, max_lws);
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
// reads.
- while ((kd.itemsNum > 32 || kd.lws0 < kd.itemsNum) && (2 * kd.lws0 <= max_lws)) {
- kd.lws0 *= 2;
- kd.itemsNum /= 2;
+ while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
+ dispatchData.lws[0] *= 2;
+ dispatchData.itemsNum /= 2;
}
- kd.gws0 = kd.lws0;
- kd.gws1 = 1;
- kd.leftovers = (kd.dataSetSize * kd.dataSetsCount) % kd.lws0;
+ dispatchData.gws[0] = dispatchData.lws[0];
+ dispatchData.gws[1] = 1;
+ dispatchData.leftovers = (dispatchData.dataSetSize * dispatchData.dataSetsCount) % dispatchData.lws[0];
- assert(kd.itemsNum > 0 && kd.lws0 && kd.gws0 > 0);
+ assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
- kd.efficiency = FORCE_PRIORITY_6;
- return kd;
+ dispatchData.efficiency = FORCE_PRIORITY_6;
+ return dispatchData;
}
bool kernel_selector::SoftmaxKernel_fb::Validate(const Params& params, const optional_params& o) const {
const auto& softmax_params = static_cast<const kernel_selector::softmax_params&>(params);
- auto kd = Parent::SetDefault(softmax_params, o);
- auto local_mem_per_wi = 2 * (kd.fp16UnitUsed ? sizeof(short) : sizeof(float));
+ auto local_mem_per_wi = 2 * BytesPerElement(softmax_params.inputs[0].GetDType());
auto max_lws = static_cast<std::size_t>(
std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi));
SoftmaxKerneItemsClassOptimized::Parent::DispatchData SoftmaxKerneItemsClassOptimized::SetDefault(
const softmax_params& params,
const optional_params& optParams) const {
- auto runInfo = Parent::SetDefault(params, optParams);
+ auto dispatchData = Parent::SetDefault(params, optParams);
auto& input = params.inputs[0];
break;
}
- runInfo.gws0 = global[0];
- runInfo.gws1 =
- global[1] * workitems_per_classes; // we multiply it by workitems_per_classes because we split computations of
- // one "full item classes output" into multiple workitems by "full item
- // classes output" i mean N outputs where N is number of item classes.
- runInfo.gws2 = global[2];
+ dispatchData.gws[0] = global[0];
+ dispatchData.gws[1] = global[1] * workitems_per_classes; // we multiply it by workitems_per_classes because we split computations of
+ // one "full item classes output" into multiple workitems by "full item
+ // classes output" i mean N outputs where N is number of item classes.
+ dispatchData.gws[2] = global[2];
- runInfo.lws0 = 1;
- runInfo.lws1 = workitems_per_classes;
- runInfo.lws2 = 1;
+ dispatchData.lws = { 1, workitems_per_classes, 1 };
- runInfo.leftovers = item_class_count % workitems_per_classes;
+ dispatchData.leftovers = item_class_count % workitems_per_classes;
if (item_class_count >= 32) {
- runInfo.efficiency = FORCE_PRIORITY_7;
+ dispatchData.efficiency = FORCE_PRIORITY_7;
} else {
- runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
- return runInfo;
+ return dispatchData;
}
-JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData kd) const {
- auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, kd);
+JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
+ auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.bIMADSupport));
const optional_params& options) const {
return GetCommonKernelsData(params, options);
}
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
ParamsKey GetSupportedKey() const override;
protected:
- JitConstants GetJitConstants(const softmax_params& params, DispatchData kd) const override;
+ JitConstants GetJitConstants(const softmax_params& params, DispatchData dispatchData) const override;
DispatchData SetDefault(const softmax_params& params, const optional_params& optParams) const override;
};
-} // namespace kernel_selector
\ No newline at end of file
+} // namespace kernel_selector
SoftmaxKernelRef::Parent::DispatchData SoftmaxKernelRef::SetDefault(const softmax_params& params,
const optional_params& optParams) const {
- auto runInfo = Parent::SetDefault(params, optParams);
+ auto dispatchData = Parent::SetDefault(params, optParams);
- const auto global = GetSoftmaxDimGlobalSizes(params.dim, params.output);
+ dispatchData.gws = GetSoftmaxDimGlobalSizes(params.dim, params.output);
- assert(global.size() == 3);
+ assert(dispatchData.gws.size() == 3);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
+ dispatchData.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
- return runInfo;
+ return dispatchData;
}
KernelsData SoftmaxKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
CommonDispatchData SpaceToBatchKernelBase::SetDefault(const space_to_batch_params& params, const optional_params&) const {
const auto& out = params.output;
- CommonDispatchData runInfo;
- std::vector<size_t> global;
- std::vector<size_t> local;
-
+ CommonDispatchData dispatchData;
if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 && out.Feature().v % 16 == 0) {
- global = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
- local = {1, 16, 1};
+ dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
+ dispatchData.lws = {1, 16, 1};
} else {
- global = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
- local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+ dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
}
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants SpaceToBatchKernelBase::GetJitConstants(const space_to_batch_params& params) const {
return {};
}
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
"", false, false, 1, GetFusedPrimitiveInputsCount(params));
kd.estimatedTime = estimatedTime;
CommonDispatchData SpaceToDepthKernelRef::SetDefault(const space_to_depth_params& params,
const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
- std::vector<size_t> global = {params.output.Batch().v,
- params.output.Feature().v,
- params.output.Z().v * params.output.Y().v * params.output.X().v};
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Z().v * params.output.Y().v * params.output.X().v };
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
- runInfo.gws0 = global[0];
- runInfo.gws1 = global[1];
- runInfo.gws2 = global[2];
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants SpaceToDepthKernelRef::GetJitConstants(const space_to_depth_params& params) const {
return {};
}
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point,
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params));
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
// If the new_axis_mask is set, then begin, end, and stride are ignored
// and a new length 1 dimension is adding. Input data just copying to output
// TODO: remove data copying in case where only shape size changing
- std::vector<size_t> gws = {params.output.Batch().v, params.output.Feature().v,
- params.output.Z().v * params.output.Y().v * params.output.X().v};
+ dispatchData.gws = { params.output.Batch().v,
+ params.output.Feature().v,
+ params.output.Z().v * params.output.Y().v * params.output.X().v };
- auto lws = GetOptimalLocalWorkGroupSizes(gws, params.engineInfo);
+ dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
- runInfo.gws0 = gws[0];
- runInfo.gws1 = gws[1];
- runInfo.gws2 = gws[2];
-
- runInfo.lws0 = lws[0];
- runInfo.lws1 = lws[1];
- runInfo.lws2 = lws[2];
-
- return runInfo;
+ return dispatchData;
}
JitConstants StridedSliceKernelRef::GetJitConstants(const strided_slice_params& params) const {
assert(params.GetType() == KernelType::STRIDED_SLICE);
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
CommonDispatchData TileKernelRef::SetDefault(const tile_params& params, const optional_params&) const {
- CommonDispatchData runInfo;
+ CommonDispatchData dispatchData;
auto in = params.inputs[0];
}
if (inner_size > 1) {
- runInfo.gws0 = outer_size;
- runInfo.gws1 = inner_size;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = outer_size;
+ dispatchData.gws[1] = inner_size;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 1;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
} else {
- runInfo.gws0 = Align(outer_size, 16);
- runInfo.gws1 = 1;
- runInfo.gws2 = 1;
+ dispatchData.gws[0] = Align(outer_size, 16);
+ dispatchData.gws[1] = 1;
+ dispatchData.gws[2] = 1;
- runInfo.lws0 = 16;
- runInfo.lws1 = 1;
- runInfo.lws2 = 1;
+ dispatchData.lws[0] = 16;
+ dispatchData.lws[1] = 1;
+ dispatchData.lws[2] = 1;
}
- runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
- return runInfo;
+ return dispatchData;
}
JitConstants TileKernelRef::GetJitConstants(const tile_params& params) const {
KernelData kd = KernelData::Default<tile_params>(params);
tile_params& newParams = *static_cast<tile_params*>(kd.params.get());
- auto runInfo = SetDefault(newParams, options);
+ auto dispatchData = SetDefault(newParams, options);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
auto cldnn_jit = GetJitConstants(newParams);
std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
- FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+ FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
return kernel_string;
}
-static void Check_RunInfoData(const std::string& kernelName, const kernel_selector::CommonDispatchData& runInfo) {
- if (runInfo.lws0 * runInfo.lws1 * runInfo.lws2 > 256) {
- std::cout << "ERROR: dispatch data for kernel: " << kernelName << " LWS cannot be greater than 256!\n"
- << std::endl;
- }
- if (runInfo.gws0 == 0 || runInfo.gws1 == 0 || runInfo.gws2 == 0 || runInfo.lws0 == 0 || runInfo.lws1 == 0 ||
- runInfo.lws2 == 0) {
- std::cout << "ERROR: dispatch data for kernel: " << kernelName << " dispatch data cannot contain zeros!"
- << std::endl;
- }
- if (runInfo.gws0 % runInfo.lws0 != 0) {
- std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS0: " << runInfo.gws0
- << " LWS0: " << runInfo.lws0 << std::endl;
- }
- if (runInfo.gws1 % runInfo.lws1 != 0) {
- std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS1: " << runInfo.gws1
- << " LWS1: " << runInfo.lws1 << std::endl;
- }
- if (runInfo.gws2 % runInfo.lws2 != 0) {
- std::cout << "ERROR: dispatch data for kernel: " << kernelName << " is incorrect: GWS2: " << runInfo.gws2
- << " LWS2: " << runInfo.lws2 << std::endl;
- }
-}
-
uint32_t common_kernel_base::GetFusedPrimitiveInputsCount(const Params ¶ms) const {
auto p = dynamic_cast<const base_params&>(params);
uint32_t fused_deps_total = 0;
}
void common_kernel_base::FillCLKernelData(clKernelData& kernel,
- const CommonDispatchData& runInfo,
+ const CommonDispatchData& dispatchData,
const EngineInfo& engine_info,
const std::string& kernelMapName,
const std::string& jit,
bool bias,
int number_of_inputs,
uint32_t number_of_inputs_for_fused_prims) const {
- Check_RunInfoData(kernelMapName, runInfo);
- kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2};
- kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
+ CheckDispatchData(kernelMapName, dispatchData);
+ kernel.workGroups.global = dispatchData.gws;
+ kernel.workGroups.local = dispatchData.lws;
kernel.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
- kernel.arguments =
- GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims);
+ kernel.arguments = GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims);
}
} // namespace kernel_selector
#include <vector>
namespace kernel_selector {
-struct CommonDispatchData {
- // TODO: change it to std::vector<size_t>
- size_t gws0, gws1, gws2;
- size_t lws0, lws1, lws2;
- bool
- fp16UnitUsed; ///< Value indicating that FP16 half precision floating point type will be used (instead of single precision).
- float efficiency;
-
- CommonDispatchData() : gws0(0), gws1(0), gws2(0), lws0(0), lws1(0), lws2(0), fp16UnitUsed(false), efficiency(0.0f){}
-};
class common_kernel_base : public KernelBase {
public:
uint32_t GetFusedPrimitiveInputsCount(const Params ¶ms) const;
void FillCLKernelData(clKernelData& kernel,
- const CommonDispatchData& runInfo,
+ const CommonDispatchData& dispatchData,
const EngineInfo& engine_info,
const std::string& kernel_map_name,
const std::string& jit,
std::string calcFunction;
WeightIndexFuncDesc() = default;
- WeightIndexFuncDesc(const WeightsLayout l) {
+ WeightIndexFuncDesc(std::string tensor_name, const WeightsLayout l) {
+ const auto layout_name = toString(l);
using args = std::initializer_list<std::string>;
- if (l == WeightsLayout::oiyx || l == WeightsLayout::oizyx || l == WeightsLayout::goiyx ||
+ if (l == WeightsLayout::oiyx ||
+ l == WeightsLayout::oizyx ||
+ l == WeightsLayout::goiyx ||
l == WeightsLayout::goizyx) {
args macroNameArgs = {"prefix", "g", "o", "i", "z", "y", "x"};
- const auto name = toString(l);
- this->calcFunction = FuncBody(name);
- this->macroName = MacroName(name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
this->macroBody = R"V0G0N( \
CAT(prefix, _OFFSET) + \
(x)*CAT(prefix, _X_PITCH) + \
} else if (l == WeightsLayout::os_is_yx_isv16_osv16 || l == WeightsLayout::os_is_zyx_isv16_osv16 ||
l == WeightsLayout::g_os_is_yx_isv16_osv16 || l == WeightsLayout::g_os_is_zyx_isv16_osv16) {
args macroNameArgs = {"prefix", "g", "o", "i", "z", "y", "x", "sub_group_size"};
- const auto name = toString(l);
- this->calcFunction = FuncBody(name);
- this->macroName = MacroName(name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
this->macroBody = R"V0G0N( \
CAT(prefix, _OFFSET) + \
(g)*CAT(prefix, _GROUPS_PITCH) + \
l == WeightsLayout::os_iyx_osv32__ai32 || l == WeightsLayout::g_os_iyx_osv16 ||
l == WeightsLayout::g_os_iyx_osv32) {
args macroNameArgs = {"prefix", "g", "o", "i", "y", "x", "sub_group_size"};
- const auto name = toString(l);
- this->calcFunction = FuncBody(name);
- this->macroName = MacroName(name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
this->macroBody = R"V0G0N( \
CAT(prefix, _OFFSET) + \
(g * CAT(prefix, _GROUPS_PITCH)) + \
} else if (l == WeightsLayout::is_os_yx_isv16_osv16 || l == WeightsLayout::is_os_zyx_isv16_osv16 ||
l == WeightsLayout::g_is_os_yx_isv16_osv16 || l == WeightsLayout::g_is_os_zyx_isv16_osv16) {
args macroNameArgs = {"prefix", "g", "o", "i", "z", "y", "x", "sub_group_size"};
- const auto name = toString(l);
- this->calcFunction = FuncBody(name);
- this->macroName = MacroName(name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
this->macroBody = R"V0G0N( \
CAT(prefix, _OFFSET) + \
(g)*CAT(prefix, _GROUPS_PITCH) + \
l == WeightsLayout::os_is_zyx_osv64_isv16) {
args macroNameArgs = {"prefix", "o", "i", "z", "y", "x"};
args funcArgs = {"o", "i", "z", "y", "x", "x_size", "y_size", "z_size", "i_size", "o_size", "osv_size", "isv_size"};
- const auto name = toString(l);
const auto body = R"V0G0N( \
const uint isv = i % isv_size; \
const uint osv = o % osv_size; \
os * os_pitch; \
return output_offset; \
)V0G0N";
- this->macroName = MacroName(name, macroNameArgs);
- this->calcFunction = FuncBody(name, funcArgs, body);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name, funcArgs, body);
if (l == WeightsLayout::os_is_yx_osv16_isv16)
- this->macroBody = FuncCall(name, {"o", "i", "0", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), "1", Cat("_IFM_NUM"), Cat("_OFM_NUM"), "16", "16"});
+ this->macroBody = FuncCall(layout_name, {"o", "i", "0", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), "1", Cat("_IFM_NUM"), Cat("_OFM_NUM"), "16", "16"});
else if (l == WeightsLayout::os_is_zyx_osv32_isv16)
- this->macroBody = FuncCall(name, {"o", "i", "z", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), Cat("_SIZE_Z"), Cat("_IFM_NUM"), Cat("_OFM_NUM"), "32", "16"});
+ this->macroBody = FuncCall(layout_name, {"o", "i", "z", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), Cat("_SIZE_Z"), Cat("_IFM_NUM"), Cat("_OFM_NUM"), "32", "16"});
else if (l == WeightsLayout::os_is_zyx_osv64_isv16)
- this->macroBody = FuncCall(name, {"o", "i", "z", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), Cat("_SIZE_Z"), Cat("_IFM_NUM"), Cat("_OFM_NUM"), "64", "16"});
+ this->macroBody = FuncCall(layout_name, {"o", "i", "z", "y", "x", Cat("_SIZE_X"), Cat("_SIZE_Y"), Cat("_SIZE_Z"), Cat("_IFM_NUM"), Cat("_OFM_NUM"), "64", "16"});
} else if (l == WeightsLayout::g_os_zyx_is_osv16_isv16 || l == WeightsLayout::g_os_zyx_is_osv16_isv32 ||
l == WeightsLayout::g_os_zyx_is_osv32_isv16 || l == WeightsLayout::g_os_zyx_is_osv32_isv32) {
args macroNameArgs = {"prefix", "g", "o", "i", "z", "y", "x"};
args funcArgs = {"g", "o", "i", "z", "y", "x", "g_size", "o_size", "i_size", "z_size", "y_size", "x_size", "osv", "isv"};
- const auto name = toString(l);
const auto body = R"V0G0N( \
uint is_size = (i_size + isv - 1) / isv; \
uint os_size = (o_size + osv - 1) / osv; \
index += g * g_pitch; \
return index; \
)V0G0N";
- this->macroName = MacroName(name, macroNameArgs);
- this->calcFunction = FuncBody(name, funcArgs, body);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name, funcArgs, body);
std::string osv = "16", isv = "16";
if (l == WeightsLayout::g_os_zyx_is_osv16_isv16) {
osv = "16"; isv = "16";
} else if (l == WeightsLayout::g_os_zyx_is_osv32_isv32) {
osv = "32"; isv = "32";
}
- this->macroBody = FuncCall(name, {"g", "o", "i", "z", "y", "x", Cat("_GROUPS_NUM"), Cat("_OFM_NUM"), Cat("_IFM_NUM"), Cat("_SIZE_Z"),
- Cat("_SIZE_Y"), Cat("_SIZE_X"), osv, isv});
+ this->macroBody = FuncCall(layout_name, {"g", "o", "i", "z", "y", "x", Cat("_GROUPS_NUM"), Cat("_OFM_NUM"), Cat("_IFM_NUM"), Cat("_SIZE_Z"),
+ Cat("_SIZE_Y"), Cat("_SIZE_X"), osv, isv});
} else if (l == WeightsLayout::os_is_yx_osv16_isv4 || l == WeightsLayout::os_is_yx_osv32_isv4) {
args macroNameArgs = {"prefix", "o", "i", "y", "x"};
args funcArgs = {"o", "i", "y", "x", "i_size", "o_size", "x_size", "otd"};
- const auto name = toString(l);
const auto body = R"V0G0N( \
uint out_depth_tile = o / otd; \
uint od = o - out_depth_tile * otd; \
+ id; \
return idx; \
)V0G0N";
- this->macroName = MacroName(name, macroNameArgs);
- this->calcFunction = FuncBody(name, funcArgs, body);
+ this->macroName = MacroName(tensor_name, layout_name, macroNameArgs);
+ this->calcFunction = FuncBody(layout_name, funcArgs, body);
if (l == WeightsLayout::os_is_yx_osv16_isv4)
- this->macroBody = FuncCall(name, {"o", "i", "y", "x", Cat("_IFM_PITCH"), Cat("_OFM_PITCH"), Cat("_SIZE_X"), "16"});
+ this->macroBody = FuncCall(layout_name, {"o", "i", "y", "x", Cat("_IFM_PITCH"), Cat("_OFM_PITCH"), Cat("_SIZE_X"), "16"});
else if (l == WeightsLayout::os_is_yx_osv32_isv4)
- this->macroBody = FuncCall(name, {"o", "i", "y", "x", Cat("_IFM_PITCH"), Cat("_OFM_PITCH"), Cat("_SIZE_X"), "32"});
+ this->macroBody = FuncCall(layout_name, {"o", "i", "y", "x", Cat("_IFM_PITCH"), Cat("_OFM_PITCH"), Cat("_SIZE_X"), "32"});
} else {
// throw error?
}
return "FUNC_CALL(" + name + ")(" + args_str + ")";
}
+ // Builds the index-macro signature string "GET_<tensor>_<layout>_INDEX(arg0, ..., argN)".
+ // The added tensor_name parameter lets different weight tensors (e.g. primary
+ // weights vs. fused-op weights) receive distinct, non-colliding macros.
- static const std::string MacroName(std::string name, std::initializer_list<std::string> args) {
+ static const std::string MacroName(std::string tensor_name, std::string layout_name, std::initializer_list<std::string> args) {
 std::string args_str = "";
 size_t counter = 0;
+ // Comma-join the argument names; no trailing separator after the last one.
 for (auto& arg : args)
 args_str += (++counter == args.size()) ? (arg) : (arg + ", ");
- return "GET_WEIGHTS_" + name + "_INDEX(" + args_str + ")";
+ return "GET_" + tensor_name + "_" + layout_name + "_INDEX(" + args_str + ")";
 }
static const std::string FuncBody(std::string name, std::initializer_list<std::string> args = {}, std::string body = "return 0;") {
std::string index_func_val;
auto layout = _tensor.GetLayout();
- WeightIndexFuncDesc indexFuncDesc {layout};
+ auto layout_str = toString(layout);
+ WeightIndexFuncDesc indexFuncDesc{_name, layout};
+ std::string called_func_name = "GET_" + _name + "_" + layout_str + "_INDEX";
if (WeightsTensor::DoesGroupDimExist(layout)) {
if (WeightsTensor::ChannelsCount(layout) <= 5) {
std::vector<Tensor::WeightsChannelName> grouped_4d_channels = {
bool is_grouped_4d_layout = is_common_nd_layout(grouped_4d_channels, layout);
if (is_grouped_4d_layout) {
index_macro_name = _name + "_GET_INDEX(g, o, i, y, x)";
- auto layout_str = toString(layout);
if (layout == WeightsLayout::goiyx)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, 0, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, 0, y, x)";
else if (layout == WeightsLayout::g_os_is_yx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, 0, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, 0, y, x, 16)";
else if (layout == WeightsLayout::g_os_iyx_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, y, x, 16)";
else if (layout == WeightsLayout::g_is_os_yx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, 0, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, 0, y, x, 16)";
} else {
assert(0);
}
bool is_grouped_5d_layout = is_common_nd_layout(grouped_5d_channels, layout);
if (is_grouped_5d_layout) {
index_macro_name = _name + "_GET_INDEX(g, o, i, z, y, x)";
- auto layout_str = toString(layout);
if (layout == WeightsLayout::goizyx)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, z, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, z, y, x)";
else if (layout == WeightsLayout::g_os_is_zyx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, z, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, z, y, x, 16)";
else if (layout == WeightsLayout::g_is_os_zyx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", g, o, i, z, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", g, o, i, z, y, x, 16)";
} else {
assert(0);
}
bool is_common_4d_layout = is_common_nd_layout(base_4d_channels, layout);
if (is_common_4d_layout) {
index_macro_name = _name + "_GET_INDEX(o, i, y, x)";
- auto layout_str = toString(layout);
if (layout == WeightsLayout::oiyx)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, 0, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, 0, y, x)";
else if (layout == WeightsLayout::os_is_yx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, 0, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, 0, y, x, 16)";
else if (layout == WeightsLayout::os_iyx_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, y, x, 16)";
else if (layout == WeightsLayout::os_iyx_osv32 || layout == WeightsLayout::os_iyx_osv32__ai32)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, y, x, 32)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, y, x, 32)";
else if (layout == WeightsLayout::is_os_yx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, 0, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, 0, y, x, 16)";
else if (layout == WeightsLayout::os_is_yx_osv16_isv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", o, i, 0, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", o, i, 0, y, x)";
} else {
assert(0);
}
bool is_common_5d_layout = is_common_nd_layout(base_5d_channels, layout);
if (is_common_5d_layout) {
index_macro_name = _name + "_GET_INDEX(o, i, z, y, x)";
- auto layout_str = toString(layout);
if (layout == WeightsLayout::oizyx)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, z, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, z, y, x)";
else if (layout == WeightsLayout::os_is_zyx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, z, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, z, y, x, 16)";
else if (layout == WeightsLayout::is_os_zyx_isv16_osv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", 0, o, i, z, y, x, 16)";
+ index_func_val = called_func_name + "(" + _name + ", 0, o, i, z, y, x, 16)";
else if (layout == WeightsLayout::os_is_zyx_osv32_isv16 || layout == WeightsLayout::os_is_zyx_osv64_isv16)
- index_func_val = "GET_WEIGHTS_" + layout_str + "_INDEX(" + _name + ", o, i, z, y, x)";
+ index_func_val = called_func_name + "(" + _name + ", o, i, z, y, x)";
} else {
assert(0);
}
#include "kernel_base.h"
+#include <sstream>
+
namespace kernel_selector {
const primitive_db KernelBase::db;
thread_local size_t KernelBase::counter = 0;
+// Human-readable dump of dispatch sizes, e.g. "GWS(3): 64 32 1 LWS(3): 16 1 1 ".
+// Used by CheckDispatchData when composing diagnostic messages.
+std::string toString(const kernel_selector::CommonDispatchData& dispatchData) {
+    // Bind by const reference: plain `auto` here copied both vectors just to print them.
+    const auto& gws = dispatchData.gws;
+    const auto& lws = dispatchData.lws;
+    std::stringstream os;
+    os << "GWS(" << gws.size() << "): ";
+    for (auto e : gws) {
+        os << e << " ";
+    }
+    os << "LWS(" << lws.size() << "): ";
+    for (auto e : lws) {
+        os << e << " ";
+    }
+    return os.str();
+}
+
+// Validates dispatch data before it is handed to the OpenCL runtime:
+//  - GWS and LWS must both be rank-3,
+//  - the total work-group size lws[0]*lws[1]*lws[2] must not exceed 256,
+//  - no GWS/LWS dimension may be zero,
+//  - each GWS dimension must be divisible by the matching LWS dimension.
+// Throws std::runtime_error with a diagnostic that embeds the kernel name.
+void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData) {
+    // Fixed: the original message concatenated kernelName + ": " + ": LWS...",
+    // producing a doubled colon; now matches the format of the other messages.
+    if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
+        throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
+                                 ": LWS and GWS size is expected to be equal to 3. Actual: " +
+                                 toString(dispatchData));
+
+    // NOTE(review): 256 is a conservative lower bound on CL_DEVICE_MAX_WORK_GROUP_SIZE;
+    // confirm against EngineInfo if devices with larger limits should be allowed.
+    if (dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2] > 256) {
+        throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
+                                 ": LWS cannot be greater than 256. Actual: " +
+                                 toString(dispatchData));
+    }
+    for (size_t i = 0; i < dispatchData.gws.size(); i++) {
+        if (dispatchData.gws[i] == 0 || dispatchData.lws[i] == 0)
+            throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
+                                     ": Dispatch data cannot contain zeros. Actual: " +
+                                     toString(dispatchData));
+
+        // Required by clEnqueueNDRangeKernel: global size must be a multiple of local size.
+        if (dispatchData.gws[i] % dispatchData.lws[i] != 0)
+            throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
+                                     ": GWS must be divisible by corresponding LWS. Actual: " +
+                                     toString(dispatchData));
+    }
+}
+
static bool IsTypeUsedIn(Datatype type, const base_params& params) {
return params.output.GetDType() == type ||
std::any_of(params.inputs.begin(), params.inputs.end(), [=](const DataTensor& input) -> bool {
namespace kernel_selector {
using primitive_db = kernel_selector::gpu::cache::primitive_db;
+// Per-kernel dispatch configuration shared by all kernel implementations.
+struct CommonDispatchData {
+    // Global work sizes; expected rank 3 (see KernelBase::CheckDispatchData).
+    std::vector<size_t> gws;
+    // Local work sizes; expected rank 3 with product <= 256.
+    std::vector<size_t> lws;
+    // Relative score used when choosing between candidate kernels.
+    float efficiency;
+
+    // Zero-filled sizes are deliberately invalid (CheckDispatchData rejects
+    // zeros), forcing each kernel's SetDefault to assign real values.
+    CommonDispatchData() : gws({0, 0, 0}), lws({0, 0, 0}), efficiency(0.0f) {}
+};
+
+std::string toString(const kernel_selector::CommonDispatchData& dispatchData);
+
+// Stream insertion for CommonDispatchData; delegates to toString().
+// Takes the argument by const reference: the by-value original copied both
+// gws and lws vectors on every insertion (and misspelled the parameter name).
+static inline std::ostream &operator<<(std::ostream &os, const CommonDispatchData& dispatchData) {
+    return os << toString(dispatchData);
+}
+
class KernelBase {
public:
using FusedOpType = KernelType;
static const primitive_db db;
const std::string kernelName;
+ static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData);
static size_t UniqeID() { return counter++; } // TODO: use interlocked
virtual Datatype GetUnitType(const base_params& params) const;