/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
os_is_y_x8_osv8_isv4, ///< format for weights for 1x1 MMAD convolutions
os_is_y_x8_osv8_isv4_swizzled_by_4, ///< format for weights for 1x1 MMAD convolutions
os_is_yx_osv16_isv4, ///< format for weights for IMAD convolutions
+ os_is_zyx_osv16_isv16, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv4_swizzled_by_2, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv4, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv32p, ///< format for weights for binary convolutions
gs_oiyx_gsv32, ///< format used for weights for 2D convolution
g_is_os_zyx_osv16_isv16, ///< format used for grouped weights for blocked 3D deconvolution
g_os_is_yx_osv16_isv4,
+ g_os_is_zyx_osv16_isv16,
g_is_os_yx_osv16_isv16,
g_os_is_zyx_isv8_osv16_isv2,
g_os_is_yx_isv8_osv16_isv2,
{ b_fs_yx_32fp, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ b_fs_zyx_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{1, 16}}}},
{ bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16 }, {1, 16}}}},
- { bs_fs_yx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
+ { bs_fs_yx_bsv16_fsv16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
{ nv12, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ image_2d_rgba, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ is_o32_yx_isv32_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ os_is_y_x8_osv8_isv4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ os_is_y_x8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
- { os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 16}, {1, 4}}}},
+ { os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16}, {1, 4}}}},
+ { os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16}, {1, 16}}}},
{ os_is_yx_osv32_isv4_swizzled_by_2, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
{ os_is_yx_osv32_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
{ os_is_yx_osv32_isv32p, { 1, 1, 1, 0, 0, "bfxy", "bfxy?", {}}},
{ g_os_is_zyx_isv8_osv16_isv2, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{1, 8}, {0, 16}, {1, 2}}}},
{ g_os_is_yx_isv8_osv16_isv2, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{1, 8}, {0, 16}, {1, 2}}}},
{ g_os_is_zyx_isv16_osv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
- { g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfxy", "bfxy????g", {{0, 16}, {1, 4}}}},
+ { g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{0, 16}, {1, 4}}}},
+ { g_os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
{ g_os_zyx_is_osv16_isv4, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 4}}}},
{ g_os_zyx_is_osv16_isv16, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 16}}}},
{ g_os_zyx_is_osv16_isv32, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 32}}}},
/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
{ WeightsLayout::os_i_osv16__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_i_osv16, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_osv16_isv16, { 0, 1, -1, 2, 3, -1, -1, -1 } },
+ { WeightsLayout::os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
+ { WeightsLayout::g_os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, 5 } },
{ WeightsLayout::os_is_zyx_osv32_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osv64_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::i_yxs_os_yxsv2_osv16, { 1, 2, -1, 3, 0, -1, -1, -1 } },
NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l) {
std::vector<size_t> newDims = d;
- // TOOD: it's not the right pitches. it's here in order to calculate physical size
+ // TODO: It's not the right pitches. it's here in order to calculate physical size
switch (l) {
case os_iyx_osv16:
case os_iyx_osv16_rotate_180:
newDims[2] = RoundUp(newDims[2], 16);
newDims[3] = RoundUp(newDims[3], 16);
break;
+ case os_is_zyx_osv16_isv16:
+ assert(newDims.size() == 5);
+ newDims[3] = RoundUp(newDims[3], 16);
+ newDims[4] = RoundUp(newDims[4], 16);
+ break;
+ case g_os_is_zyx_osv16_isv16:
+ assert(newDims.size() == 6);
+ newDims[3] = RoundUp(newDims[3], 16);
+ newDims[4] = RoundUp(newDims[4], 16);
+ break;
case os_is_zyx_osv32_isv16:
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 32);
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
os_i_osv16__ai8,
os_i_osv16,
- os_is_yx_osv16_isv16, // wieghts for int8 blocked conv
+ os_is_yx_osv16_isv16, // weights for int8 blocked conv
+ os_is_zyx_osv16_isv16,
os_is_zyx_osv32_isv16,
os_is_zyx_osv64_isv16,
i_yxs_os_yxsv2_osv16,
g_os_is_zyx_isv8_osv16_isv2,
g_os_is_yx_isv8_osv16_isv2,
g_os_is_zyx_isv16_osv16,
+ g_os_is_zyx_osv16_isv16,
giy_xs_os_xsv2_osv16__ao32,
giy_xs_os_xsv2_osv8__ao32,
g_os_is_yx_isv16_osv16,
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
+ k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
+ k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
+#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
namespace kernel_selector {
-Convolution_kernel_b_fs_yx_fsv16_imad::BlockParams
-Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
+Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
+Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
constexpr float max_reg_pressure = 0.75f;
// TODO Investigate whether below algorithm for selecting optimal block params could be reduced to:
size_t block_features = simd;
{
size_t tmp_block_features = simd * 2;
- auto block2_params = BlockParams{ block_width, 1, tmp_block_features, in_block_width, 1, 1 };
+ auto block2_params = BlockParams{ block_width, 1, 1, tmp_block_features, in_block_width, 1, 1, 1 };
- bool c_mul_f = params.output.Feature().v % tmp_block_features == 0;
+ bool c_mul_f = params.weights.OFM().v % tmp_block_features == 0;
bool c_reg_pressure = EstimateRegPressure(params, block2_params) <= max_reg_pressure;
if (c_mul_f && c_reg_pressure) {
// If not enough occupancy try to perform feature split or/and block reduction
size_t feature_slm_split = 1;
- auto no_split_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, 1 };
+
+ auto no_split_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 1 };
+
if (EstimateOccupancy(params, no_split_params) < 1.f) {
// Temporary variables for possible reductions in block sizes
bool update_block_params = false;
// Feature split requires extra registers, so check if it can be done with current block sizes
bool can_split =
- EstimateRegPressure(params, BlockParams{ block_width, 1, block_features, in_block_width, 1, 2 }) <= max_reg_pressure;
+ EstimateRegPressure(params, BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 2 }) <= max_reg_pressure;
+
// Has the occupancy reached sufficient level
bool enough_occupancy = false;
// Reductions to reduce register pressure
// At most twice reduction in output block width is acceptable
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
- auto dummy_split_params = BlockParams{ w, 1, block_features, tmp_in_width, 1, 2 };
+ auto dummy_split_params = BlockParams{ w, 1, 1, block_features, tmp_in_width, 1, 1, 2 };
bool c_reg_pressure = EstimateRegPressure(params, dummy_split_params) <= max_reg_pressure;
bool c_mul_x = params.output.X().v % w == 0;
}
// Check if previous reductions haven't improved occupancy enough
{
- auto reduced_params = BlockParams{ split_block_width, 1, split_block_features, split_in_block_width, 1, 1 };
+ auto reduced_params = BlockParams{ split_block_width, 1, 1, split_block_features, split_in_block_width, 1, 1, 1 };
enough_occupancy = EstimateOccupancy(params, reduced_params) >= 1.f;
update_block_params = enough_occupancy;
}
if (can_split && !enough_occupancy) {
// TODO Try other split sizes
for (size_t split = 4; split < 5; ++split) {
- auto tmp_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, split };
+ auto tmp_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, split };
bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
// At most twice reduction in output block width is acceptable
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
- auto tmp_params = BlockParams{ w, 1, split_block_features, tmp_in_width, 1, feature_slm_split };
+ auto tmp_params = BlockParams{ w, 1, 1, split_block_features, tmp_in_width, 1, 1, feature_slm_split };
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
bool c_mul_x = params.output.X().v % w == 0;
}
}
- // Select biggest block height that fits into registers
+ // Select biggest block height and depth that fits into registers
size_t block_height = 1;
+ size_t block_depth = 1;
size_t in_block_height = 1;
- for (size_t h = 2; h < 16; ++h) {
- if (params.output.Y().v % h != 0)
- continue;
+ size_t in_block_depth = 1;
- size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
- auto tmp_params = BlockParams{ block_width, h, block_features, in_block_width, tmp_in_block_height, feature_slm_split };
-
- bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
- bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
- bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
+ bool break_external_loop = false;
+
+ for (size_t d = 1; d < 16; ++d) {
+ if (params.output.Z().v % d != 0)
+ continue;
+ for (size_t h = 2; h < 16; ++h) {
+ if (params.output.Y().v % h != 0)
+ continue;
+ size_t tmp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
+ size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
+ auto tmp_params = BlockParams{ block_width, h, d, block_features, in_block_width, tmp_in_block_height, tmp_in_block_depth, feature_slm_split };
+
+ bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
+ bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
+ bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
+
+ if (c_reg_pressure && c_occupancy && c_slm) {
+ block_height = h;
+ block_depth = d;
+ in_block_height = tmp_in_block_height;
+ in_block_depth = tmp_in_block_depth;
+ } else {
+ break_external_loop = true;
+ break;
+ }
+ }
- if (c_reg_pressure && c_occupancy && c_slm) {
- block_height = h;
- in_block_height = tmp_in_block_height;
- } else {
+ if (break_external_loop) {
break;
}
}
- return BlockParams{ block_width, block_height, block_features, in_block_width, in_block_height, feature_slm_split };
+ return BlockParams{ block_width, block_height, block_depth, block_features, in_block_width, in_block_height, in_block_depth, feature_slm_split };
}
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
size_t bytes_used = 0;
// accumulator
- size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_features;
+ size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_depth * block.output_block_features;
bytes_used += accumulator_elements * BytesPerElement(GetAccumulatorType(params));
// input block
- size_t input_block_elements = block.input_block_height * Align(block.input_block_width, simd) * fsv;
+ size_t input_block_elements = block.input_block_depth * block.input_block_height * Align(block.input_block_width, simd) * fsv;
bytes_used += input_block_elements * BytesPerElement(params.inputs[0].GetDType());
// weights block
size_t weights_block_elements = block.output_block_features * fsv;
bytes_used += weights_block_elements * BytesPerElement(params.weights.GetDType());
- // Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, filter_idx, etc.)
+ // Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, out_z, filter_idx, etc.)
constexpr size_t experimental_extra_regs = 8 * 32;
bytes_used += experimental_extra_regs;
return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
}
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
size_t blocks_w = CeilDiv(params.output.X().v, block.output_block_width);
size_t blocks_h = CeilDiv(params.output.Y().v, block.output_block_height);
- size_t blocks_f = CeilDiv(params.output.Feature().v, block.output_block_features) * block.feature_slm_split;
+ size_t blocks_d = CeilDiv(params.output.Z().v, block.output_block_depth);
+ size_t blocks_f = CeilDiv(params.weights.OFM().v, block.output_block_features) * params.groups * block.feature_slm_split;
size_t block_b = params.output.Batch().v;
- auto threads = blocks_w * blocks_h * blocks_f * block_b;
+ auto threads = blocks_w * blocks_h * blocks_d * blocks_f * block_b;
constexpr size_t max_threads_per_cu = 7;
size_t compute_units = params.engineInfo.computeUnitsCount;
size_t max_threads = compute_units * max_threads_per_cu;
return static_cast<float>(threads) / static_cast<float>(max_threads);
}
-float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
- size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_features * (block.feature_slm_split - 1);
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
+ size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
+ block.output_block_features * (block.feature_slm_split - 1);
size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
- // TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
+ // TODO: Actual maximum slm should also depend on number of work-groups, but this is device specific
size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
}
-ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
+ k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
+
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
+ k.EnableGroupedConvolution();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableDilation();
k.DisableTuning();
return k;
}
-KernelsData Convolution_kernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params,
+KernelsData Convolution_kernel_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params,
const optional_params& options) const {
return GetCommonKernelsData(params, options);
}
-JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convolution_params& params,
+JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convolution_params& params,
const DispatchData& kd) const {
auto mem_consts = Parent::GetJitConstants(params, kd);
auto block_params = GetBlockParams(params);
bool unroll_filter_y = block_params.output_block_height != 1;
+ bool unroll_filter_z = block_params.output_block_depth != 1;
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block_params.output_block_width));
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", block_params.input_block_width));
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block_params.output_block_height));
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_HEIGHT", block_params.input_block_height));
+ mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block_params.output_block_depth));
+ mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_DEPTH", block_params.input_block_depth));
mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Y_UNROLL", unroll_filter_y ? params.filterSize.y : 1));
- mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", block_params.output_block_features / simd));
+ mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Z_UNROLL", unroll_filter_z ? params.filterSize.z : 1));
+ mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", static_cast<int>(std::ceil(block_params.output_block_features / simd))));
mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", block_params.output_block_features));
mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", block_params.feature_slm_split));
mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
std::vector<std::string> idx_order = { "out_b", "(out_f + ofb * 16)", "(out_y + oh)", "(out_x + ow)" };
+ if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+ idx_order = { "out_b", "(out_f + ofb * 16)", "(out_z + od)", "(out_y + oh)", "(out_x + ow)" };
+ }
+
std::vector<Tensor::DataChannelName> loop_axes = { Tensor::DataChannelName::X };
+
+ if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+ if (block_params.output_block_depth != 1) {
+ loop_axes.push_back(Tensor::DataChannelName::Z);
+ } else {
+ idx_order[idx_order.size() - 3] = "out_z";
+ }
+ }
+
if (block_params.output_block_height != 1) {
loop_axes.push_back(Tensor::DataChannelName::Y);
} else {
return mem_consts;
} // GetJitConstants
-ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDefault(const convolution_params& params,
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
int) const {
DispatchData kd;
const auto& output = params.output;
+ const auto& weights = params.weights;
auto block_params = GetBlockParams(params);
kd.gws0 = CeilDiv(output.X().v, block_params.output_block_width);
- kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height);
- kd.gws2 = output.Batch().v * CeilDiv(output.Feature().v, block_params.output_block_features) * simd * block_params.feature_slm_split;
+ kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
+ kd.gws2 = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
kd.lws0 = 1;
kd.lws1 = 1;
kd.gemmStyle = {0, 0, 0, 0, 0, 0};
kd.efficiency = FORCE_PRIORITY_2;
- // TODO Optimize 1x1, because this kernel is better in most cases
- //if (params.filterSize.x == 1 && params.filterSize.y == 1)
- // kd.efficiency = FORCE_PRIORITY_1;
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
kd.efficiency = FORCE_PRIORITY_4;
return kd;
} // SetDefault
-bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
+bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
if (!Parent::Validate(params, options)) {
return false;
}
KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
- if (newParams.groups != 1 || newParams.split != 1)
+ if (newParams.split != 1)
return false;
return true;
namespace kernel_selector {
-class Convolution_kernel_b_fs_yx_fsv16_imad : public ConvolutionKernelBase {
+class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
- Convolution_kernel_b_fs_yx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad") {}
- virtual ~Convolution_kernel_b_fs_yx_fsv16_imad() {}
+ Convolution_kernel_b_fs_zyx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_zyx_fsv16_imad") {}
+ virtual ~Convolution_kernel_b_fs_zyx_fsv16_imad() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
- WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
- return WeightsLayout::os_is_yx_osv16_isv16;
+ WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
+ return p.groups > 1 ? WeightsLayout::g_os_is_zyx_osv16_isv16 : WeightsLayout::os_is_zyx_osv16_isv16;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
struct BlockParams {
size_t output_block_width;
size_t output_block_height;
+ size_t output_block_depth;
+
size_t output_block_features;
size_t input_block_width;
size_t input_block_height;
+ size_t input_block_depth;
size_t feature_slm_split;
};
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.h"
#include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
-#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
+#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
Attach<ConvolutionKernel_Ref>();
Attach<DeformableConvolutionKernel_bfyx_Ref>();
- // b_fs_yx_fsv16 int8
+ // b_fs_yx_fsv16 and b_fs_zyx_fsv16 int8
Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
- Attach<Convolution_kernel_b_fs_yx_fsv16_imad>();
+ Attach<Convolution_kernel_b_fs_zyx_fsv16_imad>();
// b_fs_yx_fsv16 and b_fs_zyx_fsv16
Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
+#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
#include "kernel_selector_utils.h"
#define FEATURE_SLICE_SIZE 16
namespace kernel_selector {
-ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+ k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+ k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
return k;
}
-PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(const pooling_params& params) const {
+PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
DispatchData runInfo = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
auto x = out.X().v;
auto y = out.Y().v;
+ auto z = out.Z().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
runInfo.gws0 = x;
- runInfo.gws1 = y;
+ runInfo.gws1 = y * z;
- // we got b_fs_yx_fsv16 format, we process 16 features per workitem
+ // we get b_fs_yx_fsv16 or b_fs_zyx_fsv16 format, we process 16 features per workitem
runInfo.gws2 = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
return runInfo;
}
-JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
+JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
auto jit = PoolingKernelBase::GetJitConstants(params, kd);
const size_t in_x_pitch = FEATURE_SLICE_SIZE;
const size_t in_y_pitch = FEATURE_SLICE_SIZE * params.inputs[0].X().LogicalDimPadded();
+ const size_t in_z_pitch = FEATURE_SLICE_SIZE * params.inputs[0].Y().LogicalDimPadded() * params.inputs[0].X().LogicalDimPadded();
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
+ jit.AddConstant(MakeJitConstant("IN_Z_PITCH", in_z_pitch));
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
if (!params.fused_ops.empty()) {
auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
FusedOpsConfiguration conf = {"", {"b", "(f+i)", "y", "x"}, "pool_result[i]", input_dt, 1};
+ if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+ conf = {"", {"b", "(f+i)", "z", "y", "x"}, "pool_result[i]", input_dt, 1 };
+ }
conf.SetLoopAxes({ Tensor::DataChannelName::FEATURE }, true);
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
return jit;
}
-KernelsData PoolingKernelGPU_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
+KernelsData PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}
-bool PoolingKernelGPU_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
- if (!PoolingKernelBase::Validate(params, options)) {
- return false;
- }
- auto p = dynamic_cast<const pooling_params&>(params);
-
- if (p.inputs[0].Feature().v % FEATURE_SLICE_SIZE != 0)
- return false;
-
- return true;
+bool PoolingKernelGPU_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
+ return PoolingKernelBase::Validate(params, options);
}
} // namespace kernel_selector
#include <vector>
namespace kernel_selector {
-class PoolingKernelGPU_b_fs_yx_fsv16_imad: public PoolingKernelBase{
+class PoolingKernelGPU_b_fs_zyx_fsv16_imad: public PoolingKernelBase{
public:
- PoolingKernelGPU_b_fs_yx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_yx_fsv16_imad") {}
- virtual ~PoolingKernelGPU_b_fs_yx_fsv16_imad() {}
+ PoolingKernelGPU_b_fs_zyx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_zyx_fsv16_imad") {}
+ virtual ~PoolingKernelGPU_b_fs_zyx_fsv16_imad() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
#include "pooling_kernel_gpu_bsv16_fsv16.h"
-#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
+#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
namespace kernel_selector {
Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
Attach<PoolingKernel_b_fs_yx_fsv16>();
Attach<PoolingKernel_bsv16_fsv16>();
- Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
+ Attach<PoolingKernelGPU_b_fs_zyx_fsv16_imad>();
Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
}
+++ /dev/null
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-#include "include/fetch.cl"
-#include "include/imad.cl"
-#include "include/mmad.cl"
-#include "include/data_types.cl"
-
-#define AS_TYPE_N_(type, n, x) as_##type##n(x)
-#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
-#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
-
-#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
-#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
-
-#define SIMD 16
-#define FSV 16
-
-// int8 conv_input and weights data is packed to int32 "batches",
-// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
-__attribute__((intel_reqd_sub_group_size(SIMD)))
-__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
-KERNEL(convolution_gpu_b_fs_yx_fsv16_imad)(
- const __global INPUT0_TYPE *conv_input,
- __global OUTPUT_TYPE *output,
- const __global FILTER_TYPE *weights,
-#if BIAS_TERM
- const __global BIAS_TYPE *biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
- FUSED_OPS_DECLS,
-#endif
- uint split_idx) {
-
- #define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
- const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
- #undef LUT_VALUE_CLAMP
-
- const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
- const uint out_y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT;
- const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
- uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
- uint out_f = out_fg + get_sub_group_local_id();
-
- const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
- const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-#if FEATURE_SLM_SPLIT == 1
- const uint k_start = 0;
-#else
- const uint k_start = get_sub_group_id() * FSV;
-#endif
-
- uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, k_start, 0, 0);
- const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, 16) * FILTER_SIZE_X * FILTER_SIZE_Y * 16);
-
- uint input_start_idx = INPUT0_GET_INDEX(out_b, k_start, input_y, input_x);
-
- ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
- uint4 input_val[IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
-
- __attribute__((opencl_unroll_hint(1)))
- for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16) / FEATURE_SLM_SPLIT; k++) {
- __attribute__((opencl_unroll_hint(1)))
- for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
- // Load input block IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
- __attribute__((opencl_unroll_hint))
- for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
- __attribute__((opencl_unroll_hint))
- for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
- uint input_idx = input_start_idx + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
- if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
- input_val[iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * 16));
- } else {
- input_val[iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * 16));
- }
- }
- }
-
- __attribute__((opencl_unroll_hint))
- for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
- for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
-
- uint4 weights_val[OFM_BLOCKS_PER_SIMD];
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
- weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
- }
-
- __attribute__((opencl_unroll_hint))
- for (uint ive = 0; ive < 4; ive++) {
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
- __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
- const uint ow_offset = ow + OUT_BLOCK_WIDTH;
- const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
- const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
- const uint shuffle_wi = x_block_idx % SIMD;
- const uint shuffle_idx = x_block_idx / SIMD;
-
- dotProd[ofb][oh][ow] = TO_ACCUMULATOR_TYPE(
- IMAD(dotProd[ofb][oh][ow],
- AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[y_block_idx][shuffle_idx][ive], shuffle_wi)),
- AS_FILTER_TYPE_4(weights_val[ofb][ive])));
- }
- }
- }
- }
-
- filter_idx += FSV * FSV;
- }
- }
- input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
- }
- input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
-
- filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * (FEATURE_SLM_SPLIT - 1);
- }
-
-#if FEATURE_SLM_SPLIT != 1
- // Additional local memory reduction for feature split mode
-# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
-# error convolution_gpu_b_fs_yx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
-# endif
-
- const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
- __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
-
- uint sgid_start_idx = get_sub_group_id();
- sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
- __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH
- + get_sub_group_local_id();
-
- if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
- __attribute__((opencl_unroll_hint))
- for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
- if (get_sub_group_id() == wg) {
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < wg; ++ofb) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- const uint partial_acc_ptr_idx =
- ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
- oh * OUT_BLOCK_WIDTH * SIMD +
- ow * SIMD;
- partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
- }
- }
- }
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- dotProd[0][oh][ow] = dotProd[wg][oh][ow];
- }
- }
- __attribute__((opencl_unroll_hint))
- for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- const uint partial_acc_ptr_idx =
- ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OFM_SIZE_PER_SIMD : 0) +
- ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
- oh * OUT_BLOCK_WIDTH * SIMD +
- ow * SIMD;
- partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
- }
- }
- }
- }
- }
- } else {
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- const uint partial_acc_ptr_idx =
- ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
- oh * OUT_BLOCK_WIDTH * SIMD +
- ow * SIMD;
- partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
- }
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
- return;
-
- partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * SIMD + get_sub_group_local_id();
- __attribute__((opencl_unroll_hint))
- for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- const uint partial_acc_ptr_idx =
- wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
- oh * OUT_BLOCK_WIDTH * SIMD +
- ow * SIMD;
- dotProd[0][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
- }
- }
- }
-#endif
-
-#if FEATURE_SLM_SPLIT == 1
-# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
-#else
-# define OFM_VALUES_PER_WI 1
- out_f += get_sub_group_id() * SIMD;
- out_fg += get_sub_group_id() * SIMD;
-#endif
-
-#if BIAS_TERM
- BIAS_TYPE bias[OFM_VALUES_PER_WI];
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
- bias[ofb] = biases[out_f + ofb * SIMD];
- }
-#endif
-
- ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- dequantized[ofb][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][oh][ow]);
-#if BIAS_TERM
- dequantized[ofb][oh][ow] += bias[ofb];
-#endif
- }
- }
- }
-
- OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
- __attribute__((opencl_unroll_hint))
- for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
- FUSED_OPS_PRELOAD_SCALAR;
-#endif
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- __attribute__((opencl_unroll_hint))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
- ACTIVATION_TYPE dequantized_val = dequantized[ofb][oh][ow];
-#if HAS_FUSED_OPS
-# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
- FUSED_OPS_CALC_SCALAR;
-# else
- FUSED_OPS_SCALAR;
-# endif
- result[ofb][oh][ow] = FUSED_OPS_RESULT_SCALAR;
-#else
- result[ofb][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
-#endif
- }
- }
- }
-
- uint dst_index = OUTPUT_GET_INDEX(out_b, out_fg, out_y, out_x);
-
- if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)
- && (OUTPUT_FEATURE_NUM % OFM_BLOCKS_PER_SIMD == 0) ) {
- __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
- for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
- bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
- if (good_of_block) {
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
- if (good_y) {
- uint ow = 0;
- #if OUTPUT_TYPE_SIZE == 1
- __attribute__((opencl_unroll_hint))
- for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
- MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
- __attribute__((opencl_unroll_hint))
- for (uint i = 0; i < 8; ++i) {
- result_val[i] = result[ofb][oh][ow + i];
- }
- DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
- dst_index += 8 * SIMD;
- }
- #endif
- #if OUTPUT_TYPE_SIZE <= 2
- __attribute__((opencl_unroll_hint))
- for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
- MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
- __attribute__((opencl_unroll_hint))
- for (uint i = 0; i < 4; ++i) {
- result_val[i] = result[ofb][oh][ow + i];
- }
- DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
- dst_index += 4 * SIMD;
- }
- #endif
-
- __attribute__((opencl_unroll_hint))
- for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
- MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
- __attribute__((opencl_unroll_hint))
- for (uint i = 0; i < 2; ++i) {
- result_val[i] = result[ofb][oh][ow + i];
- }
- DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
- dst_index += 2 * SIMD;
- }
-
- if (OUT_BLOCK_WIDTH % 2 == 1) {
- OUTPUT_TYPE result_val = result[ofb][oh][ow];
- DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
- dst_index += 1 * SIMD;
- }
- } // if (good_y)
- dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
- } // for (OUT_BLOCK_HEIGHT)
- } // if (good_of_block)
- dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Y_PITCH * FSV * OUT_BLOCK_HEIGHT;
- } // for (OFM_VALUES_PER_WI)
- } else {
- __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
- for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
- bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
- if (good_of_block) {
- const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
- __attribute__((opencl_unroll_hint))
- for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
- bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
- if (good_y) {
- __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
- for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-
-#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
- if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
- break;
-#endif
-
-#if OUTPUT_FEATURE_NUM % SIMD != 0
- if (out_fg + (ofb + 1) * SIMD >= OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % SIMD)
- result[ofb][oh][ow] = (OUTPUT_TYPE)0;
-#endif
- output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV] = result[ofb][oh][ow];
- }
- }
- }
- }
- }
- }
-}
-
-#undef AS_INPUT0_TYPE_4
-#undef AS_TYPE_N
-#undef AS_TYPE_N_
-#undef AS_FILTER_TYPE_4
-
-#undef CEIL_DIV
-#undef ALIGN
-
-#undef SIMD
-#undef FSV
-#undef OFM_VALUES_PER_WI
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+#include "include/data_types.cl"
+
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+#define SIMD 16
+#define FSV 16
+
+// int8 conv_input and weights data is packed to int32 "batches",
+// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
+KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
+ const __global INPUT0_TYPE *conv_input,
+ __global OUTPUT_TYPE *output,
+ const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+ const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+ FUSED_OPS_DECLS,
+#endif
+ uint split_idx) {
+
+ #define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
+ const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
+ #undef LUT_VALUE_CLAMP
+
+ const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = ((uint)get_global_id(1) / CEIL_DIV(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_HEIGHT;
+#if INPUT0_DIMS == 4
+ const uint out_z = 0;
+#else
+    const uint out_z = ((uint)get_global_id(1) % CEIL_DIV(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_DEPTH;
+#endif
+ const uint out_b = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) / FILTER_GROUPS_NUM;
+ const uint g = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) % FILTER_GROUPS_NUM;
+ uint out_f_sg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % (ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD) * FILTER_GROUPS_NUM);
+ uint out_f = out_f_sg + get_sub_group_local_id();
+ uint out_f_g = (out_f % ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD));
+#if FILTER_OFM_NUM % SIMD != 0
+ out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
+#endif
+
+ const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+ const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+ const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
+
+#if FEATURE_SLM_SPLIT == 1
+ const uint k_start = 0;
+#else
+ const uint k_start = get_sub_group_id() * FSV;
+#endif
+
+ uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
+ const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);
+
+#if INPUT0_DIMS == 4
+ uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
+#else
+ uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
+#endif
+
+ ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
+#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ uint in_f_offset = (g * FILTER_IFM_NUM) % FSV;
+#endif
+
+ uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
+
+ __attribute__((opencl_unroll_hint(1)))
+ for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
+ __attribute__((opencl_unroll_hint(1)))
+ for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
+ __attribute__((opencl_unroll_hint(1)))
+ for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
+ // Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
+ __attribute__((opencl_unroll_hint))
+ for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
+ uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
+
+ if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ if (in_f_offset == 0) {
+ input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
+ #else
+ input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
+ #endif
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ } else {
+ INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+ __attribute__((opencl_unroll_hint(FSV)))
+ for (uint v = 0; v < FSV; v++) {
+ if (v + in_f_offset < FSV) {
+ input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
+ } else {
+ input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
+ ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
+ (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
+ (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
+ FSV];
+ }
+ }
+ }
+ #endif
+ } else {
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ if (in_f_offset == 0) {
+ input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
+ #else
+ input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
+ #endif
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
+ } else {
+ INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
+ __attribute__((opencl_unroll_hint(FSV)))
+ for (uint v = 0; v < FSV; v++) {
+ if (v + in_f_offset < FSV) {
+ input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
+ } else {
+ input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
+ ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
+ (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
+ (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
+ FSV];
+ }
+ }
+ }
+ #endif
+ }
+ }
+ }
+ }
+
+ __attribute__((opencl_unroll_hint))
+ for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
+ __attribute__((opencl_unroll_hint))
+ for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
+ __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+ for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
+
+ uint4 weights_val[OFM_BLOCKS_PER_SIMD];
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
+ }
+
+ __attribute__((opencl_unroll_hint))
+ for (uint ive = 0; ive < 4; ive++) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ __attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+ const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+ const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
+ const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
+ const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
+ const uint shuffle_wi = x_block_idx % SIMD;
+ const uint shuffle_idx = x_block_idx / SIMD;
+
+ dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
+ IMAD(dotProd[ofb][od][oh][ow],
+ AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
+ shuffle_wi)),
+ AS_FILTER_TYPE_4(weights_val[ofb][ive])));
+ }
+ }
+ }
+ }
+ }
+
+ filter_idx += FSV * FSV;
+ }
+ }
+ }
+ input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
+ }
+ input_start_idx += DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
+ }
+ input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
+
+ filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
+ }
+
+#if FEATURE_SLM_SPLIT != 1
+ // Additional local memory reduction for feature split mode
+# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
+# error convolution_gpu_b_fs_zyx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
+# endif
+
+ const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
+ __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
+
+ uint sgid_start_idx = get_sub_group_id();
+ sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
+ __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
+ get_sub_group_local_id();
+
+ if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
+ __attribute__((opencl_unroll_hint))
+ for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
+ if (get_sub_group_id() == wg) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < wg; ++ofb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ const uint partial_acc_ptr_idx =
+ ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ oh * OUT_BLOCK_WIDTH * SIMD +
+ ow * SIMD;
+ partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+ }
+ }
+ }
+ }
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
+ }
+ }
+ }
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ const uint partial_acc_ptr_idx =
+ ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
+ ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ oh * OUT_BLOCK_WIDTH * SIMD +
+ ow * SIMD;
+ partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+ }
+ }
+ }
+ }
+ }
+ }
+ } else {
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ const uint partial_acc_ptr_idx =
+ ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ oh * OUT_BLOCK_WIDTH * SIMD +
+ ow * SIMD;
+ partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
+ }
+ }
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
+ return;
+
+ partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
+ __attribute__((opencl_unroll_hint))
+ for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ const uint partial_acc_ptr_idx =
+ wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
+ od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
+ oh * OUT_BLOCK_WIDTH * SIMD +
+ ow * SIMD;
+ dotProd[0][od][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
+ }
+ }
+ }
+ }
+#endif
+
+#if FEATURE_SLM_SPLIT == 1
+# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
+#else
+# define OFM_VALUES_PER_WI 1
+ out_f += get_sub_group_id() * SIMD;
+ out_f_sg += get_sub_group_id() * SIMD;
+#endif
+
+#if BIAS_TERM
+ BIAS_TYPE bias[OFM_VALUES_PER_WI];
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+ bias[ofb] = biases[out_f + ofb * SIMD];
+ }
+#endif
+
+ ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
+#if BIAS_TERM
+ dequantized[ofb][od][oh][ow] += bias[ofb];
+#endif
+ }
+ }
+ }
+ }
+
+ OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
+ __attribute__((opencl_unroll_hint))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
+ FUSED_OPS_PRELOAD_SCALAR;
+#endif
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ __attribute__((opencl_unroll_hint))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
+ ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
+#if HAS_FUSED_OPS
+# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
+ FUSED_OPS_CALC_SCALAR;
+# else
+ FUSED_OPS_SCALAR;
+# endif
+ result[ofb][od][oh][ow] = FUSED_OPS_RESULT_SCALAR;
+#else
+ result[ofb][od][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
+#endif
+ }
+ }
+ }
+ }
+
+#if OUTPUT_DIMS == 4
+ uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_y, out_x);
+#else
+ uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_z, out_y, out_x);
+#endif
+
+#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
+ if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)) {
+ __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
+ bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
+ if (good_of_block) {
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
+ if (good_z) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
+ if (good_y) {
+ uint ow = 0;
+ #if OUTPUT_TYPE_SIZE == 1
+ __attribute__((opencl_unroll_hint))
+ for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
+ MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
+ __attribute__((opencl_unroll_hint))
+ for (uint i = 0; i < 8; ++i) {
+ result_val[i] = result[ofb][od][oh][ow + i];
+ }
+ DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
+ dst_index += 8 * SIMD;
+ }
+ #endif
+ #if OUTPUT_TYPE_SIZE <= 2
+ __attribute__((opencl_unroll_hint))
+ for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
+ MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
+ __attribute__((opencl_unroll_hint))
+ for (uint i = 0; i < 4; ++i) {
+ result_val[i] = result[ofb][od][oh][ow + i];
+ }
+ DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
+ dst_index += 4 * SIMD;
+ }
+ #endif
+
+ __attribute__((opencl_unroll_hint))
+ for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
+ MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
+ __attribute__((opencl_unroll_hint))
+ for (uint i = 0; i < 2; ++i) {
+ result_val[i] = result[ofb][od][oh][ow + i];
+ }
+ DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
+ dst_index += 2 * SIMD;
+ }
+
+ if (OUT_BLOCK_WIDTH % 2 == 1) {
+ OUTPUT_TYPE result_val = result[ofb][od][oh][ow];
+ DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
+ dst_index += 1 * SIMD;
+ }
+ } // if (good_y)
+ dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
+ } // for (OUT_BLOCK_HEIGHT)
+ } // if (good_z)
+ dst_index += OUTPUT_Z_PITCH * FSV - OUTPUT_Y_PITCH * OUT_BLOCK_HEIGHT * FSV;
+ } // for (OUT_BLOCK_DEPTH)
+ } // if (good_of_block)
+ dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Z_PITCH * OUT_BLOCK_DEPTH * FSV;
+ } // for (OFM_VALUES_PER_WI)
+ } else {
+#endif
+ __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
+ for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
+ bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
+ if (good_of_block) {
+ #if OUTPUT_DIMS == 4
+ const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
+ #else
+ const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
+ #endif
+ __attribute__((opencl_unroll_hint))
+ for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
+ bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
+ if (good_z) {
+ __attribute__((opencl_unroll_hint))
+ for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
+ bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
+ if (good_y) {
+ __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+ for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+
+ #if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+ if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+ break;
+ #endif
+
+ if (out_f_g < FILTER_OFM_NUM) {
+ output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV + od * OUTPUT_Z_PITCH * FSV] = result[ofb][od][oh][ow];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
+ }
+#endif
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef AS_FILTER_TYPE_4
+
+#undef CEIL_DIV
+#undef ALIGN
+
+#undef SIMD
+#undef FSV
+#undef OFM_VALUES_PER_WI
/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return output_offset;
}
+// Maps a logical weight coordinate (g, o, i, z, y, x) to a flat element offset
+// for the g_os_is_zyx_osv{osv_size}_isv{isv_size} weights layout:
+// dimension order (outer to inner) is g -> os -> is -> z -> y -> x -> osv -> isv,
+// where os/is are feature-block indices and osv/isv index inside a block.
+inline uint FUNC(get_g_os_is_zyx_osv_isv_index)(uint g, uint o, uint i, uint z, uint y, uint x,
+ uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size)
+{
+ // Split feature indices into block index (os/is) and intra-block offset (osv/isv).
+ const uint isv = i % isv_size;
+ const uint osv = o % osv_size;
+ const uint is = i / isv_size;
+ const uint os = o / osv_size;
+
+ // Pitches per dimension. Block counts for i/o are rounded up (ceil-div) so a
+ // partially filled last feature block still occupies full block storage.
+ const uint x_pitch = osv_size * isv_size;
+ const uint y_pitch = x_pitch * x_size;
+ const uint z_pitch = y_pitch * y_size;
+ const uint is_pitch = z_pitch * z_size;
+ const uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size);
+ const uint g_pitch = os_pitch * ((o_size + osv_size - 1) / osv_size);
+
+ const uint output_offset =
+ isv +
+ osv * isv_size +
+ x * x_pitch +
+ y * y_pitch +
+ z * z_pitch +
+ is * is_pitch +
+ os * os_pitch +
+ g * g_pitch;
+
+ return output_offset;
+}
+
+// Grouped 3D weights, 16x16 output/input feature blocking
+// (g_os_is_zyx_osv16_isv16): thin wrapper over the generic indexer above.
+#define GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, g, o, i, z, y, x) \
+ FUNC_CALL(get_g_os_is_zyx_osv_isv_index)( \
+ g, o, i, z, y, x, \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ CAT(prefix, _SIZE_Z), \
+ CAT(prefix, _IFM_NUM), \
+ CAT(prefix, _OFM_NUM), \
+ 16, \
+ 16)
+
+
#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, 0, y, x, \
16, \
16)
+// Non-grouped 3D weights, 16x16 output/input feature blocking
+// (os_is_zyx_osv16_isv16): delegates to the existing zyx osv/isv indexer.
+#define GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, o, i, z, y, x) \
+ FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
+ o, i, z, y, x, \
+ CAT(prefix, _SIZE_X), \
+ CAT(prefix, _SIZE_Y), \
+ CAT(prefix, _SIZE_Z), \
+ CAT(prefix, _IFM_NUM), \
+ CAT(prefix, _OFM_NUM), \
+ 16, \
+ 16)
+
+
#define GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, z, y, x, \
+++ /dev/null
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-#include "include/data_types.cl"
-
-#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
-
-#define AS_TYPE(type, val) CAT(as_, type)(val)
-#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
-#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
-
-#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
-#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
-
-#define FEATURE_SLICE_SIZE 16
-
-#if MAX_POOLING
- #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
- #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
-#error
-#endif
-
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
- return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
- return tmp + in;
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
-KERNEL(pooling_gpu_b_fs_yx_fsv16)(
- const __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
- , FUSED_OPS_DECLS
-#endif
-)
-{
- const uint x = (uint)get_global_id(0);
- const uint y = (uint)get_global_id(1);
- const uint bf = (uint)get_global_id(2);
- const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
- const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
-
- const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
- const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
- INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
-
-#ifdef CHECK_BOUNDRY
- if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
- offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
- {
- return;
- }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
- uint num_elements = 0;
-#endif
-
- const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
- __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
- for(uint j = 0; j < POOL_SIZE_Y; j++)
- {
- int input_offset_y = offset_y + j;
- bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
- if(!zero_y)
- {
- __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
- for(uint i = 0; i < POOL_SIZE_X; i++)
- {
- int input_offset_x = offset_x + i;
- bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
- if(!zero)
- {
- const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
- int4 int_data = vload4(0, (__global int*)(input + input_idx));
- IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
- {
- result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
- }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
- num_elements++;
-#endif
- }
- }
- }
- }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
- const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
- const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
- const uint num_elements = (hend - offset_y) * (wend - offset_x);
-#endif
-#else // !CHECK_BOUNDRY
- uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
- __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
- for(uint j = 0; j < POOL_SIZE_Y; j++)
- {
- __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
- for(uint i = 0; i < POOL_SIZE_X; i++)
- {
- int4 int_data = vload4(0, (__global int*)(input + input_idx));
- IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
- {
- result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
- }
-
- input_idx += IN_X_PITCH;
- }
- input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
- }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
- ACTIVATION_VEC16 pool_result;
-#if defined AVG_POOLING
-#if ENABLE_ROUND
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
- #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
- #else
- pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
- #endif
- }
-#else
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
- #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
- #else
- pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
- #endif
- }
-#endif // ENABLE_ROUND
-#else // AVG_POOLING
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
- pool_result[i] = result[i];
- }
-#endif // AVG_POOLING
-
- OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
- FUSED_OPS_PRELOAD;
-#endif
-
- __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
- for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
-#if HAS_FUSED_OPS
-#if FUSED_OPS_CAN_USE_PRELOAD
- FUSED_OPS_CALC;
-#else
- FUSED_OPS;
-#endif
- final_result[i] = FUSED_OPS_RESULT;
-#else
- final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
-#endif
- }
-
- const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
-
-#if OUTPUT_TYPE_SIZE == 1
- vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
-#else
- *((__global OUT_VEC16*)(output + output_pos)) = final_result;
-#endif
-}
-
-#undef ALIGN_TO
-#undef AS_TYPE
-#undef IN_VEC16
-#undef OUT_VEC16
-#undef ACTIVATION_VEC16
-#undef TO_ACTIVATION_VEC16
-#undef INIT_VAL
-#undef FEATURE_SLICE_SIZE
--- /dev/null
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+#include "include/data_types.cl"
+
+#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
+
+#define AS_TYPE(type, val) CAT(as_, type)(val)
+#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
+#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
+
+#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
+#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
+
+#define FEATURE_SLICE_SIZE 16
+
+#if MAX_POOLING
+ #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif AVG_POOLING
+ #define INIT_VAL ACCUMULATOR_VAL_ZERO
+#else
+#error
+#endif
+
+// Single accumulation step of the pooling window:
+// MAX pooling keeps the running maximum; AVG pooling keeps a running sum
+// (the caller divides the sum by the element count after the window loop).
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
+{
+#if MAX_POOLING
+ return ACCUMULATOR_MAX_FUNC(tmp, in);
+#elif AVG_POOLING
+ return tmp + in;
+#endif
+}
+
+// 2D/3D pooling kernel for the b_fs_zyx_fsv16 blocked layout.
+// Each work-item computes one output spatial position (x, y, z) for one
+// slice of FEATURE_SLICE_SIZE (16) consecutive feature channels of one batch.
+// Work split: gid(0) = x, gid(1) = y (4D) or z*Y+y (5D), gid(2) = batch*feature-slice.
+// NOTE(review): the vload4/vstore4-of-int paths move 16 bytes, i.e. 16 channels —
+// this assumes a 1-byte INPUT0_TYPE/OUTPUT_TYPE (i8/u8); confirm against the
+// kernel-selector constraints.
+__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
+KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
+ const __global INPUT0_TYPE* input,
+ __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+ , FUSED_OPS_DECLS
+#endif
+)
+{
+ const uint x = (uint)get_global_id(0);
+#if OUTPUT_DIMS == 4
+ // 4D case: no Z dimension, gid(1) is plain Y.
+ const uint y = (uint)get_global_id(1);
+ const uint z = 0;
+#else
+ // 5D case: Z and Y are packed into gid(1).
+ const uint zy = (uint)get_global_id(1);
+ const uint y = zy % OUTPUT_SIZE_Y;
+ const uint z = zy / OUTPUT_SIZE_Y;
+#endif
+ const uint bf = (uint)get_global_id(2);
+ // Unpack batch and (slice-aligned) feature index from gid(2).
+ const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
+ const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
+
+ // True for the last feature slice when INPUT0_FEATURE_NUM is not a multiple
+ // of 16; that slice is partially filled and must be read/written per-channel.
+ const bool last_in_f_group = (f == FEATURE_SLICE_SIZE * ((INPUT0_FEATURE_NUM - 1) / FEATURE_SLICE_SIZE));
+
+ // Top-left-front corner of the pooling window in input coordinates (may be negative due to padding).
+ const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
+ const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
+ const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
+
+ ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
+ INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
+
+#ifdef CHECK_BOUNDRY
+ // Early-out when the whole window lies outside the input volume.
+ if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
+ offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
+ offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
+ {
+ return;
+ }
+
+#ifdef DYNAMIC_KERNEL_DIVIDER
+ // AVG divider counts only in-bounds elements when computed dynamically.
+ uint num_elements = 0;
+#endif
+
+#if INPUT0_DIMS == 4
+ const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+#else
+ const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
+#endif
+ // Walk the pooling window, skipping rows/planes/columns that fall outside the input.
+ __attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
+ for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
+ {
+ int input_offset_z = offset_z + pz;
+ bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
+ if(!zero_z)
+ {
+ __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
+ for(uint py = 0; py < POOL_SIZE_Y; py++)
+ {
+ int input_offset_y = offset_y + py;
+ bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
+ if(!zero_y)
+ {
+ __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
+ for(uint px = 0; px < POOL_SIZE_X; px++)
+ {
+ int input_offset_x = offset_x + px;
+ bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
+ if(!zero)
+ {
+ const uint input_idx = batch_and_feature_offset + input_offset_z*IN_Z_PITCH + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
+ IN_VEC16 ch16_data;
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ // Full slice: load all 16 channels at once (4 x 32-bit = 16 bytes).
+ ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ // Partial last slice: load only the valid channels one by one.
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+ ch16_data[k] = input[input_idx + k];
+ }
+ }
+#endif
+
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
+ {
+ result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+ }
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ // Accumulate only valid channels in the partial slice.
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
+ {
+ result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+ }
+ }
+#endif
+
+ #ifdef DYNAMIC_KERNEL_DIVIDER
+ num_elements++;
+ #endif
+ }
+ }
+ }
+ }
+ }
+ }
+#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
+ // Divider includes padded positions, clipped to padded input extents.
+ const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
+ const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
+ const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
+ const uint num_elements = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
+#endif
+#else // !CHECK_BOUNDRY
+ // No boundary checks needed: window is guaranteed fully inside the input,
+ // so we can stream through it with simple pitch increments.
+#if INPUT0_DIMS == 4
+ uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
+#else
+ uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
+#endif
+ __attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
+ for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
+ {
+ __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
+ for(uint py = 0; py < POOL_SIZE_Y; py++)
+ {
+ __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
+ for(uint px = 0; px < POOL_SIZE_X; px++)
+ {
+ IN_VEC16 ch16_data;
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ // Full slice: vectorized 16-channel load.
+ ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ // Partial last slice: scalar loads of the valid channels.
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+ ch16_data[k] = input[input_idx + k];
+ }
+ }
+#endif
+
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
+ {
+ result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+ }
+#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
+ {
+ result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
+ }
+ }
+#endif
+ // Advance to next column / rewind to next row / next plane via pitches.
+ input_idx += IN_X_PITCH;
+ }
+ input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
+ }
+ input_idx += (IN_Z_PITCH - POOL_SIZE_Y*IN_Y_PITCH);
+ }
+
+#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+ const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
+#endif
+#endif
+
+ // Finalize: AVG divides the accumulated sum (optionally rounding for
+ // quantized output); MAX passes the accumulated maxima through unchanged.
+ ACTIVATION_VEC16 pool_result;
+#if defined AVG_POOLING
+#if ENABLE_ROUND
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
+ #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+ pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
+ #else
+ pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
+ #endif
+ }
+#else
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
+ #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+ pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
+ #else
+ pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
+ #endif
+ }
+#endif // ENABLE_ROUND
+#else // AVG_POOLING
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
+ pool_result[i] = result[i];
+ }
+#endif // AVG_POOLING
+
+ // Apply fused ops (or plain activation) per channel before the store.
+ OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+ FUSED_OPS_PRELOAD;
+#endif
+
+ __attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
+ for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
+#if HAS_FUSED_OPS
+#if FUSED_OPS_CAN_USE_PRELOAD
+ FUSED_OPS_CALC;
+#else
+ FUSED_OPS;
+#endif
+ final_result[i] = FUSED_OPS_RESULT;
+#else
+ final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
+#endif
+ }
+
+#if OUTPUT_DIMS == 4
+ const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
+#else
+ const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x);
+#endif
+
+ // Store: vectorized for full feature slices, per-channel for the partial
+ // last slice (mirrors the load-side tail handling).
+#if OUTPUT_TYPE_SIZE == 1
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+ output[output_pos + k] = final_result[k];
+ }
+ }
+#endif
+#else
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ if (!last_in_f_group) {
+#endif
+ *((__global OUT_VEC16*)(output + output_pos)) = final_result;
+#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
+ } else {
+ __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
+ for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
+ output[output_pos + k] = final_result[k];
+ }
+ }
+#endif
+#endif
+}
+
+#undef ALIGN_TO
+#undef AS_TYPE
+#undef IN_VEC16
+#undef OUT_VEC16
+#undef ACTIVATION_VEC16
+#undef TO_ACTIVATION_VEC16
+#undef INIT_VAL
+#undef FEATURE_SLICE_SIZE
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
+#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV16_ISV16
+ return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, o, i, z, y, x);
+#elif defined INPUT0_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
+ return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, g, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV64_ISV16
return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV16_ISV16
+ return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, o, i, z, y, x);
+#elif defined OUTPUT_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
+ return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, g, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV64_ISV16
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
case WeightsLayout::yxio: return "YXIO";
case WeightsLayout::os_is_yx_isv16_osv16: return "OS_IS_YX_ISV16_OSV16";
case WeightsLayout::os_is_yx_osv16_isv16: return "OS_IS_YX_OSV16_ISV16";
+ case WeightsLayout::os_is_zyx_osv16_isv16: return "OS_IS_ZYX_OSV16_ISV16";
case WeightsLayout::os_is_zyx_osv32_isv16: return "OS_IS_ZYX_OSV32_ISV16";
case WeightsLayout::os_is_zyx_osv64_isv16: return "OS_IS_ZYX_OSV64_ISV16";
case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16";
case WeightsLayout::gs_oi_yxs_gsv32_yxsv4: return "GS_OI_YXS_GSV32_YXSV4";
case WeightsLayout::g_os_is_yx_isv16_osv16: return "G_OS_IS_YX_ISV16_OSV16";
case WeightsLayout::g_os_is_yx_osv16_isv4: return "G_OS_IS_YX_OSV16_ISV4";
+ case WeightsLayout::g_os_is_zyx_osv16_isv16: return "G_OS_IS_ZYX_OSV16_ISV16";
case WeightsLayout::g_os_zyx_is_osv16_isv4: return "G_OS_ZYX_IS_OSV16_ISV4";
case WeightsLayout::g_os_zyx_is_osv16_isv16: return "G_OS_ZYX_IS_OSV16_ISV16";
case WeightsLayout::g_os_zyx_is_osv16_isv32: return "G_OS_ZYX_IS_OSV16_ISV32";
/*
-// Copyright (c) 2016-2018 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// window size spatial Y", filter_size.spatial[1], "First convolution is outside of image. please reduce input
// offset Y");
- if (input_layout.format == format::bfzyx) {
+ if (input_layout.format.spatial_num() == 3) {
// convolution 3D
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
"Stride spatial Z",
// block i8 format
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
+ implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
// MMAD
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
+ implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
+ implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
!((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
_lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16));
// int8/uint8 input
- perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8) &&
- // imad convolution kernel limitation for groups
- (groups == 1 || weights_node.get_output_layout().size.feature[0] % 4 == 0 ||
- groups == static_cast<uint32_t>(input_node.get_output_layout().size.feature[0])) &&
- // no uint8/int8 3D convolution support
- input_node.get_output_layout().format.dimension() == 4;
+ perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8);
if (!perform_opt)
continue;
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
+ conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
+ auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
+ (filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
- auto left_padding = std::max(-input_offset.spatial[0], 0);
- auto top_padding = std::max(-input_offset.spatial[1], 0);
- auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
- auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+ auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
+ auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
+ auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
+ auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
+ auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+ auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
// Adjust right padding, so entire buffer size in X dimension is properly aligned.
// TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
// round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
// right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
- cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
+ cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
p.apply_needed_padding(node, conv_input_node, needed_padding);
}
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
+ auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
+ (filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
- auto left_padding = std::max(-input_offset.spatial[0], 0);
- auto top_padding = std::max(-input_offset.spatial[1], 0);
- auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
- auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+ auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
+ auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
+ auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
+ auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
+ auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
+ auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
- cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
+ cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
p.apply_needed_padding(node, conv_input_node, needed_padding);
/*
-// Copyright (c) 2017-2019 Intel Corporation
+// Copyright (c) 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return "g_os_is_zyx_isv16_osv16";
case format::g_os_is_yx_osv16_isv4:
return "g_os_is_yx_osv16_isv4";
+ case format::g_os_is_zyx_osv16_isv16:
+ return "g_os_is_zyx_osv16_isv16";
case format::g_os_zyx_is_osv16_isv4:
return "g_os_zyx_is_osv16_isv4";
case format::g_os_zyx_is_osv16_isv16:
return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
case format::g_os_is_yx_osv16_isv4:
return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
+ case format::os_is_zyx_osv16_isv16:
+ return kernel_selector::weights_layout::os_is_zyx_osv16_isv16;
+ case format::g_os_is_zyx_osv16_isv16:
+ return kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16;
case format::g_os_zyx_is_osv16_isv4:
return kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4;
case format::g_os_zyx_is_osv16_isv16:
return cldnn::format::g_os_is_zyx_isv16_osv16;
case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
return cldnn::format::g_os_is_yx_osv16_isv4;
+ case kernel_selector::weights_layout::os_is_zyx_osv16_isv16:
+ return cldnn::format::os_is_zyx_osv16_isv16;
+ case kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16:
+ return cldnn::format::g_os_is_zyx_osv16_isv16;
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4:
return cldnn::format::g_os_zyx_is_osv16_isv4;
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv16:
auto ks_x = weights_layout.size.spatial[0];
auto ks_y = weights_layout.size.spatial[1];
+ size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
+ size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
+ if (weights_layout.format.group_num() > 0) {
+ out_features_per_group = weights_layout.size.batch[0];
+ }
+
// Check for non-grouped or depthwise convolution
if (input_layout.size.spatial[2] == 1 &&
- input_layout.size.batch[0] < 16 &&
((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
weights_layout.size.batch[0] >= 16 &&
((conv->groups == 1 && conv->split() == 1) ||
return true;
// Check for grouped convolution
else if (input_layout.size.spatial[2] == 1 && input_layout.size.batch[0] < 16 &&
- weights_layout.size.batch[0] >= 16 &&
- ((input_layout.size.feature[0] / conv->groups) % 4 == 0) &&
- ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) < 16 &&
- (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
- return true;
-
+ out_features_per_group >= 16 &&
+ // Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
+ (in_features_per_group % 4 == 0) &&
+ ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
+ (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
+ return true;
+ // Check for fsv16 imad kernel
+ else if ((input_layout.format.dimension() == 4) &&
+ (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
+ (!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
+ return true;
return false;
}
// A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
(weights_layout.size.batch[0] % 16 == 0 || (weights_layout.size.batch[0] == 8 && conv->groups > 1)) &&
conv->dilation == tensor(1))
return true;
+
+ size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
+ size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
+ if (weights_layout.format.group_num() > 0) {
+ out_features_per_group = weights_layout.size.batch[0];
+ }
+
+ // Check for fsv16 imad kernel
+ if ((input_layout.format.dimension() == 5) &&
+ (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
+ (input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
+ (weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
+ (!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
+ return true;
return false;
}
} else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::b_fs_yx_fsv16;
+ } else if ((_optimization_attributes.b_fs_zyx_fsv16_network &&
+ convolution_b_fs_zyx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+ expected_format = cldnn::format::b_fs_zyx_fsv16;
} else {
expected_format = imad_case(node);
}
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
auto layout = node.get_output_layout();
- if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
+ if (layout.format.spatial_num() == 2 &&
+ (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
layout.size.batch[0] % 16 == 0)
expected = format::b_fs_yx_fsv4;
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
0,
"Input offset in batch is not supported");
- if (input_layout.format == format::bfzyx) {
+ if (input_layout.format.spatial_num() == 3) {
// 3D
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
- "stride spatial Z",
- stride.spatial[1],
- "",
- 0,
- "Stride spatial Z must be positive (>= 1)");
+ "stride spatial Z",
+ stride.spatial[1],
+ "",
+ 0,
+ "Stride spatial Z must be positive (>= 1)");
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
- "window size spatial Z",
- window_size.spatial[2],
- "",
- 0,
- "Size Z (of pooling window) must be positive (>= 1)");
+ "window size spatial Z",
+ window_size.spatial[2],
+ "",
+ 0,
+ "Size Z (of pooling window) must be positive (>= 1)");
+ CLDNN_ERROR_GREATER_THAN(node.id(),
+ "Input offset spatial Z",
+ 2 * input_offset.spatial[2],
+ "input layout size spatial Z",
+ input_layout.size.spatial[2],
+ "Input offset is greater than input data range. There is no input data to process");
CLDNN_ERROR_GREATER_THAN(node.id(),
- "Input offset spatial Z",
- 2 * input_offset.spatial[2],
- "input layout size spatial Z",
- input_layout.size.spatial[2],
- "Input offset is greater than input data range. There is no input data to process");
+ "Negate input offset spatial Z",
+ -input_offset.spatial[2],
+ "input window size spatial Z",
+ window_size.spatial[2],
+ "First pool is outside of image. please reduce input offset Z");
}
if (desc->with_output_size) {
};
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
-VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
- int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
- int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
+VVVF<OutputT> reference_convolve(VVVVF<InputT> &input, VVVVF<WeightsT> &filter,
+ int stride_z, int stride_y, int stride_x,
+ float bias,
+ int dilation_z = 1, int dilation_y = 1, int dilation_x = 1,
+ int input_padding_z = 0, int input_padding_y = 0, int input_padding_x = 0,
+ int output_padding_z = 0, int output_padding_y = 0, int output_padding_x = 0,
+ size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
{
- size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
- size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
- size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
- size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
+ size_t kernel_extent_z = dilation_z * (filter[0].size() - 1) + 1;
+ size_t kernel_extent_y = dilation_y * (filter[0][0].size() - 1) + 1;
+ size_t kernel_extent_x = dilation_x * (filter[0][0][0].size() - 1) + 1;
+
+ size_t output_z = 1 + (input[0].size() - kernel_extent_z + 2 * input_padding_z) / stride_z + 2 * output_padding_z;
+ size_t output_y = 1 + (input[0][0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
+ size_t output_x = 1 + (input[0][0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
+
bool asymm_data = !data_zp.empty();
bool asymm_weights = weights_zp != static_cast<WeightsT>(0);
- VVF<OutputT> output(output_y, VF<OutputT>(output_x, 0));
+ VVVF<OutputT> output(output_z, VVF<OutputT>(output_y, VF<OutputT>(output_x, 0)));
size_t filter_begin = f_begin ? f_begin : 0;
size_t filter_end = f_end ? f_end : filter.size();
for (size_t f = filter_begin; f < filter_end; ++f) {
- for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
- for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
- VF<AccT> values;
- values.reserve(filter[0].size() * filter[0][0].size());
- for (size_t yf = 0; yf < filter[0].size(); ++yf) {
- int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
- bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
- if (!yi_inside) continue;
- for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
- int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
- bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
- if (!xi_inside) continue;
-
- auto input_val = static_cast<AccT>(input[f][yi][xi]);
-
- if (asymm_data) {
- input_val = input_val - static_cast<AccT>(data_zp[f]);
- }
+ for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
+ for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
+ for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
+ VF<AccT> values;
+ values.reserve(filter[0].size() * filter[0][0].size() * filter[0][0][0].size());
+ for (size_t zf = 0; zf < filter[0].size(); ++zf) {
+ int zi = -input_padding_z + (int)zf * dilation_z + stride_z * (int)z;
+ bool zi_inside = zi >= 0 && (int)input[0].size() > zi;
+ if (!zi_inside) continue;
+ for (size_t yf = 0; yf < filter[0][0].size(); ++yf) {
+ int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
+ bool yi_inside = yi >= 0 && (int)input[0][0].size() > yi;
+ if (!yi_inside) continue;
+ for (size_t xf = 0; xf < filter[0][0][0].size(); ++xf) {
+ int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
+ bool xi_inside = xi >= 0 && (int)input[0][0][0].size() > xi;
+ if (!xi_inside) continue;
+
+ auto input_val = static_cast<AccT>(input[f][zi][yi][xi]);
+
+ if (asymm_data) {
+ input_val = input_val - static_cast<AccT>(data_zp[f]);
+ }
- AccT weights_val;
- if (!depthwise && !grouped) {
- weights_val = static_cast<AccT>(filter[f][yf][xf]);
- } else if (grouped) {
- weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
- }
- else {
- weights_val = static_cast<AccT>(filter[0][yf][xf]);
- }
+ AccT weights_val;
+ if (!depthwise && !grouped) {
+ weights_val = static_cast<AccT>(filter[f][zf][yf][xf]);
+ } else if (grouped) {
+ weights_val = static_cast<AccT>(filter[f - filter_begin][zf][yf][xf]);
+ }
+ else {
+ weights_val = static_cast<AccT>(filter[0][zf][yf][xf]);
+ }
- if (asymm_weights) {
- weights_val = weights_val - static_cast<AccT>(weights_zp);
- }
+                        if (asymm_weights) {
+                            weights_val = weights_val - static_cast<AccT>(weights_zp);
+                        }
+
-                    values.push_back(input_val * weights_val);
+                        values.push_back(input_val * weights_val);
+ }
+ }
}
+ output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
}
- output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
}
}
}
- for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
- for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
- output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
+ for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
+ for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
+ for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
+ output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
+ }
}
}
return output;
}
+// 2D-compatibility overload of reference_convolve.
+// Keeps the pre-3D call signature alive for existing tests by lifting the 2D
+// input/filter ([f][y][x]) into 4D tensors with a singleton Z axis
+// ([f][1][y][x]), running the 3D reference implementation, and squeezing the
+// Z axis back out of the result. Z stride/dilation are passed as 1 and Z
+// paddings as 0, so the arithmetic is identical to the old 2D reference code.
+template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
+VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
+                                int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
+                                int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
+                                const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
+{
+    // input[f][y][x] -> input_extended[f][0][y][x] (insert singleton Z axis)
+    VVVVF<InputT> input_extended(input.size(), VVVF<InputT>(1, VVF<InputT>(input[0].size(), VF<InputT>(input[0][0].size(), 0))));
+    for (size_t fi = 0; fi < input.size(); fi++) {
+        for (size_t yi = 0; yi < input[0].size(); yi++) {
+            for (size_t xi = 0; xi < input[0][0].size(); xi++) {
+                input_extended[fi][0][yi][xi] = input[fi][yi][xi];
+            }
+        }
+    }
+
+    // filter[f][y][x] -> filter_extended[f][0][y][x] (insert singleton Z axis)
+    VVVVF<WeightsT> filter_extended(filter.size(), VVVF<WeightsT>(1, VVF<WeightsT>(filter[0].size(), VF<WeightsT>(filter[0][0].size(), 0))));
+    for (size_t fi = 0; fi < filter.size(); fi++) {
+        for (size_t yi = 0; yi < filter[0].size(); yi++) {
+            for (size_t xi = 0; xi < filter[0][0].size(); xi++) {
+                filter_extended[fi][0][yi][xi] = filter[fi][yi][xi];
+            }
+        }
+    }
+
+    // Delegate to the 3D implementation; all Z parameters are identity values
+    // (stride_z = 1, dilation_z = 1, input/output Z padding = 0).
+    VVVF<OutputT> output = reference_convolve<InputT, OutputT, WeightsT, AccT>(input_extended, filter_extended,
+                                                                              1, stride_y, stride_x,
+                                                                              bias,
+                                                                              1, dilation_y, dilation_x,
+                                                                              0, input_padding_y, input_padding_x,
+                                                                              0, output_padding_y, output_padding_x,
+                                                                              f_begin, f_end, depthwise, grouped,
+                                                                              data_zp, weights_zp);
+
+    // Squeeze the singleton Z axis: output[0][y][x] -> output_shrinked[y][x].
+    VVF<OutputT> output_shrinked(output[0].size(), VF<OutputT>(output[0][0].size(), 0));
+
+    for (size_t yi = 0; yi < output[0].size(); yi++) {
+        for (size_t xi = 0; xi < output[0][0].size(); xi++) {
+            output_shrinked[yi][xi] = output[0][yi][xi];
+        }
+    }
+
+    return output_shrinked;
+}
+
template <typename T>
-VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
auto output = input;
- auto size_y = input.size();
- auto size_x = input[0].size();
- for (size_t yi = 0; yi < size_y; ++yi) {
- for (size_t xi = 0; xi < size_x; ++xi) {
- output[yi][xi] = output[yi][xi] * scale + shift;
+ auto size_z = input.size();
+ auto size_y = input[0].size();
+ auto size_x = input[0][0].size();
+ for (size_t zi = 0; zi < size_z; ++zi) {
+ for (size_t yi = 0; yi < size_y; ++yi) {
+ for (size_t xi = 0; xi < size_x; ++xi) {
+ output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
+ }
}
}
return output;
}
+
+// 2D-compatibility overload of reference_scale_post_op.
+// Lifts the 2D input ([y][x]) to 3D with a singleton Z axis ([1][y][x]),
+// applies the 3D per-element scale/shift implementation, and squeezes the
+// result back to 2D, so existing 2D tests keep working unchanged.
+template <typename T>
+VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+    // input[y][x] -> input_extended[0][y][x]
+    VVVF<T> input_extended(1, VVF<T>(input.size(), VF<T>(input[0].size(), 0)));
+    for (size_t yi = 0; yi < input.size(); yi++) {
+        for (size_t xi = 0; xi < input[0].size(); xi++) {
+            input_extended[0][yi][xi] = input[yi][xi];
+        }
+    }
+    VVVF<T> output = reference_scale_post_op<T>(input_extended, scale, shift);
+    // output[0][y][x] -> output_shrinked[y][x] (drop the singleton Z axis)
+    VVF<T> output_shrinked(output[0].size(), VF<T>(output[0][0].size(), 0));
+    for (size_t yi = 0; yi < output[0].size(); yi++) {
+        for (size_t xi = 0; xi < output[0][0].size(); xi++) {
+            output_shrinked[yi][xi] = output[0][yi][xi];
+        }
+    }
+
+    return output_shrinked;
+}
+
void dump_buffer(memory const& mem, std::string const& name)
{
std::ofstream out(name);
for (int f = 0; f < size.feature[0]; ++f)
{
out << "feature " << f << ":\n";
- for (int y = 0; y < size.spatial[1]; ++y)
+ for (int z = 0; z < size.spatial[2]; ++z)
{
- for (int x = 0; x < size.spatial[0]; ++x)
+ for (int y = 0; y < size.spatial[1]; ++y)
{
- size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
- out << ptr[idx] << " ";
+ for (int x = 0; x < size.spatial[0]; ++x)
+ {
+ size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + z * pitches.spatial[2] + y * pitches.spatial[1] + x * pitches.spatial[0];
+ out << ptr[idx] << " ";
+ }
+ out << "\n";
}
- out << "\n";
}
out << "\n";
bool>; // 6 - With bias
using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
- int, // 1 - Input Y size
- int, // 2 - Input features
- int, // 3 - Output features
- int, // 4 - Kernel sizeX
- int, // 5 - Kernel sizeY
- int, // 6 - Groups number
- int, // 7 - Stride
- int, // 8 - Batch
- format>; // 9 - Input data format
+ int, // 1 - Input Y size
+ int, // 2 - Input Z size
+ int, // 3 - Input features
+ int, // 4 - Output features
+ int, // 5 - Kernel sizeX
+ int, // 6 - Kernel sizeY
+ int, // 7 - Kernel sizeZ
+ int, // 8 - Groups number
+ int, // 9 - Stride
+ int, // 10 - Batch
+ format, // 11 - Input data format
+ std::string>; // 12 - Implementation name
struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
{
static std::string PrintToStringParamName(
testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
// construct a readable name
- return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
- std::to_string(testing::get<1>(param_info.param)) + "y" +
- std::to_string(testing::get<2>(param_info.param)) + "f" +
- "_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
- "_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
- std::to_string(testing::get<5>(param_info.param)) + "y" +
- "_groups" + std::to_string(testing::get<6>(param_info.param)) +
- "_stride" + std::to_string(testing::get<7>(param_info.param)) +
- "_batch" + std::to_string(testing::get<8>(param_info.param)) +
- "_format" + std::to_string(testing::get<9>(param_info.param));
+ std::string res = "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
+ std::to_string(testing::get<1>(param_info.param)) + "y" +
+ std::to_string(testing::get<2>(param_info.param)) + "z" +
+ std::to_string(testing::get<3>(param_info.param)) + "f" +
+ "_output" + std::to_string(testing::get<4>(param_info.param)) + "f" +
+ "_filter" + std::to_string(testing::get<5>(param_info.param)) + "x" +
+ std::to_string(testing::get<6>(param_info.param)) + "y" +
+ std::to_string(testing::get<7>(param_info.param)) + "z" +
+ "_groups" + std::to_string(testing::get<8>(param_info.param)) +
+ "_stride" + std::to_string(testing::get<9>(param_info.param)) +
+ "_batch" + std::to_string(testing::get<10>(param_info.param)) +
+ "_format" + std::to_string(testing::get<11>(param_info.param));
+
+ if (testing::get<12>(param_info.param) != "") {
+ res += "_impl_" + testing::get<12>(param_info.param);
+ }
+
+ return res;
}
};
INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
convolution_grouped_gpu,
::testing::Values(
- // Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size Y,
- // Groups number, Stride, Output padding, Batch, Input data format
+ // Input X size, Input Y size, Input Z size, Input features, Output features,
+ // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
+ // Input data format, Implementation name
+
// Format: b_fs_yx_fsv4
- TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv4),
- TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv4),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
+ TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
+
// Format: b_fs_yx_fsv16
- TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv16),
- TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv16)
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
+
+ // Format: b_fs_zyx_fsv16
+ TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, "")
),
convolution_grouped_gpu::PrintToStringParamName);
const int input_x = testing::get<0>(GetParam()),
input_y = testing::get<1>(GetParam()),
- input_f = testing::get<2>(GetParam()),
- output_f = testing::get<3>(GetParam()),
- filter_x = testing::get<4>(GetParam()),
- filter_y = testing::get<5>(GetParam()),
- groups = testing::get<6>(GetParam()),
- stride = testing::get<7>(GetParam()),
- batch_num = testing::get<8>(GetParam()),
+ input_z = testing::get<2>(GetParam()),
+ input_f = testing::get<3>(GetParam()),
+ output_f = testing::get<4>(GetParam()),
+ filter_x = testing::get<5>(GetParam()),
+ filter_y = testing::get<6>(GetParam()),
+ filter_z = testing::get<7>(GetParam()),
+ groups = testing::get<8>(GetParam()),
+ stride = testing::get<9>(GetParam()),
+ batch_num = testing::get<10>(GetParam()),
output_padding = 0,
- input_offset_y = (filter_x - 1) / 2,
- input_offset_x = (filter_y - 1) / 2;
- auto input_data_format = testing::get<9>(GetParam());
-
- auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
- auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
- auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
- auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
- set_values(input, input_rnd_vec);
+ input_offset_z = (filter_z - 1) / 2,
+ input_offset_y = (filter_y - 1) / 2,
+ input_offset_x = (filter_x - 1) / 2;
+ auto input_data_format = testing::get<11>(GetParam());
+ auto impl_name = testing::get<12>(GetParam());
+
+ auto num_in_spatial_dims = input_data_format.spatial_num();
+
+ auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
+ auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
+
+ auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
+ if (num_in_spatial_dims == 2) {
+ input_lay = layout(data_types::u8, format::bfyx, input_size);
+ }
+ std::vector<uint8_t> input_flat(input_lay.get_linear_size());
+ for (int b = 0; b < batch_num; b++)
+ for (int f = 0; f < input_f; f++)
+ for (int z = 0; z < input_z; z++)
+ for (int y = 0; y < input_y; y++)
+ for (int x = 0; x < input_x; x++) {
+ tensor coords = tensor(batch(b), feature(f), spatial(x, y, z, 0));
+ size_t offset = input_lay.get_linear_offset(coords);
+ input_flat[offset] = input_rnd[b][f][z][y][x];
+ }
+ auto input = memory::allocate(engine, input_lay);
+ set_values(input, input_flat);
- auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
- VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
- auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+ auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
+ VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
+ auto weights_lay = layout(data_types::i8, format::goizyx, weights_size);
+ if (num_in_spatial_dims == 2) {
+ weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+ }
std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
for (int gi = 0; gi < groups; ++gi)
for (int ofi = 0; ofi < output_f / groups; ++ofi)
for (int ifi = 0; ifi < input_f / groups; ++ifi)
- for (int kyi = 0; kyi < filter_y; ++kyi)
- for (int kxi = 0; kxi < filter_x; ++kxi) {
- tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
- size_t offset = weights_lay.get_linear_offset(coords);
- weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
- }
- auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
+ for (int kzi = 0; kzi < filter_z; ++kzi)
+ for (int kyi = 0; kyi < filter_y; ++kyi)
+ for (int kxi = 0; kxi < filter_x; ++kxi) {
+ tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, kzi, 0));
+ size_t offset = weights_lay.get_linear_offset(coords);
+ weights_flat[offset] = weights_rnd[gi][ofi][ifi][kzi][kyi][kxi];
+ }
+ auto weights = memory::allocate(engine, weights_lay);
set_values(weights, weights_flat);
- VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
+ VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
// Calculate reference values without bias
for (int bi = 0; bi < batch_num; ++bi)
int f_end = gi * input_f / groups + input_f / groups;
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
- input_rnd[bi], weights_rnd[gi][ofi], // input, weights
- stride, stride, // strides
- 0, // bias
- 1, 1, // dilation
- input_offset_y, input_offset_x, // input padding
- 0, 0, // output_padding
- f_begin, f_end, // f_begin, f_end
- false, // depthwise
- grouped); // grouped
+ input_rnd[bi], weights_rnd[gi][ofi], // input, weights
+ stride, stride, stride, // strides
+ 0, // bias
+ 1, 1, 1, // dilation
+ input_offset_z, input_offset_y, input_offset_x, // input padding
+ 0, 0, 0, // output_padding
+ f_begin, f_end, // f_begin, f_end
+ false, // depthwise
+ grouped); // grouped
}
topology topology(input_layout("input", input.get_layout()),
"input_fsv",
{"weights"},
groups,
- {1, 1, stride, stride},
- {0, 0, -input_offset_x, -input_offset_y},
- {1, 1, 1, 1},
- padding({0, 0, output_padding, output_padding}, 0.f)));
+ tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
+ tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
+ tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
+ padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
build_options options;
options.set_option(build_option::optimize_data(true));
- implementation_desc conv_impl = {input_data_format, "fused_conv_eltwise_gpu_imad"};
+ implementation_desc conv_impl = {input_data_format, impl_name};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
network network(engine, topology, options);
ASSERT_EQ(out_mem.get_layout().format, input_data_format);
ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
- ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
- ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[2], expected_result[0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0][0].size());
for (int bi = 0; bi < batch_num; ++bi)
for (int ofi = 0; ofi < output_f; ++ofi)
- for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
- for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
- tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
- auto offset = out_lay.get_linear_offset(coords);
- auto val = out_ptr[offset];
- auto val_ref = expected_result[bi][ofi][yi][xi];
- auto equal = are_equal(val_ref, val, 1e-2f);
- if (!equal) {
- std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
- std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+ for (int zi = 0; zi < (int)expected_result[0][0].size(); ++zi)
+ for (int yi = 0; yi < (int)expected_result[0][0][0].size(); ++yi)
+ for (int xi = 0; xi < (int)expected_result[0][0][0][0].size(); ++xi) {
+ tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, zi, 0));
+ auto offset = out_lay.get_linear_offset(coords);
+ auto val = out_ptr[offset];
+ auto val_ref = expected_result[bi][ofi][zi][yi][xi];
+ auto equal = are_equal(val_ref, val, 1e-2f);
+ if (!equal) {
+ std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
+ std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+ }
+ EXPECT_TRUE(equal);
}
- EXPECT_TRUE(equal);
- }
}
template <typename InputT, typename WeightsT, typename OutputT>
_acc = max(_acc, val);
}
- output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
+ output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
return static_cast<output_t>(_acc);
}
_acc += static_cast<output_t>(val);
}
- output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
+ output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
return _acc / _cnt;
}
_acc += static_cast<output_t>(val);
}
- output_t get(size_t pool_x, size_t pool_y) {
- return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y));
+ output_t get(size_t pool_x, size_t pool_y, size_t pool_z) {
+ return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y * pool_z));
}
void reset() {
};
template <typename InputT, pooling_mode Mode>
-VVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVF<InputT>& input, size_t pool_x, size_t pool_y, int stride_x, int stride_y, int offset_x, int offset_y) {
+VVVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVVF<InputT>& input, size_t pool_x, size_t pool_y, size_t pool_z, int stride_x, int stride_y, int stride_z, int offset_x, int offset_y, int offset_z) {
using output_t = typename pooling_mode_output<InputT, Mode>::type;
- VVF<output_t> result;
- auto size_x = input[0].size();
- auto size_y = input.size();
+ VVVF<output_t> result;
+ auto size_x = input[0][0].size();
+ auto size_y = input[0].size();
+ auto size_z = input.size();
auto accumulator = pooling_accumulator<InputT, Mode>();
- for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
- VF<output_t> result_row;
- for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
- accumulator.reset();
- for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
- int index_y = yi + fyi;
- if (index_y < 0 || index_y >= static_cast<int>(size_y))
- continue;
- for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
- int index_x = xi + fxi;
- if (index_x < 0 || index_x >= static_cast<int>(size_x))
+ for (int zi = offset_z; zi + static_cast<int>(pool_z) <= static_cast<int>(size_z) - offset_z; zi += stride_z) {
+ VVF<output_t> result_matrix;
+ for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
+ VF<output_t> result_row;
+ for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
+ accumulator.reset();
+ for (int fzi = 0; fzi < static_cast<int>(pool_z); ++fzi) {
+ int index_z = zi + fzi;
+ if (index_z < 0 || index_z >= static_cast<int>(size_z))
continue;
-
- auto input_val = input[static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
- accumulator.accumulate(input_val);
+ for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
+ int index_y = yi + fyi;
+ if (index_y < 0 || index_y >= static_cast<int>(size_y))
+ continue;
+ for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
+ int index_x = xi + fxi;
+ if (index_x < 0 || index_x >= static_cast<int>(size_x))
+ continue;
+
+ auto input_val = input[static_cast<size_t>(index_z)][static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
+ accumulator.accumulate(input_val);
+ }
+ }
}
+ result_row.push_back(accumulator.get(pool_x, pool_y, pool_z));
}
- result_row.push_back(accumulator.get(pool_x, pool_y));
+ result_matrix.emplace_back(std::move(result_row));
}
- result.emplace_back(std::move(result_row));
+ result.emplace_back(std::move(result_matrix));
}
return result;
}
template <typename T>
-VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
+VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
auto output = input;
- auto size_y = input.size();
- auto size_x = input[0].size();
- for (size_t yi = 0; yi < size_y; ++yi) {
- for (size_t xi = 0; xi < size_x; ++xi) {
- output[yi][xi] = output[yi][xi] * scale + shift;
+ auto size_z = input.size();
+ auto size_y = input[0].size();
+ auto size_x = input[0][0].size();
+ for (size_t zi = 0; zi < size_z; ++zi) {
+ for (size_t yi = 0; yi < size_y; ++yi) {
+ for (size_t xi = 0; xi < size_x; ++xi) {
+ output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
+ }
}
}
return output;
using output_t = typename pooling_mode_output<InputT, Mode>::type;
virtual topology build_topology(const engine& /*eng*/) {
- auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
+ auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
auto input_lay = layout(input_type(),
input_format(),
input_size);
pooling("pool",
"input",
pool_mode(),
- tensor(batch(0), feature(0), spatial(pool_x(), pool_y())),
- tensor(batch(0), feature(0), spatial(stride_x(), stride_y())),
- tensor(batch(0), feature(0), spatial(offset_x(), offset_y())))
+ tensor(batch(0), feature(0), spatial(pool_x(), pool_y(), pool_z())),
+ tensor(batch(0), feature(0), spatial(stride_x(), stride_y(), stride_z())),
+ tensor(batch(0), feature(0), spatial(offset_x(), offset_y(), offset_z())))
);
return topo;
}
return "pool";
}
- virtual void run_expect(const VVVVF<output_t>& expected) {
+ virtual void run_expect(const VVVVVF<output_t>& expected) {
+
auto eng = get_test_engine();
auto topo = build_topology(eng);
auto opts = build_options(
);
auto net = network(eng, topo, opts);
- auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
+ auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
auto input_lay = layout(input_type(),
input_format(),
input_size);
std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < input_features(); ++fi)
- for (size_t yi = 0; yi < input_y(); ++yi)
- for (size_t xi = 0; xi < input_x(); ++xi) {
- tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
- size_t offset = input_lay.get_linear_offset(coords);
- input_flat[offset] = _input[bi][fi][yi][xi];
- }
+ for (size_t zi = 0; zi < input_z(); ++zi)
+ for (size_t yi = 0; yi < input_y(); ++yi)
+ for (size_t xi = 0; xi < input_x(); ++xi) {
+ tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
+ size_t offset = input_lay.get_linear_offset(coords);
+ input_flat[offset] = _input[bi][fi][zi][yi][xi];
+ }
set_values(input_mem, input_flat);
net.set_input_data("input", input_mem);
ASSERT_EQ(out_lay.data_type, output_type());
ASSERT_EQ(out_lay.size.batch[0], expected.size());
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
- ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
- ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[2], expected[0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[1], expected[0][0][0].size());
+ ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0][0].size());
bool compare_with_tolerance = input_type() == data_types::f16;
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < expected[0].size(); ++fi)
- for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
- for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
- tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
- size_t offset = out_lay.get_linear_offset(coords);
- auto ref_val = static_cast<float>(expected[bi][fi][yi][xi]);
- auto actual_val = static_cast<float>(out_ptr[offset]);
- if (compare_with_tolerance) {
- auto tolerance = 1;
- ASSERT_NEAR(ref_val, actual_val, tolerance)
- << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
- } else {
- EXPECT_TRUE(are_equal(ref_val, actual_val))
- << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+ for (size_t zi = 0; zi < expected[0][0].size(); ++zi)
+ for (size_t yi = 0; yi < expected[0][0][0].size(); ++yi)
+ for (size_t xi = 0; xi < expected[0][0][0][0].size(); ++xi) {
+ tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
+ size_t offset = out_lay.get_linear_offset(coords);
+ auto ref_val = static_cast<float>(expected[bi][fi][zi][yi][xi]);
+ auto actual_val = static_cast<float>(out_ptr[offset]);
+ if (compare_with_tolerance) {
+ auto tolerance = 1;
+ ASSERT_NEAR(ref_val, actual_val, tolerance)
+ << "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
+ } else {
+ EXPECT_TRUE(are_equal(ref_val, actual_val))
+ << "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
+ }
}
- }
-
}
size_t batch_num() { return _input.size(); }
size_t input_features() { return _input[0].size(); }
- size_t input_x() { return _input[0][0][0].size(); }
- size_t input_y() { return _input[0][0].size(); }
+ size_t input_x() { return _input[0][0][0][0].size(); }
+ size_t input_y() { return _input[0][0][0].size(); }
+ size_t input_z() { return _input[0][0].size(); }
format::type input_format() { return _input_fmt; }
data_types input_type() {
pooling_mode pool_mode() { return Mode; }
size_t pool_x() { return _pool_x; }
size_t pool_y() { return _pool_y; }
+ size_t pool_z() { return _pool_z; }
int stride_x() { return _stride_x; }
int stride_y() { return _stride_y; }
+ int stride_z() { return _stride_z; }
int offset_x() { return _offset_x; }
int offset_y() { return _offset_y; }
+ int offset_z() { return _offset_z; }
- void set_input(format::type input_fmt, VVVVF<InputT> input_data) {
+ void set_input(format::type input_fmt, VVVVVF<InputT> input_data) {
_input_fmt = input_fmt;
_input = std::move(input_data);
}
- void set_pool_size(size_t x, size_t y) {
+ void set_pool_size(size_t x, size_t y, size_t z) {
_pool_x = x;
_pool_y = y;
+ _pool_z = z;
}
- void set_strides(int x, int y) {
+ void set_strides(int x, int y, int z) {
_stride_x = x;
_stride_y = y;
+ _stride_z = z;
}
- void set_offsets(int x, int y) {
+ void set_offsets(int x, int y, int z) {
_offset_x = x;
_offset_y = y;
+ _offset_z = z;
}
- VVVVF<InputT> _input;
+ VVVVVF<InputT> _input;
format::type _input_fmt;
- size_t _pool_x, _pool_y;
- int _stride_x, _stride_y;
- int _offset_x, _offset_y;
+ size_t _pool_x, _pool_y, _pool_z;
+ int _stride_x, _stride_y, _stride_z;
+ int _offset_x, _offset_y, _offset_z;
};
using pooling_random_test_params = std::tuple<
- size_t, // batch
- size_t, // features
- std::tuple<size_t, size_t>, // input x, y
- std::tuple<size_t, size_t>, // pool x, y
- std::tuple<int, int>, // stride x, y
- std::tuple<int, int>, // offset x, y
- format::type // input format
+ size_t, // batch
+ size_t, // features
+ std::tuple<size_t, size_t, size_t>, // input x, y, z
+ std::tuple<size_t, size_t, size_t>, // pool x, y, z
+ std::tuple<int, int, int>, // stride x, y, z
+ std::tuple<int, int, int>, // offset x, y, z
+ format::type // input format
>;
template <typename InputT, pooling_mode Mode>
using parent = pooling_test_base<InputT, Mode>;
using output_t = typename parent::output_t;
- virtual VVVVF<output_t> calculate_reference() {
- VVVVF<output_t> reference(this->batch_num(), VVVF<output_t>(this->input_features()));
+ virtual VVVVVF<output_t> calculate_reference() {
+ VVVVVF<output_t> reference(this->batch_num(), VVVVF<output_t>(this->input_features()));
for (size_t bi = 0; bi < this->batch_num(); ++bi) {
for (size_t fi = 0; fi < this->input_features(); ++fi) {
reference[bi][fi] = reference_pooling<InputT, Mode>(
this->_input[bi][fi],
this->pool_x(),
this->pool_y(),
+ this->pool_z(),
this->stride_x(),
this->stride_y(),
+ this->stride_z(),
this->offset_x(),
- this->offset_y());
+ this->offset_y(),
+ this->offset_z());
}
}
return reference;
}
virtual void param_set_up(const pooling_random_test_params& params) {
- size_t b, f, in_x, in_y, p_x, p_y;
- int s_x, s_y, o_x, o_y;
+ size_t b, f, in_x, in_y, in_z, p_x, p_y, p_z;
+ int s_x, s_y, s_z, o_x, o_y, o_z;
format::type in_fmt;
std::forward_as_tuple(
b,
f,
- std::forward_as_tuple(in_x, in_y),
- std::forward_as_tuple(p_x, p_y),
- std::forward_as_tuple(s_x, s_y),
- std::forward_as_tuple(o_x, o_y),
+ std::forward_as_tuple(in_x, in_y, in_z),
+ std::forward_as_tuple(p_x, p_y, p_z),
+ std::forward_as_tuple(s_x, s_y, s_z),
+ std::forward_as_tuple(o_x, o_y, o_z),
in_fmt
) = params;
- auto input_data = generate_random_4d<InputT>(b, f, in_y, in_x, -256, 256);
+ auto input_data = generate_random_5d<InputT>(b, f, in_z, in_y, in_x, -256, 256);
this->set_input(in_fmt, std::move(input_data));
- this->set_pool_size(p_x, p_y);
- this->set_strides(s_x, s_y);
- this->set_offsets(o_x, o_y);
+ this->set_pool_size(p_x, p_y, p_z);
+ this->set_strides(s_x, s_y, s_z);
+ this->set_offsets(o_x, o_y, o_z);
}
void run_random(const pooling_random_test_params& params) {
}
INSTANTIATE_TEST_CASE_P(
- smoke_low_precision,
+ smoke_low_precision_2d_spatial,
pooling_random_test,
testing::Combine(testing::Values(1, 2),
testing::Values(3, 8, 64),
- testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
- testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
- testing::Values(std::tuple<int, int>(2, 2)),
- testing::Values(std::tuple<int, int>(0, 0)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
+ testing::Values(std::tuple<int, int, int>(2, 2, 1)),
+ testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf_af32,
testing::internal::DefaultParamName<pooling_random_test_params>);
INSTANTIATE_TEST_CASE_P(
+ smoke_low_precision_3d_spatial,
+ pooling_random_test,
+ testing::Combine(testing::Values(1, 2),
+ testing::Values(3, 8, 64),
+ testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 12), std::tuple<size_t, size_t, size_t>(24, 24, 24)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 4), std::tuple<size_t, size_t, size_t>(2, 2, 2)),
+ testing::Values(std::tuple<int, int, int>(2, 2, 2)),
+ testing::Values(std::tuple<int, int, int>(0, 0, 0)),
+ testing::Values(format::bfzyx,
+ format::b_fs_zyx_fsv16)),
+ testing::internal::DefaultParamName<pooling_random_test_params>);
+
+INSTANTIATE_TEST_CASE_P(
batched_low_precision,
pooling_random_test,
testing::Combine(
testing::Values(16),
testing::Values(16, 32),
- testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
- testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
- testing::Values(std::tuple<int, int>(1, 1)),
- testing::Values(std::tuple<int, int>(0, 0)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(3, 3, 1), std::tuple<size_t, size_t, size_t>(8, 8, 1)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(1, 1, 1), std::tuple<size_t, size_t, size_t>(3, 3, 1)),
+ testing::Values(std::tuple<int, int, int>(1, 1, 1)),
+ testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::bs_fs_yx_bsv16_fsv16)
),
testing::internal::DefaultParamName<pooling_random_test_params>);
topology build_topology(const engine& eng) override {
topology topo = parent::build_topology(eng);
- auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
+ auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1, 1, 1)));
auto scale_mem = memory::allocate(eng, scale_lay);
auto shift_mem = memory::allocate(eng, scale_lay);
set_values(scale_mem, _scale);
return "scale_wa_out";
}
- VVVVF<output_t> calculate_reference() override {
+ VVVVVF<output_t> calculate_reference() override {
auto expected = parent::calculate_reference();
for (size_t bi = 0; bi < this->batch_num(); ++bi)
pooling_random_test_fp16_fp32,
testing::Combine(testing::Values(1, 2),
testing::Values(3, 8),
- testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
- testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
- testing::Values(std::tuple<int, int>(2, 2)),
- testing::Values(std::tuple<int, int>(0, 0)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
+ testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
+ testing::Values(std::tuple<int, int, int>(2, 2, 1)),
+ testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf,