tensor get_pitches() const {
auto sizes = get_buffer_size().sizes(format);
- if (format == format::byxf_af32) {
- sizes[3] = align_to(sizes[3], 32);
- }
-
- if (format == format::byx8_f4) {
- sizes[3] = align_to(sizes[3], 4);
- sizes[2] = align_to(sizes[2], 8);
- }
std::vector<tensor::value_type> pitches(sizes.size(), tensor::value_type(1));
std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies<tensor::value_type>());
return {format, pitches};
sizes[block_axis] = align_to(sizes[block_axis], block_size);
}
- if (this->format == cldnn::format::bf8_xy16 && !(is_aligned_to(sizes[1], 8) && is_aligned_to(sizes[2] * sizes[3], 16))) {
- sizes[3] = align_to(sizes[2] * sizes[3], 16);
- sizes[2] = 1;
- } else if (this->format == cldnn::format::byxf_af32 && !(is_aligned_to(sizes[1], 32))) {
- sizes[1] = align_to(sizes[1], 32);
- } else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8))) {
- sizes[1] = align_to(sizes[1], 4);
- sizes[2] = align_to(sizes[2], 8);
- } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
+ if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
sizes[0] = align_to(sizes[0], 8);
sizes[1] = align_to(sizes[1], 32);
} else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
bs_fs_zyx_bsv16_fsv16, ///< format used for 3D blocked convolution (batch and features blocked by 16)
bs_fs_yx_bsv16_fsv16, ///< format used for 2D blocked convolution (batch and features blocked by 16)
fs_b_yx_fsv32, ///< format for input for fp16 primitives
- fs_bs_yx_bsv4_fsv32, ///< format for batched input for primitives using MMAD
b_fs_yx_fsv4, ///< format for input for IMAD convolutions
bs_xs_xsv8_bsv8, ///< format used only for fully connected weights: bs - batch slice,
///< xs - x slice, bsv8 - 8 values of single slice.
bs_x_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
///< \n \image html bs_x_bsv16.jpg
- byxf_af32, ///< format for input for primitives using MMAD
- byx8_f4, ///< format for input for MMAD convolutions
- bf8_xy16, ///< format used only for convolution 1x1 input, xy aligned to 16, f aligned to 8
- ///< \n \image html bf8_xy16.jpg
b_fs_yx_32fp, ///< format for data for binary convolutions
winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
nv12, ///< format for media nv12 input
{ bs_xs_xsv8_bsv8, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 8}}}},
{ bs_xs_xsv8_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 16}}}},
{ bs_x_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{0, 16}}}},
- { bf8_xy16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 8}}}},
{ winograd_2x3_s1_data, { 1, 1, 2, 0, 0, "bxyf", "bfxy?", {}}},
- { byxf_af32, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
- { byx8_f4 , { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
- { fs_bs_yx_bsv4_fsv32, { 1, 1, 2, 0, 0, "fbyx", "bfxy?", {{0, 4}, {1, 32}}}},
{ b_fs_yx_fsv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 4}}}},
{ bfzyx, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {}}},
{ bfwzyx, { 1, 1, 4, 0, 0, "bfwzyx", "bfxyzw", {}}},
adjusted_coords[external_axis] /= block_size;
}
- if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[3], 32))) {
- my_sizes[3] = align_to(my_sizes[3], 32);
- } else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[3], 4)) || !(is_aligned_to(my_sizes[2], 8)))) {
- my_sizes[3] = align_to(my_sizes[3], 4);
- my_sizes[2] = align_to(my_sizes[2], 8);
- } else if (fmt == cldnn::format::bf8_xy16) {
- // Special case of blocked format, where xy is treated as one flattened dimension
- auto flat_xy = adjusted_coords[3] + adjusted_coords[2] * my_sizes[3];
-
- my_sizes.push_back(16);
- my_sizes[3] = ceil_div(my_sizes[2] * my_sizes[3], 16);
- my_sizes[2] = 1;
-
- adjusted_coords.push_back(flat_xy % 16);
- adjusted_coords[3] = flat_xy / 16;
- adjusted_coords[2] = 0;
- } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
+ if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
!(is_aligned_to(my_sizes[0], 8)) &&
!(is_aligned_to(my_sizes[1], 32))) {
my_sizes[0] = align_to(my_sizes[0], 8);
{ DataLayout::bs_fs_yx_bsv16_fsv16, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::bs_f_bsv8__af8, { -1, -1, -1, -1, 0, 1 } },
{ DataLayout::bs_f_bsv16__af8, { -1, -1, -1, -1, 0, 1 } },
- { DataLayout::bf8_xy16, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::winograd_2x3_s1_data, { 2, 1, -1, -1, 0, 3 } },
- { DataLayout::byxf_af32, { 1, 2, -1, -1, 0, 3 } },
- { DataLayout::byx8_f4, { 1, 2, -1, -1, 0, 3 } },
- { DataLayout::fs_bs_yx_bsv4_fsv32, { 0, 1, -1, -1, 3, 2 } },
{ DataLayout::b_fs_yx_fsv4, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::bfzyx, { 0, 1, 2, -1, 3, 4 } },
{ DataLayout::fs_b_yx_fsv32, { 0, 1, -1, -1, 3, 2 } },
assert(newDims.size() == 5);
newDims[3] = RoundUp(newDims[3], 32);
break;
- case bf8_xy16:
- assert(newDims.size() == 4);
- newDims[1] = RoundUp(newDims[1], 8);
- newDims[3] = RoundUp(newDims[2] * newDims[3], 16);
- newDims[2] = 1;
- break;
- case byxf_af32:
- assert(newDims.size() == 4);
- newDims[0] = RoundUp(newDims[0], 32);
- break;
- case byx8_f4:
- assert(newDims.size() == 4);
- newDims[0] = RoundUp(newDims[0], 4);
- newDims[1] = RoundUp(newDims[1], 8);
- break;
- case fs_bs_yx_bsv4_fsv32:
- assert(newDims.size() == 4);
- newDims[3] = RoundUp(newDims[3], 32);
- newDims[2] = RoundUp(newDims[2], 4);
- break;
case b_fs_yx_32fp:
assert(newDims.size() == 4);
newDims[3] = RoundUp(newDims[3], 32);
pitch *= newDims[i];
}
- if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) {
- ret[0].pitch = 1;
- ret[1].pitch = ret[0].pitch * newDims[0];
- ret[2].pitch = ret[1].pitch * newDims[1];
- ret[3].pitch = ret[2].pitch * newDims[2];
- ret[4].pitch = ret[3].pitch * newDims[3];
- }
-
return ret;
}
bs_fs_zyx_bsv16_fsv16, // batch, feature, 3D spatial. Blocks of 16 batch and channels
bs_f_bsv8__af8, // for optimized FC
bs_f_bsv16__af8, // for optimized FC
- bf8_xy16, // for optimized conv1x1
winograd_2x3_s1_data, // winograd convolution input, F(2,3) -- filter 3x3 with stride 1
- byxf_af32, // for MMAD convolution
- byx8_f4, // for MMAD convolution
- fs_bs_yx_bsv4_fsv32, // for batched MMAD
b_fs_yx_fsv4, // reordering format for swizzled input for convolution using IMAD
bfzyx, // batch+feature+3D spatial
fs_b_yx_fsv32, // for FP16 kernels, 32 features to avoid partial writes
k.EnableInputLayout(DataLayout::byxf);
k.EnableInputLayout(DataLayout::fyxb);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
- k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::fyxb);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableTensorOffset();
k.EnableOutputDataType(Datatype::F32);
k.EnableInputWeightsType(WeightsType::F16);
k.EnableInputWeightsType(WeightsType::F32);
- k.EnableInputLayout(DataLayout::bf8_xy16);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
- k.EnableOutputLayout(DataLayout::bf8_xy16);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
in_fsv = 4;
else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
in_fsv = 16;
- else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
- in_fsv = 32;
mem_consts.AddConstants({
MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),
+++ /dev/null
-/*
-// Copyright (c) 2019-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_imad_byxf_af32_1x1.h"
-
-static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
- for (int tile_len = 14; tile_len > 0; tile_len--) {
- // Kernel writes 32 output features per HW thread
- size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
- // Chose largest valid tile with enough HW threads
- if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
- return tile_len;
- }
- }
- return 1;
-}
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_imad_byxf_af32_1x1::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::UINT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
- k.EnableDifferentTypes();
- k.EnableDifferentInputWeightsTypes();
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableDilation();
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-bool ConvolutionKernel_imad_byxf_af32_1x1::Validate(const Params& p, const optional_params& o) const {
- if (!Parent::Validate(p, o)) {
- return false;
- }
-
- const auto& params = static_cast<const convolution_params&>(p);
-
- if (params.filterSize.x != 1 || params.filterSize.y != 1)
- return false;
-
- if (params.padding.x != 0 || params.padding.y != 0)
- return false;
-
- if (params.output.Feature().v % 32 != 0)
- return false;
-
- const auto& input = params.inputs[0];
-
- // we do not support padded input
- if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0)
- return false;
-
- if (params.split != 1)
- return false;
-
- if (params.groups != 1)
- return false;
-
- return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_1x1::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = Parent::SetDefault(arg);
-
- // Sub-group size
- constexpr size_t sub_group_size = 8;
-
- const auto of_maps = arg.output.Feature().v;
- const size_t of_maps_per_batch = RoundUp(of_maps, 32);
- const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v;
-
- // Need to have at least 4 HW threads per EU
- const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4);
- runInfo.cldnnStyle.blockWidth = tile_length;
-
- runInfo.efficiency = FORCE_PRIORITY_1;
-
- runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length;
- runInfo.gws1 = of_maps_total / 4; // TILE_DEPTH==4
- runInfo.gws2 = 1;
-
- runInfo.lws0 = 1;
- runInfo.lws1 = sub_group_size;
- runInfo.lws2 = 1;
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_imad_byxf_af32_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4));
-
- jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
-
- if (!params.fused_ops.empty()) {
- auto input_dt = GetActivationType(params);
- FusedOpsConfiguration conf_scalar = {"", {"b", "f2", "y", "(x+i)"}, "res", input_dt, 1 };
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
- }
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_imad_byxf_af32_1x1::GetKernelsData(const Params& params,
- const optional_params& options) const {
- return GetTunedKernelsDataByIndex(params, options);
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_imad_byxf_af32_1x1 : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_imad_byxf_af32_1x1() : ConvolutionKernelBase("fused_conv_eltwise_gpu_af32_imad_1x1") {}
- virtual ~ConvolutionKernel_imad_byxf_af32_1x1() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- bool Validate(const Params& p, const optional_params& o) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_osv32_isv32_swizzled_by_4;
- }
- std::vector<FusedOpType> GetSupportedFusedOps() const override {
- return { FusedOpType::ELTWISE,
- FusedOpType::QUANTIZE,
- FusedOpType::SCALE,
- FusedOpType::ACTIVATION };
- }
-};
-} // namespace kernel_selector
+++ /dev/null
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_imad_byxf_af32_depthwise.h"
-
-#define SIMD_SIZE 16
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_imad_byxf_af32_depthiwise::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableInputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::F32);
- k.EnableOutputDataType(Datatype::F16);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableDilation();
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableSplitSupport();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.EnableDepthwiseSeparableOpt();
- k.EnableDifferentTypes();
- k.EnableDifferentInputWeightsTypes();
- k.DisableTuning();
- k.EnableGroupedConvolution();
- return k;
-}
-
-static size_t GetTileLength(size_t out_x) {
- for (int i = 20; i >= 1; i--) {
- if (out_x % i == 0)
- return i;
- }
- return 1;
-}
-
-static int GetSplit(size_t out_x, int stride) {
- if (out_x >= 75) {
- if (stride > 1)
- return 1;
- else
- return 3;
- }
-
- if (out_x == 38 && stride == 2)
- return 2;
-
- if (out_x < 75) {
- if (stride > 1)
- return 1;
- else if (out_x % 2 == 0)
- return 2;
- }
- return 1;
-}
-
-bool ConvolutionKernel_imad_byxf_af32_depthiwise::Validate(const Params& p, const optional_params& o) const {
- if (!Parent::Validate(p, o)) {
- return false;
- }
-
- const convolution_params& cp = static_cast<const convolution_params&>(p);
- if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups || cp.groups == 1) {
- return false;
- }
-
- return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_depthiwise::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = Parent::SetDefault(arg);
-
- runInfo.efficiency = FORCE_PRIORITY_1;
-
- runInfo.gws0 = Align(arg.output.Feature().v, SIMD_SIZE) * arg.output.Batch().v;
- runInfo.gws1 = arg.output.X().v / GetTileLength(arg.output.X().v);
- runInfo.gws2 = CeilDiv(arg.output.Y().v, GetSplit(arg.output.Y().v, arg.stride.y));
-
- std::vector<size_t> local = { SIMD_SIZE, 1, 1 };
-
- runInfo.lws0 = local[0];
- runInfo.lws1 = local[1];
- runInfo.lws2 = local[2];
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_imad_byxf_af32_depthiwise::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("ALIGNED_OFM", Align(params.output.Feature().v, SIMD_SIZE)));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", GetTileLength(params.output.X().v)));
- jit.AddConstant(MakeJitConstant("SPLIT_Y", GetSplit(params.output.Y().v, params.stride.y)));
- jit.AddConstant(MakeJitConstant("SIMD_SIZE", SIMD_SIZE));
-
- if (params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y) != 0)
- jit.AddConstant(MakeJitConstant("SPLIT_LEFTOVERS", params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y)));
-
- if (!params.fused_ops.empty()) {
- auto input_dt = GetActivationType(params);
- FusedOpsConfiguration conf_scalar = {"", {"b", "of", "(y+m)", "(x+l)"}, "res", input_dt, 1 };
- conf_scalar.SetLoopAxes({Tensor::DataChannelName::Y, Tensor::DataChannelName::X});
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
- }
-
- return jit;
-}
-
-
-KernelsData ConvolutionKernel_imad_byxf_af32_depthiwise::GetKernelsData(const Params& params,
- const optional_params& options) const {
- KernelsData kd = GetTunedKernelsDataByIndex(params, options);
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_1;
- return kd;
-}
-
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_imad_byxf_af32_depthiwise : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_imad_byxf_af32_depthiwise() : ConvolutionKernelBase("convolution_gpu_byxf_af32_depthwise") {}
- virtual ~ConvolutionKernel_imad_byxf_af32_depthiwise() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
- bool Validate(const Params& p, const optional_params& o) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::goiyx;
- }
- std::vector<FusedOpType> GetSupportedFusedOps() const override {
- return { FusedOpType::ELTWISE,
- FusedOpType::QUANTIZE,
- FusedOpType::SCALE,
- FusedOpType::ACTIVATION };
- }
-};
-} // namespace kernel_selector
+++ /dev/null
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableInputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::UINT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableDilation();
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableSplitSupport();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.EnableDifferentInputWeightsTypes();
- k.DisableTuning();
- k.EnableDifferentTypes();
- return k;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad::SetDefault(const convolution_params& arg, int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- constexpr size_t sub_group_size = 8;
-
- const auto of_maps = arg.output.Feature().v;
- const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
- runInfo.efficiency = FORCE_PRIORITY_4;
-
- runInfo.gws0 = arg.output.X().v;
- runInfo.gws1 = arg.output.Y().v;
- runInfo.gws2 = of_threads_per_batch * arg.output.Batch().v;
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
- if (!params.fused_ops.empty()) {
- auto input_dt = GetActivationType(params);
- FusedOpsConfiguration conf_scalar = {"", {"b", "f", "y", "x"}, "res", input_dt, 1 };
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
- }
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad::GetKernelsData(const Params& params, const optional_params& options) const {
- KernelsData kd = GetTunedKernelsDataByIndex(params, options);
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_4;
- return kd;
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_mmad() : ConvolutionKernelBase("convolution_gpu_mmad") {}
- virtual ~ConvolutionKernel_mmad() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
- }
- std::vector<FusedOpType> GetSupportedFusedOps() const override {
- return { FusedOpType::ELTWISE,
- FusedOpType::QUANTIZE,
- FusedOpType::SCALE,
- FusedOpType::ACTIVATION };
- }
-};
-} // namespace kernel_selector
+++ /dev/null
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableDilation();
- k.EnableBiasPerFeature();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableSplitSupport();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- constexpr size_t sub_group_size = 8;
-
- const auto of_maps = arg.output.Feature().v;
- const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
- runInfo.efficiency = FORCE_PRIORITY_6;
-
- runInfo.gws0 = arg.output.X().v;
- runInfo.gws1 = arg.output.Y().v;
- runInfo.gws2 = of_threads_per_batch * ((arg.output.Batch().v + 3) / 4);
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- const size_t in_x_pitch = 32 * 4;
- const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
- const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
- const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
- const size_t in_offset =
- in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
- jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
- jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
- KernelsData kd = GetTunedKernelsDataByIndex(params, options);
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_6;
- return kd;
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_mmad_batched() : ConvolutionKernelBase("convolution_gpu_mmad_batched") {}
- virtual ~ConvolutionKernel_mmad_batched() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
- }
-};
-} // namespace kernel_selector
\ No newline at end of file
+++ /dev/null
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched_block.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBiasPerFeature();
- k.EnableBatching();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-struct block_params {
- int32_t out_width;
- int32_t out_height;
- int32_t out_depth;
-};
-
-static block_params get_out_block_size(const convolution_params& p) {
- if (p.filterSize.x == 3 && p.filterSize.y == 3) {
- if (p.output.X().v == 7)
- return {7, 1, 4};
- else if (p.output.X().v == 14)
- return {7, 1, 4};
- else if (p.output.X().v == 28)
- return {7, 1, 4};
- else if (p.output.X().v == 56)
- return {8, 1, 4};
- }
-
- return {1, 1, 1};
-}
-
-WeightsLayout ConvolutionKernel_mmad_batched_block::GetPreferredWeightsLayout(
- const convolution_params &cp) const {
- auto block = get_out_block_size(cp);
- if (block.out_depth == 4)
- return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
- else
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-}
-
-bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const {
- if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
- return false;
- }
- const convolution_params& cp = static_cast<const convolution_params&>(p);
-
- // if block sizes are 1x1, then this algorithm is probably not the best
- auto block = get_out_block_size(cp);
- if (block.out_width == 1 && block.out_height == 1)
- return false;
-
- if (cp.output.X().v % block.out_width != 0)
- return false;
- if (cp.output.Y().v % block.out_height != 0)
- return false;
-
- if (cp.filterSize.x == 1)
- return false;
-
- return true;
-}
-
-size_t static get_wg_batch_count(const convolution_params& params) {
- if (params.inputs[0].Batch().v % 64 == 0)
- return 16; // because we process 4 batches per SIMD
- return 1;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- constexpr size_t sub_group_size = 8;
-
- runInfo.efficiency = FORCE_PRIORITY_5;
-
- auto block = get_out_block_size(arg);
-
- runInfo.gws0 = arg.output.X().v / block.out_width;
- runInfo.gws1 = arg.output.Y().v / block.out_height;
- runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
- block.out_depth; // process 4 output channels per Workitem
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- const int sub_group_size = 8;
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- const size_t in_x_pitch = 32 * 4;
- const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
- const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
- const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
- const size_t in_offset =
- in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
- jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
- jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
- const size_t out_x_pitch = 32 * 4;
- jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
-
- auto block = get_out_block_size(params);
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
- jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
-
- jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params,
- const optional_params& options) const {
- KernelsData kd = GetCommonKernelsData(params, options);
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_5;
- return kd;
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {}
- virtual ~ConvolutionKernel_mmad_batched_block() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
+++ /dev/null
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_batched_block_1x1.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBiasPerFeature();
- k.EnableBatching();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-struct block_params {
- int32_t out_width;
- int32_t out_height;
- int32_t out_depth;
-};
-
-static block_params get_out_block_size(const convolution_params& p) {
- if (p.output.X().v == 7)
- return {7, 1, 4};
- else if (p.output.X().v == 14)
- return {7, 1, 4};
- else if (p.output.X().v == 28)
- return {4, 2, 4};
- else if (p.output.X().v == 56)
- return {8, 1, 4};
-
- return {1, 1, 1};
-}
-
-WeightsLayout ConvolutionKernel_mmad_batched_block_1x1::GetPreferredWeightsLayout(
- const convolution_params &cp) const {
- auto block = get_out_block_size(cp);
- if (block.out_depth == 4)
- return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
- else
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
-}
-
-bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const {
- if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
- return false;
- }
- const convolution_params& cp = static_cast<const convolution_params&>(p);
-
- // only for conv 1x1
- if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
- return false;
-
- // only for stride 1x1
- if (cp.stride.x != 1 || cp.stride.y != 1)
- return false;
-
- // if block sizes are 1x1, then this algorithm is probably not the best
- auto block = get_out_block_size(cp);
- if (block.out_depth != 4)
- return false;
-
- if (cp.output.X().v % block.out_width != 0)
- return false;
- if (cp.output.Y().v % block.out_height != 0)
- return false;
-
- return true;
-}
-
-size_t static get_wg_batch_count(const convolution_params& params) {
- if (params.inputs[0].Batch().v % 64 == 0)
- return 16; // because we process 4 batches per SIMD
- return 1;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- constexpr size_t sub_group_size = 8;
-
- runInfo.efficiency = FORCE_PRIORITY_3;
-
- auto block = get_out_block_size(arg);
-
- runInfo.gws0 = arg.output.X().v / block.out_width;
- runInfo.gws1 = arg.output.Y().v / block.out_height;
- runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
- block.out_depth; // process 4 output channels per Workitem
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- const int sub_group_size = 8;
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- const size_t in_x_pitch = 32 * 4;
- const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
- const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
- const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
- const size_t in_offset =
- in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
- jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
- jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
- const size_t out_x_pitch = 32 * 4;
- const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
-
- jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
- jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
-
- auto block = get_out_block_size(params);
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
- jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
-
- jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params,
- const optional_params& options) const {
- KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_3;
- return kd;
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {}
- virtual ~ConvolutionKernel_mmad_batched_block_1x1() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
+++ /dev/null
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "convolution_kernel_mmad_blocks.h"
-#include <vector>
-#include <utility>
-#include <string>
-#include <algorithm>
-
-namespace kernel_selector {
-ConvolutionKernel_mmad_blocks::ConvolutionKernel_mmad_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks") {
- // Generate the dispatch options to the auto-tuner.
- std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32};
- std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
- std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10};
- std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
- const size_t maxBlockSize = 240;
- for (auto executionMode : executionModes) {
- for (auto blockWidth : blockWidthSizes) {
- for (auto blockHeight : blockHeightSizes) {
- for (auto prefetch : prefetchSizes) {
- if (blockWidth * blockHeight <= maxBlockSize) {
- autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode});
- }
- }
- }
- }
- }
-}
-
-ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableInputDataType(Datatype::UINT8);
-
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::F32);
- k.EnableOutputDataType(Datatype::F16);
-
- k.EnableInputWeightsType(WeightsType::INT8);
-
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
-// k.EnableDilation(); TODO: Add dilation support
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableSplitSupport();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.EnableDifferentTypes();
- k.EnableDifferentInputWeightsTypes();
- k.DisableTuning();
- return k;
-}
-
-bool ConvolutionKernel_mmad_blocks::Validate(const Params& p, const optional_params& o) const {
- if (!Parent::Validate(p, o)) {
- return false;
- }
-
- return true;
-}
-
-static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
- // how many elements we will compute in each dimension
- size_t computed_x = Align(output_x, block_x);
- size_t computed_y = Align(output_y, block_y);
- // how many simds we need in each dimension
- size_t simds_x = computed_x / block_x;
- size_t simds_y = computed_y / block_y;
- // how many unused values we have in each dimension
- size_t unused_x = computed_x - output_x;
- size_t unused_y = computed_y - output_y;
-
- block_x -= unused_x / simds_x;
- block_y -= unused_y / simds_y;
-}
-
-ConvolutionKernel_mmad_blocks::AutoTuneOption ConvolutionKernel_mmad_blocks::GetAutoTuneOptions(
- const Params& p,
- int autoTuneIndex) const {
- if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
- return autoTuneOptions[autoTuneIndex];
- }
-
- // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
- constexpr size_t sub_group_size = 16;
-
- AutoTuneOption option = {0, 0, 0, DEFAULT};
-
- const convolution_params& cp = static_cast<const convolution_params&>(p);
-
- if (cp.stride.x == 1 && cp.stride.y == 1) {
- if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
- option.blockWidth = 16;
- option.blockHeight = 1;
- option.prefetch = 4;
- // if less than 16 values is required to compute one single row of output
- // then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice
- // performance results)
- } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) {
- option.blockWidth = cp.output.X().v;
- option.blockHeight = 1;
- option.prefetch = 4;
- } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) {
- option.blockWidth = sub_group_size - cp.filterSize.x + 1;
- option.blockHeight = 2;
- option.prefetch = 4;
- } else {
- option.blockWidth = 4;
- option.blockHeight = 3;
- option.prefetch = 4;
- }
- } else if (cp.stride.x == 2 && cp.stride.y == 2) {
- option.blockWidth = 5;
- option.blockHeight = 4;
- option.prefetch = 4;
- } else {
- option.blockWidth = 4;
- option.blockHeight = 3;
- option.prefetch = 5;
- // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
- }
-
- // if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block
- // sizes
- if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
- shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
- }
-
- return option;
-}
-
-static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(size_t output_block_width,
- size_t output_block_height,
- const uSize& filter_size,
- const uSize& stride,
- const uSize& dilation,
- size_t sub_group_size = 8,
- size_t read_chunk_size = 8,
- size_t min_read_size = 8) {
- assert(output_block_width > 0 && output_block_height > 0);
- assert(stride.x > 0 && stride.y > 0);
- assert(filter_size.x > 0 && filter_size.y > 0);
-
- // Number of elements in X dimension needed from input to compute output block without re-reading input.
- size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
- // Number of elements in Y dimension needed from input to compute output block without re-reading input.
- size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;
-
- // Required number of elements in X dimension rounded to nearest >= read chunk size.
- size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
- // Number of sub-group-sized vectors of unit type needed to store input block.
- size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sub_group_size);
-
- // size of our array per workitem
- input_block_array_size = input_block_req_height * input_block_read_width;
- return std::make_pair(input_block_array_size, input_block_read_width);
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_blocks::SetDefault(const convolution_params& cp,
- int autoTuneIndex) const {
- // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
- constexpr size_t sub_group_size = 8;
-
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
-
- auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
- runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
- runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
- runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
-
- auto input_block_dims =
- get_byxf_af32_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
- runInfo.cldnnStyle.blockHeight,
- cp.filterSize,
- cp.stride,
- cp.dilation,
- sub_group_size,
- runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
- sub_group_size);
- runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
- runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
-
- const auto of_maps = cp.output.Feature().v;
- const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
- runInfo.efficiency = FORCE_PRIORITY_3;
-
- runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
- runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
- runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = sub_group_size;
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_blocks::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = Parent::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
- jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
- jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
- jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
-
- jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- if (!params.fused_ops.empty()) {
- auto input_dt = GetActivationType(params);
- FusedOpsConfiguration conf_scalar = {"", {"b", "f", "(y+br)", "(x+bc)"}, "res", input_dt, 1 };
- jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
- }
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_blocks::GetKernelsData(const Params& params, const optional_params& options) const {
- KernelsData kd = GetTunedKernelsDataByIndex(params, options);
- if (!kd.empty())
- kd[0].estimatedTime = FORCE_PRIORITY_2;
-
- return kd;
-}
-
-KernelsData ConvolutionKernel_mmad_blocks::GetKernelsDataForAutoTune(const Params& params,
- const optional_params& options) const {
- if (!Validate(params, options)) {
- return {};
- }
-
- KernelsData res = {};
-
- for (size_t i = 0; i < autoTuneOptions.size(); i++) {
- KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
- if (!kd.empty()) {
- res.emplace_back(kd[0]);
- }
- }
-
- return res;
-}
-
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <string>
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_blocks : public ConvolutionKernelBase {
-public:
- using Parent = ConvolutionKernelBase;
- ConvolutionKernel_mmad_blocks();
- virtual ~ConvolutionKernel_mmad_blocks() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- bool Validate(const Params& p, const optional_params& o) const override;
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
- }
- std::vector<FusedOpType> GetSupportedFusedOps() const override {
- return { FusedOpType::ELTWISE,
- FusedOpType::QUANTIZE,
- FusedOpType::SCALE,
- FusedOpType::ACTIVATION };
- }
-
-private:
- struct AutoTuneOption {
- size_t blockWidth;
- size_t blockHeight;
- size_t prefetch;
- std::string exeMode;
- };
-
- AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
- std::vector<AutoTuneOption> autoTuneOptions = {};
-};
-} // namespace kernel_selector
+++ /dev/null
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_slm_2x14_rep4.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const {
- if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
- return false;
- }
-
- const convolution_params& cp = static_cast<const convolution_params&>(p);
-
- if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
- return false;
-
- if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56)
- return false;
-
- if (cp.stride.x != 1 || cp.stride.y != 1)
- return false;
-
- return true;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- runInfo.efficiency = FORCE_PRIORITY_1;
-
- const size_t rep_count = 4;
- const size_t batch_per_wi = 1;
- const size_t out_block_width = 14;
- const size_t out_block_height = 2;
- runInfo.gws0 = arg.output.Feature().v *
- (arg.output.Batch().v / (rep_count * batch_per_wi)); // number of tiles needed to cover output width
- runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
- runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;
-
- runInfo.lws0 = 32; // depth
- runInfo.lws1 = 1; // width
- runInfo.lws2 = 4; // height
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- const size_t in_x_pitch = 32 * 4;
- const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
- const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
- const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
- const size_t in_offset =
- in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
- jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
- jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2));
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params,
- const optional_params& options) const {
- return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase {
-public:
- ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {}
- virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- bool Validate(const Params& p, const optional_params& o) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
- }
-};
-} // namespace kernel_selector
\ No newline at end of file
+++ /dev/null
-/*
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "convolution_kernel_mmad_slm_7x7_rep4.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-
-ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableInputWeightsType(WeightsType::INT8);
- k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBiasPerFeature();
- k.EnableBiasPerOutput();
- k.EnableNonBiasTerm();
- k.EnableBatching();
- k.EnableQuantization(QuantizationType::SYMMETRIC);
- k.DisableTuning();
- return k;
-}
-
-bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const {
- if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
- return false;
- }
-
- const convolution_params& cp = static_cast<const convolution_params&>(p);
-
- if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
- return false;
-
- if (cp.stride.x != 1 || cp.stride.y != 1)
- return false;
-
- if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7)
- return true;
-
- if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14)
- return true;
-
- return false;
-}
-
-ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg,
- int) const {
- DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
-
- runInfo.efficiency = FORCE_PRIORITY_1;
-
- const size_t rep_count = 4;
- const size_t batch_per_wi = 4;
- const size_t out_block_width = 7;
- // const size_t out_block_height = 1;
- runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) /
- (rep_count * batch_per_wi); // number of tiles needed to cover output width
- runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
- // since this kernel only apply to 7x7 sizes we need to manually set gws2 to 8
- runInfo.gws2 = Align(arg.inputs[0].Y().v,
- 8); // 8;//((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;
-
- runInfo.lws0 = 16; // depth
- runInfo.lws1 = 1; // width
- runInfo.lws2 = 8; // height
-
- return runInfo;
-}
-
-JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params,
- const DispatchData& runInfo) const {
- auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
-
- jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
-
- // pitch for special block format used in this kernel
- const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
- const size_t filter_ofm_block_pitch =
- (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
- jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
- const size_t in_x_pitch = 32 * 4;
- const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
- const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
- const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
- const size_t in_offset =
- in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
- const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
-
- jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
- jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
- jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
- jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch));
- jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7));
- jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
- jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7)); // must be 7 since we process 7 in Y per workgroup
-
- return jit;
-}
-
-KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params,
- const optional_params& options) const {
- return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "convolution_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-
-class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase {
-public:
- ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {}
- virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
-
-protected:
- JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
- DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
- bool Validate(const Params& p, const optional_params& o) const override;
- WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
- return WeightsLayout::os_is_yx_isa8_osv8_isv4;
- }
-};
-} // namespace kernel_selector
\ No newline at end of file
#include "convolution_kernel_bfyx_1x1_gemm_buf.h"
#include "convolution_kernel_winograd_2x3_s1_fused.h"
#include "convolution_kernel_winograd_6x3_s1_fused.h"
-#include "convolution_kernel_mmad.h"
-#include "convolution_kernel_mmad_blocks.h"
-#include "convolution_kernel_imad_byxf_af32_depthwise.h"
#include "convolution_kernel_bfyx_depthwise_weights_lwg.h"
#include "convolution_kernel_imad.h"
#include "convolution_kernel_fs_byx_fsv32.h"
#include "deformable_convolution_kernel_bfyx_interp.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp32.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp16.h"
-#include "convolution_kernel_imad_byxf_af32_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_dw.hpp"
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.h"
Attach<ConvolutionKernel_Winograd_2x3_s1_fused>();
Attach<ConvolutionKernel_Winograd_6x3_s1_fused>();
- // byxf_af32 int8
- Attach<ConvolutionKernel_mmad>();
- Attach<ConvolutionKernel_mmad_blocks>();
- Attach<ConvolutionKernel_imad_byxf_af32_1x1>();
- Attach<ConvolutionKernel_imad_byxf_af32_depthiwise>();
-
// b_fs_yx_fsv4 kernels
Attach<ConvolutionKernel_imad>();
Attach<ConvolutionKernel_imad_b_fs_yx_fsv4_1x1>();
k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
-
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.EnableBatching();
input_tile_ifm_pitch = zyx_pitch_factor * 16 * 16;
}
input_in_tile_batch_pitch = 16;
- } else if (in_layout == DataLayout::byxf_af32) {
- input_tile_ifm_pitch = tile_ifm;
- input_in_tile_batch_pitch = zyx_pitch_factor * Align(in.Feature().LogicalDimPadded(), 32);
}
jit.AddConstant(MakeJitConstant("INPUT_VALID_TILE_IFM_PITCH", input_tile_ifm_pitch != 0));
fsv = 16;
}
if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32
- || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32
- || params.inputs[0].GetLayout() == DataLayout::byxf_af32) {
+ || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32) {
fsv = 32;
}
k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
- k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableAllOutputLayout();
k.EnableDifferentTypes();
const auto& ewParams = static_cast<const eltwise_params&>(params);
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
- if (ewParams.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
- (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
+ if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;
}
- if (ewParams.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
- (ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
+ if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::bfyx);
- k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableOutputLayout(DataLayout::bf);
size_t input_y_pitch = input.Y().pitch;
size_t input_z_pitch = input.Z().pitch;
- if (input.GetLayout() == DataLayout::byxf_af32 || input.GetLayout() == DataLayout::bfyx) {
+ if (input.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32));
} else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
input_x_pitch = 32;
kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 ||
- output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::byxf_af32 ||
+ output.GetLayout() == DataLayout::byxf ||
output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
// Determine global work sizes.
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::bfyx);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
+++ /dev/null
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_byxf_af32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_byxf_af32::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::INT8);
- k.EnableInputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableOutputDataType(Datatype::UINT8);
- k.EnableOutputDataType(Datatype::F16);
- k.EnableOutputDataType(Datatype::F32);
- k.EnableInputLayout(DataLayout::byxf_af32);
- k.EnableOutputLayout(DataLayout::byxf_af32);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBatching();
- k.EnablePoolType(PoolType::MAX);
- k.EnablePoolType(PoolType::AVG);
- k.EnablePoolRemainder(PoolRemainder::FLOOR);
- k.EnablePoolRemainder(PoolRemainder::CEIL);
- k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
- k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
- k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
- k.EnableDifferentTypes();
- return k;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_byxf_af32::SetDefault(const pooling_params& params) const {
- constexpr int simdSize = 8;
-
- DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
- runInfo.gws0 = params.output.X().v;
- runInfo.gws1 = params.output.Y().v;
- // we got byxf_af32 format, so if we process 4 features per workitem, that means we process 32 per simd, so divide
- // by 4 and we end up with 8
- runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * params.output.Batch().v) / 4;
-
- runInfo.lws0 = 1;
- runInfo.lws1 = 1;
- runInfo.lws2 = simdSize;
-
- return runInfo;
-}
-
-JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
- JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
-
- jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)"));
- jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
- jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
- if (!params.fused_ops.empty()) {
- auto input_dt = GetActivationType(params);
- FusedOpsConfiguration conf = {"",
- {"b", "f", "y", "x"},
- "fused_pool_result",
- input_dt,
- 4,
- LoadType::LT_UNALIGNED,
- BoundaryCheck::ENABLED,
- IndexType::TENSOR_COORD,
- Tensor::DataChannelName::FEATURE};
- jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
- }
-
- return jit;
-}
-
-
-KernelsData PoolingKerneGPU_byxf_af32::GetKernelsData(const Params& params, const optional_params& options) const {
- return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
-}
-} // namespace kernel_selector
+++ /dev/null
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-#include <vector>
-
-namespace kernel_selector {
-class PoolingKerneGPU_byxf_af32 : public PoolingKernelBase {
-public:
- PoolingKerneGPU_byxf_af32() : PoolingKernelBase("pooling_gpu_byxf_af32") {}
- virtual ~PoolingKerneGPU_byxf_af32() {}
-
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
- JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
- DispatchData SetDefault(const pooling_params& params) const override;
- std::vector<FusedOpType> GetSupportedFusedOps() const override {
- return { FusedOpType::ELTWISE,
- FusedOpType::QUANTIZE,
- FusedOpType::SCALE,
- FusedOpType::ACTIVATION };
- }
-};
-} // namespace kernel_selector
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::yxfb);
k.EnableInputLayout(DataLayout::byxf);
- k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::byxf);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
#include "pooling_kernel_gpu_byxf_opt.h"
#include "pooling_kernel_gpu_bfyx_block_opt.h"
#include "pooling_kernel_gpu_byxf_padding_opt.h"
-#include "pooling_kernel_gpu_byxf_af32.h"
#include "pooling_kernel_gpu_int8_ref.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
Attach<PoolingKernelGPUBfyxBlockOpt>();
Attach<PoolingKernelGPUByxfPaddingOpt>();
Attach<PoolingKernelGPUInt8Ref>();
- Attach<PoolingKerneGPU_byxf_af32>();
Attach<PoolingKerneGPU_b_fs_yx_fsv4>();
Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
Attach<PoolingKernel_b_fs_yx_fsv16>();
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
- k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
- k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
-#include "kernel_selector_utils.h"
-
-namespace kernel_selector {
-ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const {
- ParamsKey k;
- k.EnableInputDataType(Datatype::F32);
- k.EnableOutputDataType(Datatype::INT8);
- k.EnableDifferentTypes();
- k.EnableInputLayout(DataLayout::byxf);
- k.EnableOutputLayout(DataLayout::byx8_f4);
- k.EnableTensorOffset();
- k.EnableTensorPitches();
- k.EnableBatching();
- return k;
-}
-
-bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const {
- if (!ReorderKernelBase::Validate(p, o)) {
- return false;
- }
-
- const reorder_params& params = static_cast<const reorder_params&>(p);
-
- if (params.output.X().v % 16 != 0)
- return false;
-
- if (params.inputs[0].Feature().v != 3)
- return false;
-
- if (params.mode == MeanSubtractMode::IN_BUFFER && params.mean.LogicalSize() != params.inputs[0].Feature().v)
- return false;
-
- return true;
-}
-
-size_t static get_wg_batch_size(const reorder_params& params) {
- if (params.inputs[0].Batch().v % 16 == 0)
- return 16;
- return 1;
-}
-
-reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault(
- const reorder_params& params) const {
- DispatchData kd;
-
- const auto& input = params.inputs[0];
-
- kd.gws0 = input.X().v;
- kd.gws1 = input.Y().v;
- kd.gws2 = input.Batch().v;
-
- kd.lws0 = 16;
- kd.lws1 = 1;
- kd.lws2 = get_wg_batch_size(params);
-
- return kd;
-}
-
-JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const {
- auto jit = ReorderKernelBase::GetJitConstants(params);
- jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0]));
- jit.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params)));
- return jit;
-}
-
-KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params,
- const optional_params& options) const {
- const reorder_params& orgParams = static_cast<const reorder_params&>(params);
- return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5);
-}
-} // namespace kernel_selector
\ No newline at end of file
+++ /dev/null
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "reorder_kernel_base.h"
-
-namespace kernel_selector {
-class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase {
-public:
- reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {}
- virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {}
-
- bool Validate(const Params& p, const optional_params& o) const override;
- DispatchData SetDefault(const reorder_params& params) const override;
- KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
- ParamsKey GetSupportedKey() const override;
- JitConstants GetJitConstants(const reorder_params& params) const override;
-};
-} // namespace kernel_selector
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableInputLayout(DataLayout::bs_f_bsv8__af8);
k.EnableInputLayout(DataLayout::bs_f_bsv16__af8);
- k.EnableInputLayout(DataLayout::bf8_xy16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::bs_f_bsv8__af8);
k.EnableOutputLayout(DataLayout::bs_f_bsv16__af8);
- k.EnableOutputLayout(DataLayout::bf8_xy16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
#include "reorder_from_winograd_2x3_kernel.h"
#include "reorder_to_winograd_2x3_kernel.h"
#include "reorder_kernel_to_yxfb_batched.h"
-#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
#include "reorder_kernel_binary.h"
#include "reorder_biplanar_nv12.h"
#include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
Attach<ReorderFromWinograd2x3Kernel>();
Attach<ReorderToWinograd2x3Kernel>();
Attach<ReorderKernel_to_yxfb_batched>();
- Attach<reorder_kernel_byxf_f32_to_byx8_f4_i8>();
Attach<reorder_biplanar_nv12>();
Attach<ReorderKernel_fs_b_yx_fsv32_to_bfyx>();
}
return 16;
case DataLayout::b_fs_yx_fsv4:
return 4;
- case DataLayout::byxf_af32:
- return 16;
default:
break;
}
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
-
+
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_bfyx_1x1)(
- __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output,
- __global FILTER_TYPE* weights,
+ __global INPUT0_TYPE* input,
+ __global OUTPUT_TYPE* output,
+ __global FILTER_TYPE* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
-
+
uint input_idx = input_offset + k * 8 * xy_block_num * 16;
uint filter_idx = filter_offset + k * 8 * 16;
-
+
blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
for(uint i = 0; i < 16; i++)
{
- #if OUTPUT_LAYOUT_BF8_XY16
- const uint dst_index = GET_DATA_BF8_XY16_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
- #else
const uint dst_index = GET_DATA_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
- #endif
#if LEFTOVERS
if(group_f+i < OUTPUT_FEATURE_NUM)
#endif
+++ /dev/null
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#if STRIDE_SIZE_Y == DILATION_SIZE_Y
- #define BLOCK_Y_SIZE (FILTER_SIZE_Y + (SPLIT_Y - 1))
- #define LOAD_Y_WITH_STRIDES
-#else
- #define BLOCK_Y_SIZE ((SPLIT_Y - 1) * STRIDE_SIZE_Y + (FILTER_SIZE_Y - 1) * (DILATION_SIZE_Y - 1) + FILTER_SIZE_Y)
-#endif
-
-#if STRIDE_SIZE_X == DILATION_SIZE_X
- #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
- #define LOAD_X_WITH_STRIDES
-#else
- #define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
- #define LOAD_X_WITH_STRIDES
- #define DONT_USE_X_SHIFTS
-#endif
-
-__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
-KERNEL(convolution_gpu_byxf_af32_depthwise)(
- __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output,
- __global FILTER_TYPE* weights,
-#if BIAS_TERM
- __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
- FUSED_OPS_DECLS,
-#endif
- uint split_idx)
-{
- const uint x = get_global_id(1) * OUT_BLOCK_WIDTH;
- const uint y = get_global_id(2) * SPLIT_Y;
-#if OUTPUT_BATCH_NUM == 1
- const uint of = get_global_id(0);
- const uint b = 0;
-#else
- const uint of = (uint)get_global_id(0) % ALIGNED_OFM;
- const uint b = (uint)get_global_id(0) / ALIGNED_OFM;
-#endif
- const uint g = of;
-
- if (of >= OUTPUT_FEATURE_NUM)
- return;
-
- int dotProd[SPLIT_Y] = {0};
- OUTPUT_TYPE out[SPLIT_Y];
- const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
- const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- const uint filter_offset = g*FILTER_GROUPS_PITCH;
- const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + g*FILTER_IFM_NUM;
-
- // read all weights
- FILTER_TYPE w[FILTER_IFM_PITCH];
- __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
- for (int j = 0; j < FILTER_SIZE_Y; j++) {
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
- for (int i = 0; i < FILTER_SIZE_X; i++) {
- w[j * FILTER_SIZE_X + i] = weights[filter_offset + j * FILTER_Y_PITCH + i * FILTER_X_PITCH];
- }
- }
-
- // initial input read
- INPUT0_TYPE in[FILTER_SIZE_X_PRELOAD * BLOCK_Y_SIZE];
- __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
- for (int i = 0; i < BLOCK_Y_SIZE; i++) {
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
- for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
-#ifdef LOAD_Y_WITH_STRIDES
- int input_offset_y = input_y + i * DILATION_SIZE_Y;
-#else
- int input_offset_y = input_y + i;
-#endif
-#ifdef LOAD_X_WITH_STRIDES
- int input_offset_x = input_x + j * DILATION_SIZE_X;
-#else
- int input_offset_x = input_x + j;
-#endif
- uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
- in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
- }
- }
-
-#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
- FUSED_OPS_PRELOAD;
-#endif
-
- for (int l = 0; l < OUT_BLOCK_WIDTH; l++) {
- //calculate dotproduct
- __attribute__((opencl_unroll_hint(SPLIT_Y)))
- for (int i = 0; i < SPLIT_Y; i++) {
- __attribute__((opencl_unroll_hint(FILTER_IFM_PITCH)))
- for (int j = 0; j < FILTER_IFM_PITCH; j++) {
-#if defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
- const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
- dotProd[i] += (int)in[start_pos_y + j] * (int)w[j];
-#elif defined(LOAD_X_WITH_STRIDES) && !defined(LOAD_Y_WITH_STRIDES)
- const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
- const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
- const uint pos_x = (j % FILTER_SIZE_X);
- dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#elif defined(LOAD_Y_WITH_STRIDES) && !defined(LOAD_X_WITH_STRIDES)
- const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
- const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * FILTER_SIZE_X_PRELOAD;
- const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
- dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#else
- const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
- const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
- const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
- dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
-#endif // defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
- }
- }
-
- __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
- for (int i = 0; i < BLOCK_Y_SIZE; i++) {
- // inputs shift
-#ifndef DONT_USE_X_SHIFTS
-#if (FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X) > 0
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X)))
-#endif
- for (int j = 0; j < FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X; j++) {
- in[i * FILTER_SIZE_X_PRELOAD + j] = in[i * FILTER_SIZE_X_PRELOAD + j + STRIDE_SIZE_X];
- }
-#endif
-
- // read additional inputs
-#ifdef LOAD_Y_WITH_STRIDES
- int input_offset_y = input_y + i * DILATION_SIZE_Y;
-#else
- int input_offset_y = input_y + i;
-#endif // LOAD_Y_WITH_STRIDES
-
-#if defined(DONT_USE_X_SHIFTS)
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
- for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
- int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + j * DILATION_SIZE_X;
- uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
- in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
- }
-
-#else
- {
- int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) * DILATION_SIZE_X;
- uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
- in[i * FILTER_SIZE_X_PRELOAD + FILTER_SIZE_X_PRELOAD - 1] = input[input_idx];
- }
-#endif // defined(DONT_USE_X_SHIFTS)
- }
-
- __attribute__((opencl_unroll_hint(SPLIT_Y)))
- for (int m = 0; m < SPLIT_Y; m++) {
-#if BIAS_TERM
- #if BIAS_PER_OUTPUT
- #if OUTPUT_LAYOUT_BYXF_AF32 == 1
- const uint bias_index = GET_DATA_INDEX(BIAS, b, of, y + m, x + l);
- #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
- const uint bias_index = GET_DATA_B_FS_YX_FSV4_INDEX(BIAS, b, of, y + m, x + l);
- #else
- #error "Incorrect output layout"
- #endif
-#elif BIAS_PER_OFM
- const uint bias_index = of;
-#endif
- // TODO: Maybe half should be supported as well.
- float res = (float)dotProd[m] + biases[bias_index];
-#else
- float res = (float)dotProd[m];
-#endif
- dotProd[m] = 0;
-
-#if HAS_FUSED_OPS
-#if FUSED_OPS_CAN_USE_PRELOAD
- FUSED_OPS_CALC;
-#else
- FUSED_OPS;
-#endif
- out[m] = FUSED_OPS_RESULT;
-#else
- out[m] = TO_OUTPUT_TYPE(res);
-#endif
- }
-
- __attribute__((opencl_unroll_hint(SPLIT_Y)))
- for (int m = 0; m < SPLIT_Y; m++) {
-#ifdef SPLIT_LEFTOVERS
- if (y + m >= OUTPUT_SIZE_Y)
- continue;
-#endif
- const uint dst_index = OUTPUT_GET_INDEX(b, of, y + m, x + l);
- output[dst_index] = ACTIVATION(out[m], ACTIVATION_PARAMS);
- }
- } // OUT_BLOCK_WIDTH
-}
+++ /dev/null
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
-#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
-#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(convolution_MMAD)(
- __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output,
- __global FILTER_TYPE* weights,
-#if BIAS_TERM
- __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
- FUSED_OPS_DECLS,
-#endif
- uint split_idx)
-{
- const uint x = get_global_id(0);
- const uint y = get_global_id(1);
-#if OUTPUT_BATCH_NUM == 1
- const uint f = get_global_id(2);
- const uint b = 0;
-#else
- const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
- const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
-#endif
-
- int dotProd = 0;
-
- const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
- const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
-
- const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
- const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
-
- for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
- {
- for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
- {
- const int input_offset_y = input_y + j * DILATION_SIZE_Y;
- const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-
- if(!zero_y)
- {
- for (uint i = 0; i < FILTER_SIZE_X ; ++i)
- {
- const int input_offset_x = input_x + i * DILATION_SIZE_X;
- const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-
- if(!zero_x)
- {
- uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
- uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
-
- PACKED_TYPE input_data = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
- MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes
- activations.s0 = sub_group_broadcast(input_data, 0);
- activations.s1 = sub_group_broadcast(input_data, 1);
- activations.s2 = sub_group_broadcast(input_data, 2);
- activations.s3 = sub_group_broadcast(input_data, 3);
- activations.s4 = sub_group_broadcast(input_data, 4);
- activations.s5 = sub_group_broadcast(input_data, 5);
- activations.s6 = sub_group_broadcast(input_data, 6);
- activations.s7 = sub_group_broadcast(input_data, 7);
-
- int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
-
- dotProd = MMAD_8(activations, weights_data, dotProd);
- }
- }
- }
- }
- }
-
-#if BIAS_TERM
-#if BIAS_PER_OUTPUT
- const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
-#elif BIAS_PER_OFM
- const uint bias_index = f;
-#endif
- float res = (float)dotProd + biases[bias_index];
-#else
- float res = (float)dotProd;
-#endif // BIAS_TERM
-
-#if HAS_FUSED_OPS
- FUSED_OPS;
- OUTPUT_TYPE result = FUSED_OPS_RESULT;
-#else
- OUTPUT_TYPE result = TO_OUTPUT_TYPE(res);
-#endif
-
- const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
- const uint dst_index = OUTPUT_GET_INDEX(b, f, y, x) + out_split_offset;
- output[dst_index] = result;
-}
-
-#undef FILTER_IFM_MMAD_NUM
-#undef FILTER_OFM_MMAD_NUM
-#undef FILTER_IFM_ALIGNED
-#undef FILTER_OFM_ALIGNED
+++ /dev/null
-// Copyright (c) 2016-2017 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/mmad.cl"
-
-#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
-#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
-#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(convolution_MMAD_blocks)(
- __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output,
- __global FILTER_TYPE* weights,
-#if BIAS_TERM
- __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
- FUSED_OPS_DECLS,
-#endif
- uint split_idx)
-{
- const uint x = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;
- const uint y = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT;
-#if OUTPUT_BATCH_NUM == 1
- const uint f = (uint)get_global_id(2);
- const uint b = 0;
-#else
- const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
- const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
-#endif
-
- int acc[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
- PACKED_TYPE in[IN_BLOCK_ARRAY_SIZE];
-
- const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
- const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
-
- const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
- const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
-
- uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH;
- uint filter_idx = filter_offset;
-
- __attribute__((opencl_unroll_hint(1)))
- for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
- {
- // preload input data
- for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE; in_block_pos++)
- {
- uint block_x = in_block_pos % IN_BLOCK_WIDTH;
- uint block_y = in_block_pos / IN_BLOCK_WIDTH;
- uint input_idx = in_addr + block_x * INPUT0_X_PITCH + block_y * INPUT0_Y_PITCH;
- in[in_block_pos] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
- }
- // end of preloading input data
-
- __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
- for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
- {
- __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
- for (uint i = 0; i < FILTER_SIZE_X ; ++i)
- {
- int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
-
- __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_HEIGHT)))
- for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
- {
- __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_WIDTH)))
- for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
- {
- PACKED_TYPE input_data = in[(br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i];
- MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes
- activations.s0 = sub_group_broadcast(input_data, 0);
- activations.s1 = sub_group_broadcast(input_data, 1);
- activations.s2 = sub_group_broadcast(input_data, 2);
- activations.s3 = sub_group_broadcast(input_data, 3);
- activations.s4 = sub_group_broadcast(input_data, 4);
- activations.s5 = sub_group_broadcast(input_data, 5);
- activations.s6 = sub_group_broadcast(input_data, 6);
- activations.s7 = sub_group_broadcast(input_data, 7);
-
- acc[br * OUTPUT_BLOCK_WIDTH + bc] = MMAD_8(activations, weights_data, acc[br * OUTPUT_BLOCK_WIDTH + bc]);
- }
- }
- filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel
- }
- }
- in_addr += 32; // 4 features per channel * 8 SIMD channels
- }
-
-#if BIAS_TERM
-#if BIAS_PER_OUTPUT
- const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
-#elif BIAS_PER_OFM
- const uint bias_index = f;
-#endif
-#endif // BIAS_TERM
-
- OUTPUT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
- for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
- {
- for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
- {
-#if BIAS_TERM
- // TODO: Maybe half should be supported as well.
- float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc] + biases[bias_index];
-#else
- float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc];
-#endif
-#if HAS_FUSED_OPS
- FUSED_OPS;
- out[br * OUTPUT_BLOCK_WIDTH + bc] = FUSED_OPS_RESULT;
-#else
- out[br * OUTPUT_BLOCK_WIDTH + bc] = TO_OUTPUT_TYPE(res);
-#endif
- }
- }
-
- const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
- for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
- {
- if(y + br < OUTPUT_SIZE_Y)
- {
- for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
- {
- if(x + bc < OUTPUT_SIZE_X)
- {
- const uint dst_index = OUTPUT_GET_INDEX(b, f, y+br, x+bc) + out_split_offset;
- output[dst_index] = out[br * OUTPUT_BLOCK_WIDTH + bc];
- }
- }
- }
- }
-}
-
-#undef FILTER_IFM_MMAD_NUM
-#undef FILTER_OFM_MMAD_NUM
-#undef FILTER_IFM_ALIGNED
-#undef FILTER_OFM_ALIGNED
+++ /dev/null
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "include/common.cl"
-
-#include "include/data_types.cl"
-#include "include/fetch.cl"
-#include "include/imad.cl"
-
-#if defined(ACCUMULATOR_TYPE)
-#undef ACCUMULATOR_TYPE
-#endif
-
-#if QUANTIZATION_TERM
-# define ACCUMULATOR_TYPE int
-# define ACTIVATION_TYPE float
-# define TO_ACTIVATION_TYPE(x) convert_float(x)
-#else
-# define ACCUMULATOR_TYPE INPUT0_TYPE
-# define ACTIVATION_TYPE INPUT0_TYPE
-# define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
-#endif
-
-
-#define FILTER_IFM_SLICES_NUM ((FILTER_IFM_NUM + 31) / 32)
-#define FILTER_OFM_NUM_ALIGNED ((FILTER_OFM_NUM + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE * SUB_GROUP_SIZE)
-
-// we are packing 4 8bit activations per 32 bit
-#define PACK 4
-
-#define AS_TYPE_N_(type, n, x) as_##type##n(x)
-#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
-#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
-
-__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-KERNEL(fused_conv_eltwise_gpu_af32_imad_1x1)(
- const __global PACKED_TYPE* input,
- __global OUTPUT_TYPE* restrict output,
- const __global uint* weights,
-#if BIAS_TERM
- __global BIAS_TYPE* biases,
-#endif
-#if HAS_FUSED_OPS_DECLS
- FUSED_OPS_DECLS,
-#endif
- uint split_idx)
-{
- const uint x = (uint)get_global_id(0) * TILE_LENGTH % OUTPUT_SIZE_X;
- const uint y = (uint)get_global_id(0) * TILE_LENGTH / OUTPUT_SIZE_X;
- const uint f = (((uint)get_global_id(1) * TILE_DEPTH) % FILTER_OFM_NUM_ALIGNED) / (TILE_DEPTH * SUB_GROUP_SIZE) * (TILE_DEPTH * SUB_GROUP_SIZE);
- const uint b = ((uint)get_global_id(1) * TILE_DEPTH) / FILTER_OFM_NUM_ALIGNED;
- const uint lid = get_sub_group_local_id();
-
- const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
- const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- PACKED_TYPE input_slice[TILE_LENGTH];
- int8 weights_slice;
- ACCUMULATOR_TYPE accu[TILE_LENGTH][TILE_DEPTH] = {0};
-
- uint filter_idx = f * FILTER_IFM_SLICES_NUM * 32 / PACK;
- uint in_addr = (INPUT0_GET_INDEX(b, 0, input_y, input_x)) / PACK;
-
- __attribute__((opencl_unroll_hint(1)))
- for (uint k = 0; k < FILTER_IFM_SLICES_NUM; ++k)
- {
- // Read 32 input features for each pixel in the tile. 4 features in each int, 8 ints across SIMD
- __attribute__((opencl_unroll_hint(TILE_LENGTH)))
- for (uint i = 0; i < TILE_LENGTH; ++i)
- {
- uint tmp_addr = in_addr + i * INPUT0_X_PITCH * STRIDE_SIZE_X / PACK;
- input_slice[i] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)input + tmp_addr));
- }
-
- // Loop through TILE_DEPTH output features
- __attribute__((opencl_unroll_hint(TILE_DEPTH)))
- for (uint of = 0; of < TILE_DEPTH; ++of)
- {
- // Read 32 weights. 8 ints, 4 weights in each int, each SIMD lane has own weghts
- weights_slice = as_int8(intel_sub_group_block_read8(weights + filter_idx));
-
- __attribute__((opencl_unroll_hint(TILE_LENGTH)))
- for (uint i = 0; i < TILE_LENGTH; ++i)
- {
- PACKED_TYPE A_scalar;
- A_scalar = sub_group_broadcast(input_slice[i], 0); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s0));
- A_scalar = sub_group_broadcast(input_slice[i], 1); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s1));
- A_scalar = sub_group_broadcast(input_slice[i], 2); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s2));
- A_scalar = sub_group_broadcast(input_slice[i], 3); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s3));
- A_scalar = sub_group_broadcast(input_slice[i], 4); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s4));
- A_scalar = sub_group_broadcast(input_slice[i], 5); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s5));
- A_scalar = sub_group_broadcast(input_slice[i], 6); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s6));
- A_scalar = sub_group_broadcast(input_slice[i], 7); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s7));
- }
-
- filter_idx += 32 * 8 / 4; // 32 features per channel * 8 SIMD channels / sizeof(int)
- }
- in_addr += 4 * 8 / 4; // 4 features per channel * 8 SIMD channels / sizeof(int) -> next 32 input features
- }
-
-#if TILE_DEPTH == 8
- MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result[TILE_LENGTH];
-#elif TILE_DEPTH == 4
- MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result[TILE_LENGTH];
-#endif
-
- uint dst_index = (OUTPUT_GET_INDEX(b, f, y, x)) / PACK;
-
- __attribute__((opencl_unroll_hint(TILE_LENGTH)))
- for (uint i = 0; i < TILE_LENGTH; ++i)
- {
-
- __attribute__((opencl_unroll_hint(TILE_DEPTH)))
- for (uint j = 0; j < TILE_DEPTH; ++j)
- {
- const uint f2 = f + lid * 4 + (j % 4) + (j / 4 * 32);
- ACCUMULATOR_TYPE dotProd = accu[i][j];
-#if BIAS_TERM
- #if BIAS_PER_OUTPUT
- const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
- #elif BIAS_PER_OFM
- const uint bias_index = f2;
- #endif
- ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]);
-#else
- ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
-#endif //BIAS_TERM
-
- #if HAS_FUSED_OPS
- FUSED_OPS;
- result[i][j] = FUSED_OPS_RESULT;
- #else
- result[i][j] = TO_OUTPUT_TYPE(res);
- #endif
- }
- }
-
- __attribute__((opencl_unroll_hint(TILE_LENGTH)))
- for (uint i = 0; i < TILE_LENGTH; ++i)
- {
-#if TILE_DEPTH == 8
- intel_sub_group_block_write2((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint2(result[i]));
-#elif TILE_DEPTH == 4
- intel_sub_group_block_write((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint(result[i]));
-#endif
- }
-}
-#undef FILTER_IFM_SLICES_NUM
-#undef FILTER_OFM_NUM_ALIGNED
-#undef ACCUMULATOR_TYPE
-#undef ACTIVATION_TYPE
-#undef TO_ACTIVATION_TYPE
#else
in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
#endif
- // TODO This will cause errors for byxf_af32 format on input
in_addr += (INPUT0_SIZE_X + IWPAD); // move to next row down
#endif
}
if(!zero_c)
#endif
{
- #if OUTPUT_LAYOUT_BYXF_AF32 == 1
- uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
- #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
+ #if OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH) \
)
-inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset)
-{
- const uint xy_idx = x + y * x_size;
- const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
- const uint xy_block_num = (x_size * y_size + 16 - 1) / 16;
- const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
- const uint f_block_num = (f_size + 8 - 1) / 8;
- const uint b_offset = b * f_block_num * xy_block_num * 128;
-
- const size_t idx = offset + xy_offset + f_offset + b_offset;
-
- return idx;
-}
-
inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
uint x_size, uint y_size, uint f_size, uint b_size,
uint b_pad_before, uint b_pad_after,
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z))
-inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint f_pad_before, uint f_pad_after, uint offset)
-{
- const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32;
- const uint x_pitch = f_pad_before + f_aligned_to_32 + f_pad_after;
- const uint b_offset = b * b_pitch;
- const uint xy_offset = x_pitch * x + y_pitch * y;
- const uint f_offset = f;
- const size_t idx = offset + xy_offset + b_offset + f_offset;
- return idx;
-}
-
-#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x) \
- FUNC_CALL(get_byxf_af32_index)( \
- b, f, y, x, CAT(prefix, _Y_PITCH), \
- CAT(prefix, _BATCH_PITCH), \
- CAT(prefix, _FEATURE_NUM), \
- CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \
- CAT(prefix, _PAD_AFTER_FEATURE_NUM), \
- CAT(prefix, _OFFSET))
-
-inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x,
- uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset)
-{
- const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4;
- const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
- const uint b_offset = b * b_pitch;
- const uint xy_offset = x * x_pitch + y * y_pitch;
- const uint f_offset = f;
- const size_t idx = offset + xy_offset + b_offset + f_offset;
- return idx;
-}
-
-#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x) \
- FUNC_CALL(get_byx8_f4_index)( \
- b, f, y, x, CAT(prefix, _X_PITCH), \
- CAT(prefix, _Y_PITCH), \
- CAT(prefix, _BATCH_PITCH), \
- CAT(prefix, _FEATURE_NUM), \
- CAT(prefix, _SIZE_X), \
- CAT(prefix, _OFFSET))
-
-#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \
- FUNC_CALL(get_bf8_xy16_index)( \
- b, f, y, x, CAT(prefix, _SIZE_X ), \
- CAT(prefix, _SIZE_Y), \
- CAT(prefix, _FEATURE_NUM), \
- CAT(prefix, _OFFSET))
-
-inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x,
- uint x_pad_before, uint x_size, uint x_pad_after,
- uint y_pad_before, uint y_size, uint y_pad_after,
- uint size_f, uint size_b)
-{
- const uint f_32_aligned = ((size_f + 31)/32) * 32;
- const uint b_4_aligned = ((size_b + 3)/4) * 4;
- const uint fsv_idx = f % 32;
- const uint bsv_idx = b % 4;
- const uint fs_idx = f / 32;
- const uint bs_idx = b / 4;
-
- const uint x_pitch = 32 * 4;
- const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after);
- const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after);
- const uint fs_pitch = bs_pitch * (b_4_aligned / 4);
- uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before;
-
- size_t idx = offset + fsv_idx + bsv_idx * 32;
- idx += 32*4 * x;
- idx += y * y_pitch;
- idx += bs_idx * bs_pitch;
- idx += fs_idx * fs_pitch;
-
- return idx;
-}
-
-#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x) \
- FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)( \
- b, f, y, x, \
- CAT(prefix, _PAD_BEFORE_SIZE_X), \
- CAT(prefix, _SIZE_X), \
- CAT(prefix, _PAD_AFTER_SIZE_X), \
- CAT(prefix, _PAD_BEFORE_SIZE_Y), \
- CAT(prefix, _SIZE_Y), \
- CAT(prefix, _PAD_AFTER_SIZE_Y), \
- CAT(prefix, _FEATURE_NUM), \
- CAT(prefix, _BATCH_NUM))
-
#define GET_FILTER_GOIYX(prefix, g, o, i, y, x) \
CAT(prefix, _OFFSET) + \
(x)*CAT(prefix, _X_PITCH) + \
}
#endif
-#if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
+#if OUTPUT_LAYOUT_B_FS_YX_FSV4
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
#if OUTPUT_FEATURE_NUM % 4 == 0
*((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
+++ /dev/null
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
- #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
- #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
- #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
- return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
- return tmp + in;
-#endif
-}
-
-KERNEL(pooling_gpu_byxf_af32)(
- const __global INPUT0_TYPE* input,
- __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
- , FUSED_OPS_DECLS
-#endif
-)
-{
- const uint x = (uint)get_global_id(0);
- const uint y = (uint)get_global_id(1);
- const uint bf = (uint)get_global_id(2);
- // we process 4 features per workitem that's why we need to divide it
- const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
- const uint f = 4 * (bf % (aligned32_features / 4));
- const uint b = bf / (aligned32_features / 4);
-
- typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_t;
- if (x >= OUTPUT_SIZE_X)
- {
- return;
- }
-
- const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
- const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
- ACCUMULATOR_VEC4 result = INIT_VAL;
-
-#ifdef CHECK_BOUNDRY
- if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
- offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
- {
- return;
- }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
- uint num_elementes = 0;
-#endif
-
- const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0);
- for(uint j = 0; j < POOL_SIZE_Y; j++)
- {
- int input_offset_y = offset_y + j;
- bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
- if(!zero_y)
- {
- for(uint i = 0; i < POOL_SIZE_X; i++)
- {
- int input_offset_x = offset_x + i;
- bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
- if(!zero)
- {
- const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
-
- input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
- result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
- result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
- result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
- result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
- num_elementes++;
-#endif
- }
- }
- }
- }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
- const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
- const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
- const uint num_elementes = (hend - offset_y) * (wend - offset_x);
-#endif
-#else
- uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);
-
- for(uint j = 0; j < POOL_SIZE_Y; j++)
- {
- for(uint i = 0; i < POOL_SIZE_X; i++)
- {
- input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
- result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
- result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
- result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
- result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-
- input_idx += INPUT0_X_PITCH;
- }
- input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
- }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
-#if defined AVG_POOLING
-#if ENABLE_ROUND
- int4 not_fused_result;
- for (uint i = 0; i < 4; ++i) {
- #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1)));
- #else
- not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
- #endif
- }
-#else // ENABLE_ROUND
- float4 not_fused_result;
- for (uint i = 0; i < 4; ++i) {
- #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
- not_fused_result[i] = (float)result[i] / max(num_elementes, (uint)1);
- #else
- not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
- #endif
- }
-#endif // ENABLE_ROUND
-#else // AVG_POOLING
- float4 not_fused_result = convert_float4(result);
-#endif // AVG_POOLING
-
- OUTPUT_VEC4 final_result;
-#if HAS_FUSED_OPS
- ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result);
- FUSED_OPS;
- final_result = FUSED_OPS_RESULT;
- for(uint op = 0; op < 4; op++)
- {
- const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
- output[output_pos] = final_result[op];
- }
-#else
- final_result = TO_OUTPUT_VEC4(not_fused_result);
- for(uint op = 0; op < 4; op++)
- {
- const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
- final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS));
- output[output_pos] = final_result[op];
- }
-#endif
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
#endif
)
{
-#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BYXF_AF32 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
+#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
const uint x = (uint)get_global_id(0);
const uint yz = (uint)get_global_id(1);
#if OUTPUT_DIMS == 5
)
{
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\
- OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
+ OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4
const uint x = (uint)get_global_id(0);
#if OUTPUT_DIMS == 5
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYXF_AF32
- return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYX8_F4
- return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
- return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_B_YX_FSV32
+++ /dev/null
-// Copyright (c) 2016-2017 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/reshape_dims.cl"
-#include "include/fetch.cl"
-
-#include "include/data_types.cl"
-
-///////////////////////// Input Index /////////////////////////
-inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
-{
-#if INPUT0_SIMPLE
- return GET_DATA_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
- defined INPUT0_LAYOUT_BS_F_BSV16__AF8
- return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYXF_AF32
- return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYX8_F4
- return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32
- return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4
- return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x);
-#else
-#error reorder_data.cl: input format - not supported
-#endif
-}
-
-///////////////////////// Output Index /////////////////////////
-
-inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x)
-{
-#if OUTPUT_SIMPLE
- return GET_DATA_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
- defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
- return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYXF_AF32
- return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_BYX8_F4
- return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
- return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
- return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
-#else
-#error reorder_data.cl: output format - not supported
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(16)))
-KERNEL (reorder_data_byxf_f32_to_byx8_f4_i8)(
- const __global INPUT_REORDER_TYPE* input,
- __global OUTPUT_REORDER_TYPE* output
-#ifdef MEAN_SUBTRACT_IN_BUFFER
- , __global MEAN_SUBTRACT_TYPE* mean_subtract
-#endif
- )
-{
- const uint x = get_global_id(0);
- const uint y = get_group_id(1);
- const uint b = (uint)get_group_id(2) * WG_BATCH_SIZE + (uint)get_sub_group_id();
-
- const uint input_idx = FUNC_CALL(get_input_index)(b, 0, y, x);
- const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x);
-
-#if defined MEAN_SUBTRACT_INSIDE_PARAMS
- float4 res;
- res.s0 = TO_MEAN_TYPE(input[input_idx]);
- res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]);
- res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
- res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]);
- res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
- res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]);
- res.s3 = 0;
-#elif defined MEAN_SUBTRACT_IN_BUFFER
-#if defined MEAN_PER_FEATURE
- MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
- res.s0 = TO_MEAN_TYPE(input[input_idx]);
- res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
- res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
- res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
- res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
- res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
- res.s3 = 0
-#else
- MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
- res.s0 = TO_MEAN_TYPE(input[input_idx]);
- res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
- res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
- res.s3 = 0;
-
- res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
- res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
- res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
-#endif
-#else
- MAKE_VECTOR_TYPE(CALC_TYPE, 4) res;
- res.s0 = TO_CALC_TYPE(input[input_idx]);
- res.s1 = TO_CALC_TYPE(input[input_idx+1]);
- res.s2 = TO_CALC_TYPE(input[input_idx+2]);
- res.s3 = 0;
-#endif
-
- char4 out_vals;
- out_vals.s0 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s0), ACTIVATION_PARAMS_TYPED);
- out_vals.s1 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s1), ACTIVATION_PARAMS_TYPED);
- out_vals.s2 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s2), ACTIVATION_PARAMS_TYPED);
- out_vals.s3 = 0;
-
- __global uint* dst = (__global uint*)output;
- dst[output_idx/4] = as_uint(out_vals);
-}
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
defined INPUT0_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_ZYX_FSV16
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined OUTPUT_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_ZYX_FSV16
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
defined INPUT0_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
-#elif defined INPUT0_LAYOUT_BF8_XY16
- return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
-#elif defined INPUT0_LAYOUT_BYXF_AF32
- return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_FS_B_YX_FSV32
raw_index_func_val = "GET_DATA_INDEX_RAW(" + _name + ", b, f, y, x)";
} else if (layout == DataLayout::b_fs_yx_fsv16 ||
layout == DataLayout::b_fs_yx_fsv32 ||
- layout == DataLayout::byxf_af32 ||
- layout == DataLayout::fs_bs_yx_bsv4_fsv32 ||
layout == DataLayout::b_fs_yx_fsv4 ||
layout == DataLayout::fs_b_yx_fsv32 ||
layout == DataLayout::bs_fs_yx_bsv16_fsv16) {
case kernel_selector::DataLayout::b_fs_zyx_fsv32: return "B_FS_ZYX_FSV32";
case kernel_selector::DataLayout::bs_f_bsv8__af8: return "BS_F_BSV8__AF8";
case kernel_selector::DataLayout::bs_f_bsv16__af8: return "BS_F_BSV16__AF8";
- case kernel_selector::DataLayout::bf8_xy16: return "BF8_XY16";
case kernel_selector::DataLayout::winograd_2x3_s1_data: return "WINOGRAD_2x3_S1_DATA";
- case kernel_selector::DataLayout::byxf_af32: return "BYXF_AF32";
- case kernel_selector::DataLayout::byx8_f4: return "BYX8_F4";
- case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32: return "FS_BS_YX_BSV4_FSV32";
case kernel_selector::DataLayout::b_fs_yx_fsv4: return "B_FS_YX_FSV4";
case kernel_selector::DataLayout::b_fs_yx_32fp: return "B_FS_YX_32FP";
case kernel_selector::DataLayout::bfzyx: return "BFZYX";
return {output_type, format::b_fs_yx_32fp, output_size};
}
- // due to performance reason for using fs_bs_yx_bsv4_fsv32 first convolution have 3 features, so first conv layer
- // will take byxf and return fs_bs_yx_bsv4_fsv32
- if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
- input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
- return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
- }
-
- auto users = node.get_users();
- if (users.size() == 1 && users.front()->is_type<convolution>()) {
- auto conv_split = users.front()->as<convolution>().get_split();
- auto conv_groups = (int32_t)users.front()->as<convolution>().get_groups();
-
- bool next_is_dw = ((conv_split > 1 && conv_split == output_size.feature[0]) ||
- (conv_groups > 1 && conv_groups == output_size.feature[0]));
-
- if (input_layout.data_type == data_types::i8 && input_layout.format == format::b_fs_yx_fsv4 && next_is_dw) {
- return layout{output_type, cldnn::format::byxf_af32, output_size};
- }
-
- auto prev_node = node.get_dependencies().front();
- if (prev_node->is_type<reorder>())
- prev_node = prev_node->get_dependencies().front();
-
- auto prev_is_convo = prev_node->is_type<convolution>();
- if (prev_is_convo) {
- auto prev2_node = prev_node->get_dependencies().front();
- auto prev_input_format = prev2_node->get_output_layout().format;
-
- if (input_layout.data_type == data_types::i8 && input_layout.format == format::byxf_af32 && !next_is_dw &&
- prev_input_format == format::b_fs_yx_fsv4) {
- return layout{output_type, cldnn::format::b_fs_yx_fsv4, output_size};
- }
- }
- }
return {output_type, input_layout.format, output_size};
}
output_range.spatial[1],
output_range.spatial[2]);
- // due to performance reason for using fs_bs_yx_bsv4_fsv32 first convolution have 3 features, so first conv layer
- // will take byxf and return fs_bs_yx_bsv4_fsv32
- if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
- input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
- return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
- }
-
return {output_type, input_layout.format, output_size};
}
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
// MMAD
- {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), concatenation_gpu::create},
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
// block f16 format
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
// MMAD
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw);
- implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
+++ /dev/null
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#ifdef CODE_PREFIX
-#define CODE_BEGIN CODE_PREFIX
-#define CODE_END CODE_POSTFIX
-#else
-#define CODE_BEGIN
-#define CODE_END
-#endif
-
-CODE_BEGIN
-enum neural_memory_format {
- x_f32,
- xb_f32, // 1D+batch, float32
- bx_f32, // 1D+batch, float32
- yxfb_f32, // 3D+batch, float32
- byxf_f32, // for convolution_cpu_jit_batch1
- bfyx_f32, // used in Caffe
- fyxb_f32, // used in Caffe
- oiyx_f32, // format used only for weights: o - output feature maps, i - input feature maps
- byxf_b24_f32, // for convolution_cpu_generic
- yxoi_o4_f32, // for convolution_cpu_generic
- os_yxi_sv16_f32, // format used only for weights: os - output slice, i - input feature maps, sv16 - 16 values of single slice
- bs_yxf_bv24_f32,
- any=-1
-};
-
-#pragma pack(push, 4)
-typedef struct _neural_memory_tag {
- uint format;
- uint feature_offset;
- uint spatial_offset;
- uint vector_size;
- uint data_offset;
- uint data[1];
-} neural_memory;
-
-typedef struct _neural_vector_tag {
- uint feature_offset;
- uint spatial_offset;
- uint raw_size;
- uint data[1];
-} neural_vector;
-#pragma pack(pop)
-
-// neural_memory accessors
-__attribute__((overloadable)) __global uint* get_raw(__global neural_memory* mem) { return &(mem->data[0]); }
-__attribute__((overloadable)) const __global uint* get_raw(const __global neural_memory* mem) { return &(mem->data[0]); }
-__attribute__((overloadable)) uint get_raw_size(const __global neural_memory* mem) { return mem->vector_size; }
-
-__attribute__((overloadable)) __global uint* get_batch(__global neural_memory* mem) { return get_raw(mem); }
-__attribute__((overloadable)) const __global uint* get_batch(const __global neural_memory* mem) { return get_raw(mem); }
-__attribute__((overloadable)) uint get_batch_size(const __global neural_memory* mem) { return mem->feature_offset; }
-
-__attribute__((overloadable)) __global uint* get_feature(__global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
-__attribute__((overloadable)) const __global uint* get_feature(const __global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
-__attribute__((overloadable)) uint get_feature_size(const __global neural_memory* mem) { return mem->spatial_offset - mem->feature_offset; }
-
-__attribute__((overloadable)) __global uint* get_spatial(__global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
-__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
-__attribute__((overloadable)) uint get_spatial_size(const __global neural_memory* mem) { return get_raw_size(mem) - mem->spatial_offset; }
-
-__attribute__((overloadable)) __global void* get_data(__global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
-__attribute__((overloadable)) const __global void* get_data(const __global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
-__attribute__((overloadable)) size_t get_element_size(const __global neural_memory* mem) { return sizeof(float); }
-
-__attribute__((overloadable)) size_t get_data_size(const __global neural_memory* mem) {
- size_t result = get_element_size(mem);
-
- const __global uint* raw = get_raw(mem);
- uint raw_size = get_raw_size(mem);
-
- for(uint i = 0; i < raw_size; i++) {
- result *= raw[i];
- }
- return result;
-}
-
-// neural_vector accessors
-// TODO NOTE: non-const accessors are disabled now, because read-only neural_vector argument is only supported now
-
-//__attribute__((overloadable)) __global uint* get_raw(__global neural_vector* v) { return &(v->data[0]); }
-__attribute__((overloadable)) const __global uint* get_raw(const __global neural_vector* v) { return &(v->data[0]); }
-__attribute__((overloadable)) uint get_raw_size(const __global neural_vector* v) { return v->raw_size; }
-
-//__attribute__((overloadable)) __global uint* get_batch(__global neural_vector* v) { return get_raw(v); }
-__attribute__((overloadable)) const __global uint* get_batch(const __global neural_vector* v) { return get_raw(v); }
-__attribute__((overloadable)) uint get_batch_size(const __global neural_vector* v) { return v->feature_offset; }
-
-//__attribute__((overloadable)) __global uint* get_feature(__global neural_vector* v) { return &(v->data[v->feature_offset]); }
-__attribute__((overloadable)) const __global uint* get_feature(const __global neural_vector* v) { return &(v->data[v->feature_offset]); }
-__attribute__((overloadable)) uint get_feature_size(const __global neural_vector* v) { return v->spatial_offset - v->feature_offset; }
-
-//__attribute__((overloadable)) __global uint* get_spatial(__global neural_vector* v) { return &(v->data[v->spatial_offset]); }
-__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_vector* v) { return &(v->data[v->spatial_offset]); }
-__attribute__((overloadable)) uint get_spatial_size(const __global neural_vector* v) { return get_raw_size(v) - v->spatial_offset; }
-
-CODE_END
-
-/*
-KERNEL(Fully_Connected_GPU)
-DECALRE_CONSTANT()
-BEGIN_ARGUMENTS_DECLARATION
-DECLARE_INPUT_MEMORY_ARGUMENT(input_mem)
-DECLARE_INPUT_MEMORY_ARGUMENT(weights_mem)
-DECLARE_INPUT_MEMORY_ARGUMENT(bias_mem)
-DECLARE_OUTPUT_MEMORY_ARGUMENT(dst_mem)
-END_ARGUMENTS_DECLARATION
-CODE_BEGIN
-#define WEIGHTS { 1.0, 3.2, 4.5, 6.7 }
-#define WEIGHTS_SIZE { 2, 2 }
-#define WEIGHTS_DIM 2
-*/
-__kernel void Fully_Connected_GPU(__global neural_memory* input_mem, __global neural_memory* weights_mem, __global neural_memory* bias_mem, __global neural_memory* dst_mem)
-{
- __global uint* input_size = get_raw(input_mem);
- __global uint* weights_size = get_raw(weights_mem);
- __global float* input = (__global float*)get_data(input_mem);
- __global float* weights = (__global float*)get_data(weights_mem);
- __global float* bias = (__global float*)get_data(bias_mem);
- __global float* pDst = (__global float*)get_data(dst_mem);
-
- const int x = get_global_id(0);
-
- pDst[x] = 0;
- uint outXIdx = x / input_size[0];
- uint inputBatchIdx = x % input_size[0];
- uint weightYIdx = outXIdx * weights_size[0];
- for (uint i = 0; i < input_size[2]; i++)
- {
- pDst[x] += input[i * input_size[0] + inputBatchIdx] * weights[weightYIdx + i];
- }
- pDst[x] += bias[outXIdx];
-}
-CODE_END
-
-CODE_BEGIN
-__kernel void Convolution_GPU(
- const __global neural_memory* input_mem,
- const __global neural_memory* filter_mem,
- float bias,
- __global neural_memory* dst_mem,
- const __global neural_vector* spatial_stride)
-{
-
-//
- const __global uint* input_size = get_raw(input_mem);
- const __global uint* filter_size = get_raw(filter_mem);
- const __global uint* dst_size = get_raw(dst_mem);
- const __global float* input = (const __global float*)get_data(input_mem);
- const __global float* filter = (const __global float*)get_data(filter_mem);
- __global float* pDst = (__global float*)get_data(dst_mem);
-//
-
- int global_id = get_global_id(0);
- const int batch_num = dst_size[0];
- const int batch_offset = global_id % dst_size[0];
-
- const int idx = global_id / batch_num;
-
- const int x = (idx % input_size[2]) * get_spatial(spatial_stride)[0];
- const int y = (idx * get_spatial(spatial_stride)[1]) / input_size[2];
-
- const int out_offset = idx * batch_num + batch_offset;
-
- pDst[out_offset] = 0;
- for (uint i = 0; i < filter_size[4]; i++)
- {
- for (uint j = 0; j < filter_size[3]; j++)
- {
- int input_idx = (x + j + ((y + i) * input_size[2])) * batch_num + batch_offset;
- int filter_idx = i * filter_size[3] + j;
- pDst[out_offset] += input[input_idx] * filter[filter_idx];
- }
- }
- pDst[out_offset] += bias;
-}
-CODE_END
{ std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
// MMAD
- { std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create },
- { std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), eltwise_gpu::create },
- { std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), eltwise_gpu::create },
{std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw},
// MMAD
- {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw},
- {std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw},
// IMAD
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
fused_conv_eltwise_gpu::create);
- // MMAD
- implementation_map<fused_conv_eltwise>::add(
- std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32),
- fused_conv_eltwise_gpu::create);
// IMAD
- implementation_map<fused_conv_eltwise>::add(
- std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
- fused_conv_eltwise_gpu::create);
- implementation_map<fused_conv_eltwise>::add(
- std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
- fused_conv_eltwise_gpu::create);
- implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32),
+ implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
- implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
+ implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
- fused_conv_eltwise_gpu::create);
+ fused_conv_eltwise_gpu::create);
}
} // namespace detail
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
- implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
- implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);
-
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
- implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
- implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
- implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
- implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
-
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw);
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create},
- {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}});
+ {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}});
}
} // namespace detail
(l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
return false;
- // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
- if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
- return false;
-
if (l.format == format::bs_fs_yx_bsv16_fsv16)
return false;
// right now output padding optimization is only available for bfyx format and data type = float32
if (conv_layout.format != cldnn::format::bfyx &&
- conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
- conv_layout.format != cldnn::format::byxf_af32 &&
- conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&
conv_layout.format != cldnn::format::fs_b_yx_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_32fp) {
for (auto& dep : eltw_node->get_dependencies()) {
format fmt = dep->get_output_layout().format;
data_types dep_dt = dep->get_output_layout().data_type;
- if ((fmt != format::fs_bs_yx_bsv4_fsv32 || dep_dt != data_types::i8) &&
- (fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
+ if ((fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
(fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::u8) &&
- (fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
- (fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
(fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))
auto fmt_map = get_preferred_formats(p, lo);
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);
-
- // WA START ============================================================================================================
- if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
- // This is a temprorary work-around for known bad case until byxf_af32 handling will be corrected in layout_optimizer.
- //
- // Find pattern:
- // mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
- // Replace with:
- // mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
- //
- // Generally for such convolution b_fs_yx_fsv16 will always perform better than byxf_af32,
- // but to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions this WA is needed.
- // Additionally reorder from af32 -> bfyx will take ~9 times longer than actual convolution.
- for (auto& node_ptr : p.get_processing_order()) {
- if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
- continue;
-
- auto& conv_node = node_ptr->as<convolution>();
-
- bool input_path =
- conv_node.input().get_output_layout().data_type == data_types::i8 &&
- conv_node.input().is_type<mvn>() &&
- fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
- bool output_path =
- conv_node.get_users().size() == 1 &&
- conv_node.get_users().front()->is_type<mvn>() &&
- fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
- conv_node.get_users().front()->get_users().size() == 1 &&
- !conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;
-
- if (!input_path || !output_path)
- continue;
-
- auto in_lay = conv_node.input().get_output_layout();
- auto out_lay = conv_node.get_output_layout();
- auto wei_lay = conv_node.weights().get_output_layout();
- bool correct_layouts =
- // weights
- wei_lay.data_type == data_types::i8 &&
- wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
- // input/output
- in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
- in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
- in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
- in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;
-
- if (!correct_layouts)
- continue;
-
- bool correct_conv =
- conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
- !conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
- !conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
- conv_node.get_primitive()->dilation == tensor(1);
-
- if (!correct_conv)
- continue;
-
- fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
- fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
- }
- }
- // WA END ==============================================================================================================
-
insert_reorders(p, fmt_map, rf);
for (auto n : p.get_processing_order()) {
return "bs_xs_xsv8_bsv16";
case format::bs_x_bsv16:
return "bs_x_bsv16";
- case format::bf8_xy16:
- return "bf8_xy16";
case format::winograd_2x3_s1_data:
return "winograd_2x3_s1_data";
- case format::byxf_af32:
- return "byxf_af32";
- case format::byx8_f4:
- return "byx8_f4";
- case format::fs_bs_yx_bsv4_fsv32:
- return "fs_bs_yx_bsv4_fsv32";
case format::b_fs_yx_fsv4:
return "b_fs_yx_fsv4";
case format::b_fs_yx_32fp:
return kernel_selector::data_layout::bs_f_bsv8__af8;
case format::bs_xs_xsv8_bsv16:
return kernel_selector::data_layout::bs_f_bsv16__af8;
- case format::bf8_xy16:
- return kernel_selector::data_layout::bf8_xy16;
case format::winograd_2x3_s1_data:
return kernel_selector::data_layout::winograd_2x3_s1_data;
- case format::byxf_af32:
- return kernel_selector::data_layout::byxf_af32;
- case format::byx8_f4:
- return kernel_selector::data_layout::byx8_f4;
- case format::fs_bs_yx_bsv4_fsv32:
- return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32;
case format::b_fs_yx_fsv4:
return kernel_selector::data_layout::b_fs_yx_fsv4;
case format::b_fs_yx_32fp:
return cldnn::format::bs_xs_xsv8_bsv8;
case kernel_selector::data_layout::bs_f_bsv16__af8:
return cldnn::format::bs_x_bsv16;
- case kernel_selector::data_layout::bf8_xy16:
- return cldnn::format::bf8_xy16;
case kernel_selector::data_layout::winograd_2x3_s1_data:
return cldnn::format::winograd_2x3_s1_data;
- case kernel_selector::data_layout::byxf_af32:
- return cldnn::format::byxf_af32;
- case kernel_selector::data_layout::byx8_f4:
- return cldnn::format::byx8_f4;
- case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32:
- return cldnn::format::fs_bs_yx_bsv4_fsv32;
case kernel_selector::data_layout::b_fs_yx_32fp:
return cldnn::format::b_fs_yx_32fp;
case kernel_selector::data_layout::bfzyx:
size_t pitch = 1;
auto new_vals = vals;
- if (ks_layout == kernel_selector::Tensor::byxf_af32) {
- new_vals[3] = align_to(vals[3], 32);
- }
if (ks_layout == kernel_selector::Tensor::b_fs_yx_fsv32) {
new_vals[1] = align_to(vals[1], 32);
}
- if (ks_layout == kernel_selector::Tensor::fs_bs_yx_bsv4_fsv32) {
- new_vals[3] = align_to(vals[3], 32);
- new_vals[2] = align_to(vals[2], 4);
- }
- if (ks_layout == kernel_selector::Tensor::byx8_f4) {
- new_vals[3] = align_to(vals[3], 4);
- new_vals[2] = align_to(vals[2], 8);
- }
if (ks_layout == kernel_selector::Tensor::bs_fs_yx_bsv16_fsv16) {
new_vals[0] = align_to(vals[0], 16);
new_vals[1] = align_to(vals[1], 16);
if (next.is_type<fully_connected>() &&
(fmt_prev == format::bfyx || fmt_prev == format::yxfb ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::fs_b_yx_fsv32 ||
- fmt_prev == format::byxf_af32 || fmt_prev == format::b_fs_yx_fsv32 ||
+ fmt_prev == format::b_fs_yx_fsv32 ||
(fmt_prev == format::b_fs_yx_fsv4 &&
prev_output_layout.size.feature[0] % 32 == 0 &&
prev_output_layout.size.spatial[0] == 1 &&
prev_output_layout.size.spatial[1] == 1)))
return true;
- if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::b_fs_yx_fsv4 && next.as<convolution>().get_groups() != 1)
- return true;
-
- if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::bfyx)
- return true;
-
- if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::byxf_af32 && next.as<convolution>().get_groups() == 1)
- return true;
-
if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
return true;
if (next.is_type<convolution>() &&
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] == 3 &&
- (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
+ (fmt_next == format::b_fs_yx_fsv4 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;
// nothing to do, just go out from here.
} else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) ||
(_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) {
- // commented out due to performance reasons, maybe enable in future
- /*if (current_layout.data_type == data_types::f32 &&
- current_layout.size.batch[0] % 16 == 0 &&
- current_layout.format == format::bfyx &&
- output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
- prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
- prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
- !node.get_transposed())
- {
- if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] *
- current_layout.size.spatial[1]) == 16 && current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
- {
- expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
- expected_format = cldnn::format::bf8_xy16;
- }
- }
- else*/
{
expected_tensor = current_layout.size;
if (current_layout.format == format::b_fs_zyx_fsv16 || current_layout.format == format::bs_fs_zyx_bsv16_fsv16)
((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
(layout.size.feature[0] % 32 == 0)) &&
            // TODO: check if this condition is always correct
- ((layout.format == format::byxf_af32 && layout.size.feature[0] == rec_list._memory->get_layout().size.feature[0]) ||
- (layout.format != format::byxf_af32 && layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0])) &&
+ layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0] &&
layout.size.batch[0] <= rec_list._memory->get_layout().size.batch[0] &&
rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 &&
layout.format != format::fs_b_yx_fsv32 &&
/*
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
return {false, false};
if (l1.get_linear_size() != l2.get_linear_size())
return {false, false};
- if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
- (l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) ||
- (l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
+ if ((l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
(l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4) ||
(l1.format == format::fs_b_yx_fsv32 && l2.format != format::fs_b_yx_fsv32) ||
(l2.format == format::fs_b_yx_fsv32 && l1.format != format::fs_b_yx_fsv32) ||
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
- options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
+ options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input);
EXPECT_EQ(output_layout.format, format::bfyx);
- EXPECT_EQ(y_size, output_size.spatial[1]);
+ EXPECT_EQ(y_size, output_size.spatial[1]);
EXPECT_EQ(x_size, output_size.spatial[0]);
EXPECT_EQ(f_size, output_size.feature[0]);
EXPECT_EQ(b_size, output_size.batch[0]);
.all_test_params(format::bfyx, false, true)
.all_test_params(format::bfyx, true, false)
.all_test_params(format::b_fs_yx_fsv4)
- // byxf_af32 - depthwise broken for batch > 1
- // .smoke_test_params(format::byxf_af32)
.all_test_params(format::b_fs_yx_fsv32)
.all_test_params(format::b_fs_yx_fsv32, true, true)
.all_test_params(format::b_fs_yx_fsv32, false, true)
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_i8_test::PrintToStringParamName
);
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_u8_test::PrintToStringParamName
);
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_f32_test::PrintToStringParamName
);
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_i8_test::PrintToStringParamName
);
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_u8_test::PrintToStringParamName
);
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
- testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
+ testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_f32_test::PrintToStringParamName
);
EXPECT_EQ(outputs_act.begin()->first, "eltwise");
auto output_act = outputs_act.begin()->second.get_memory();
- auto&& out_act_layout = output_act.get_layout();
auto out_act_ptr = output_act.pointer<uint8_t>();
topology topology_ref(
EXPECT_EQ(outputs_ref.begin()->first, "out");
auto output_ref = outputs_ref.begin()->second.get_memory();
- auto&& out_ref_layout = output_ref.get_layout();
auto out_ref_ptr = output_ref.pointer<uint8_t>();
for (int i = 0;i < 3 * 256 * 4;i++) {
bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
}), );
-
-class conv_int8_byxf_af32 : public ConvFusingTest {};
-TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
- auto p = GetParam();
- create_topologies(input_layout("input", get_input_layout(p)),
- data("weights", get_mem(get_weights_layout(p))),
- data("bias", get_mem(get_bias_layout(p))),
- data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
- convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
- scale("scale", "conv_prim", "scale_data"),
- reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
- );
-
- implementation_desc conv_impl = { format::byxf_af32, "" };
- bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
-
- tolerance = 1e-5f;
- execute(p);
-}
-
-TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
- auto p = GetParam();
- create_topologies(input_layout("input", get_input_layout(p)),
- data("weights", get_mem(get_weights_layout(p))),
- data("bias", get_mem(get_bias_layout(p))),
- data("eltwise_data", get_mem(get_output_layout(p))),
- convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
- eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
- reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
- );
-
- implementation_desc conv_impl = { format::byxf_af32, "" };
- bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
-
- tolerance = 1e-5f;
- execute(p);
-}
-
-INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32,
- ::testing::ValuesIn(std::vector<bc_test_params>{
- bc_test_params{CASE_CONV_U8S8_1, 2, 3},
- bc_test_params{CASE_CONV_U8S8_2, 2, 3},
- bc_test_params{CASE_CONV_U8S8_3, 2, 3},
- bc_test_params{CASE_CONV_U8S8_4, 2, 3},
- bc_test_params{CASE_CONV_U8S8_6, 2, 3},
- bc_test_params{CASE_CONV_S8S8_1, 2, 3},
- bc_test_params{CASE_CONV_S8S8_2, 2, 3},
- bc_test_params{CASE_CONV_S8S8_3, 2, 3},
- bc_test_params{CASE_CONV_S8S8_4, 2, 3},
- bc_test_params{CASE_CONV_S8S8_6, 2, 3},
- }), );
-
class conv_int8_prelu_eltwise : public ConvFusingTest {};
TEST_P(conv_int8_prelu_eltwise, basic) {
auto p = GetParam();
#define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
-#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
-#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"}, // FIXME - accuracy bug
- activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"}, // FIXME - accuracy bug
}), );
class activation_scale_activation_quantize_u8 : public ActivationFusingTest {};
activation_scale_activation_quantize_u8,
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"}, // FIXME - accuracy bug
- activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"}, // FIXME - accuracy bug
}), );
class activation_scale_activation : public ActivationFusingTest {};
activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"},
- activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
- activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"},
+ activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
// InputDataType = UINT8
activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"},
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"}, // FIXME - accuracy bug
- activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
- activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
}), );
/* ----------------------------------------------------------------------------------------------------- */
#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
-#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
-#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
-// Disabled
-#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
-#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
-
class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
public:
void execute(pooling_test_params& p) {
pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
- pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
- pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );
INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
pooling_scale_activation_quantize,
::testing::ValuesIn(std::vector<pooling_test_params>{
- pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
- pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
- pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"}, // currently not enabled, fusing not supported
}), );
pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
- pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
- pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
- pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
- pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
- pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );
-INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
- pooling_scale_activation,
- ::testing::ValuesIn(std::vector<pooling_test_params>{
- pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
- pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
- pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
- pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
- pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
- }), );
-
/* ----------------------------------------------------------------------------------------------------- */
/* -------------------------------- DepthToSpace cases ------------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
- format::byxf_af32,
format::b_fs_yx_fsv4,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32)),
format::b_fs_yx_fsv16,
format::fs_b_yx_fsv32,
format::b_fs_yx_fsv32,
- format::b_fs_yx_fsv4,
- format::fs_bs_yx_bsv4_fsv32)),
+ format::b_fs_yx_fsv4)),
testing::internal::DefaultParamName<pooling_random_test_params>);
TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride)
resample_random_test,
testing::ValuesIn(
resample_random_test_param_generator()
- .smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
- .smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
.smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, cldnn::format::b_fs_yx_fsv16, 754);
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::fs_b_yx_fsv32, 675);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::fs_b_yx_fsv32, 1507);
-
- // Formats with alignment:
- test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byxf_af32, 675);
- test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byxf_af32, 1507);
- test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byx8_f4, 331);
- test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byx8_f4, 1755);
-
- // Non-standard blocked formats:
- // bf8_xy16 - b_fs_es_fsv8_esv16, where e is flattened yx := x + y * size_x
- test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::bf8_xy16, 185);
- test_tensor_offset({ 2, 19, 7, 3 }, { 1, 18, 3, 2 }, cldnn::format::bf8_xy16, 1441);
-
}