1 // Copyright (c) 2018-2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "convolution_kernel_imad_3x3.h"
17 #include "kernel_selector_utils.h"
18 #include "common_tools.h"
22 // Kernel specific constants
25 // Threshold value used when choosing the output block size; Validate() requires the input width to be a multiple of it.
26 #define OUT_BLOCK_THRESHOLD 7
// Picks the output tile (block) width/height used by the 3x3 IMAD kernel.
// On return, output_block_w/output_block_h hold the chosen tile sizes; the
// bool result is true when the tile does not evenly divide the output and the
// generated kernel must therefore verify output ranges.
// NOTE(review): this excerpt has gaps — the `stride` and `kernel_size`
// parameters used below, the loop-body assignment to `block_size`, the `else`
// keywords and several closing braces are not visible here. Restore from the
// full file before editing; comments describe only what the visible lines show.
28 static bool getOutBlock_WH(size_t output_size,
31 size_t& output_block_w,
32 size_t& output_block_h) {
33 bool verify_output_ranges = false;
35 output_block_w = output_block_h = 0;
// A tile can never be wider than the output itself, nor than the SIMD width.
37 size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE;
// Largest tile for which one SIMD row still covers the needed input span
// (presumably tile*stride + (kernel_size-1) inputs — confirm against the
// kernel's input-fetch pattern).
39 size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride;
41 size_t max_posible_tile_size = upper_border < stride_restrictions ? upper_border : stride_restrictions;
// Best case: the maximal tile divides the output exactly — no range checks.
43 if (output_size % max_posible_tile_size == 0) {
44 output_block_w = max_posible_tile_size;
// Otherwise search for a divisor of output_size in
// [min_horisontal_block_size, max_posible_tile_size).
46 size_t min_horisontal_block_size = 2; // 4;
48 size_t block_size = 0;
50 for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) {
51 if (output_size % i == 0)
55 if (block_size != 0) {
56 output_block_w = block_size;
// No exact divisor found: take the maximal tile and ask the caller to emit
// output-range checks for the partially-filled last tile.
58 output_block_w = max_posible_tile_size;
59 verify_output_ranges = true;
// Small tiles (width <= 4) are made square; the taller-tile branch is not
// visible in this excerpt.
63 if (output_block_w <= 4)
64 output_block_h = output_block_w;
68 return verify_output_ranges;
71 namespace kernel_selector {
// Advertises to the kernel selector which data types, weight types, layouts
// and convolution features this kernel implementation supports.
// NOTE(review): the `ParamsKey k;` declaration, the trailing `return k;` and
// the function's closing brace are not visible in this excerpt — restore from
// the full file.
73 ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const {
// INT8/UINT8 activations in and out; INT8/UINT8 weights, and the input and
// weights types are allowed to differ.
75 k.EnableInputDataType(Datatype::INT8);
76 k.EnableInputDataType(Datatype::UINT8);
77 k.EnableOutputDataType(Datatype::INT8);
78 k.EnableOutputDataType(Datatype::UINT8);
79 k.EnableInputWeightsType(WeightsType::INT8);
80 k.EnableInputWeightsType(WeightsType::UINT8);
// Only the b_fs_yx_fsv4 blocked layout is handled, on both input and output.
81 k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
82 k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
83 k.EnableDifferentInputWeightsTypes();
84 k.EnableTensorOffset();
85 k.EnableTensorPitches();
// Bias is optional; when present it is per output feature.
87 k.EnableBiasPerFeature();
88 k.EnableNonBiasTerm();
// INT8 quantization with optional output calibration factors.
90 k.EnableInt8Quantization();
91 k.EnableOutputCalibration();
// Delegates entirely to the shared convolution path; this kernel adds no
// kernel-specific post-processing here.
// NOTE(review): the function's closing brace is not visible in this excerpt.
96 KernelsData ConvolutionKernel_imad_3x3::GetKernelsData(const Params& params, const optional_params& options) const {
97 return GetCommonKernelsData(params, options);
// Builds the JIT constant set for this kernel: the base convolution constants
// from the parent class, fused-activation macros (suffixed "_CONV"),
// input/output/filter geometry, and the output tile sizes chosen by
// getOutBlock_WH().
// NOTE(review): this excerpt is missing interior lines — at least the closing
// "});" of the AddConstants initializer, the declarations of `obw`/`obh`, the
// final `return mem_consts;` and the closing brace. Restore from the full file.
100 JitConstants ConvolutionKernel_imad_3x3::GetJitConstants(const convolution_params& params,
101 const DispatchData& kd) const {
102 auto mem_consts = Parent::GetJitConstants(params, kd);
// Activation macros get the "_CONV" suffix so they can be fused in-kernel.
104 auto activation_constants =
105 MakeActivationJitConstants(params.activation, "_CONV");
106 mem_consts.Merge(activation_constants);
108 const auto& input = params.inputs[0];
109 const auto& output = params.output;
// Resolve layout-dependent channel positions once, then index dims by them.
111 const auto& iDims = input.GetDims();
112 const auto& oDims = output.GetDims();
113 const auto& weights = params.weights;
114 const auto& wDims = weights.GetDims();
115 const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X);
116 const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y);
117 const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE);
118 const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
119 const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
120 const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
121 mem_consts.AddConstants({
122 MakeJitConstant("_IW", iDims[iX].v),
123 MakeJitConstant("_IH", iDims[iY].v),
// Input depth is padded to a multiple of 4 — matches the fsv4 layout's
// 4-feature blocking.
124 MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
125 MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
126 MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
127 MakeJitConstant("_OW", oDims[oX].v),
128 MakeJitConstant("_OH", oDims[oY].v),
// Output depth taken from the weights' OFM dimension.
129 MakeJitConstant("_OD", wDims[wOD].v),
130 MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
131 MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
132 MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
// NOTE(review): the weights dims are indexed with the *data*-tensor channel
// indices iY/iX — correct only if the weights layout puts Y/X at the same
// positions; confirm against WeightsTensor::Channelndex for this layout.
133 MakeJitConstant("K_HEIGHT", wDims[iY].v),
134 MakeJitConstant("K_WIDTH", wDims[iX].v),
135 MakeJitConstant("K_STRIDE", params.stride.x), // X and Y must be equal
// Tile sizes for the generated kernel (obw/obh declarations not visible here).
139 getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, obw, obh);
140 mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw), MakeJitConstant("OUT_BLOCK_HEIGHT", obh)});
// Computes the dispatch configuration (global/local ND-range) for this
// kernel: one work-item tile per OTWxOTH output block, SIMD_SIZE lanes over
// the depth*batch dimension.
// NOTE(review): this excerpt is missing interior lines — the rest of the
// signature, the `DispatchData kd` declaration, the assignment of
// `global`/`local` into kd, the `return kd;` and the closing brace, plus the
// `otw`/`oth` declarations. Restore from the full file before editing.
145 ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault(const convolution_params& params,
149 const auto& in = params.inputs[0];
150 const auto& output = params.output;
151 const auto& weights = params.weights;
152 const auto& iDims = in.GetDims();
153 const auto& oDims = output.GetDims();
154 const auto& wDims = weights.GetDims();
155 const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X);
156 const int iY = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::Y);
157 const int iB = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::BATCH);
158 const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
159 const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
// Same tile-size choice as GetJitConstants, so the ND-range matches the
// OUT_BLOCK_WIDTH/HEIGHT macros baked into the kernel.
162 getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, otw, oth);
164 std::vector<size_t> global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
165 // number of tiles needed to cover output width
166 (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw),
168 // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
169 // number of tiles needed to cover output height
170 (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth),
172 // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
173 // round depth range up
// NOTE(review): `x + (x % SIMD_SIZE)` does NOT round x up to a SIMD_SIZE
// multiple (e.g. x = 17 gives 18, not 32); RoundUp(x, SIMD_SIZE) looks
// intended — confirm before changing, callers may rely on current sizing.
174 ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE))};
// One sub-group of SIMD_SIZE lanes per work-group, along the depth dimension.
176 std::vector<size_t> local = {1, 1, SIMD_SIZE};
186 kd.cldnnStyle = {0, 0, 0, 0, 0};
187 kd.gemmStyle = {0, 0, 0, 0, 0, 0};
// "effiency" is the field's actual (misspelled) name in kernel_selector.
188 kd.effiency = FORCE_PRIORITY_1;
// Rejects parameter combinations this kernel cannot handle: unequal X/Y
// strides, filter sizes other than m_FilterSizeX x m_FilterSizeY, and input
// widths that are not a multiple of OUT_BLOCK_THRESHOLD.
// NOTE(review): the `return false;` bodies of each rejection branch, the
// final `return true;` and the closing brace are not visible in this excerpt.
193 bool ConvolutionKernel_imad_3x3::Validate(const Params& params, const optional_params& options) const {
194 if (!Parent::Validate(params, options)) {
// Clone the params so they can be viewed as convolution_params.
198 KernelData kd = KernelData::Default<convolution_params>(params);
199 convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
201 if (newParams.stride.x != newParams.stride.y) {
202 // Strides must be equal
204 } else if ((newParams.filterSize.x != m_FilterSizeX) || (newParams.filterSize.y != m_FilterSizeY)) {
205 // Kernel does not support such filter size
208 const auto& in = newParams.inputs[0];
209 const auto& iDims = in.GetDims();
210 const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X);
211 if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) {
212 // Input size must be multiple of OUT_BLOCK_THRESHOLD
219 } // namespace kernel_selector