2 // Copyright (c) 2018-2019 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_imad_3x3.h"
18 #include "kernel_selector_utils.h"
19 #include "common_tools.h"
22 // Kernel specific constants
25 // Threshold value to calculate the block size.
26 #define OUT_BLOCK_THRESHOLD 7
27 // For images 7x7 it's 7 (default), for 14x14 and above it's 14.
28 #define OUT_BLOCK_WIDTH 7
29 // For images 7x7 it's 1 (default), for 14x14 and above it's 2.
30 #define OUT_BLOCK_HEIGHT 1
// Computes the output tile size (outW x outH) that one work-item produces.
// Starts from the doubled defaults (used for 14x14-and-larger inputs) and
// falls back to the base OUT_BLOCK_WIDTH/OUT_BLOCK_HEIGHT when the input is
// small (inW <= OUT_BLOCK_THRESHOLD) or the doubled tile would exceed the
// SIMD lane budget (outW * Stride + Pad > SIMD_SIZE).
// NOTE(review): lines are elided in this view (no braces / no body for the
// second `if`); the trailing assert implies a further shrink step exists when
// even the base tile exceeds SIMD_SIZE — confirm against the full file.
32 static void getOutBlock_WH(size_t inW, size_t Stride, size_t Pad, size_t& outW, size_t& outH)
34 outW = OUT_BLOCK_WIDTH * 2;
35 outH = OUT_BLOCK_HEIGHT * 2;
// Shrink to the base tile if the input is small or the doubled tile
// does not fit within SIMD_SIZE lanes after stride and padding.
37 if ((inW <= OUT_BLOCK_THRESHOLD) ||
38 (outW * Stride + Pad > SIMD_SIZE)) {
39 outW = OUT_BLOCK_WIDTH;
40 outH = OUT_BLOCK_HEIGHT;
// Even the base tile may not fit; the (elided) body presumably reduces
// the tile further — TODO confirm.
42 if (outW * Stride + Pad > SIMD_SIZE) {
// Postcondition: the chosen tile must fit in the SIMD lane budget.
46 assert(outW * Stride + Pad <= SIMD_SIZE);
49 namespace kernel_selector {
// Capability key for this kernel: int8/uint8 activations and weights,
// int8/uint8 output, b_fs_yx_fsv4 input/output layout, per-feature bias
// (or no bias), int8 quantization with optional output calibration.
// NOTE(review): the `ParamsKey k;` declaration and `return k;` lines are
// elided in this view.
51 ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const
54 k.EnableInputDataType(Datatype::INT8);
55 k.EnableInputDataType(Datatype::UINT8);
56 k.EnableOutputDataType(Datatype::INT8);
57 k.EnableOutputDataType(Datatype::UINT8);
58 k.EnableInputWeightsType(WeightsType::INT8);
59 k.EnableInputWeightsType(WeightsType::UINT8);
// Only the 4-feature-blocked layout is supported on both ends.
60 k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
61 k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
// Activations and weights may use different int8 signedness.
62 k.EnableDifferentInputWeightsTypes();
63 k.EnableTensorOffset();
64 k.EnableTensorPitches();
66 k.EnableBiasPerFeature();
67 k.EnableNonBiasTerm();
69 k.EnableInt8Quantization();
70 k.EnableOutputCalibration();
// Public entry point: simply delegates to GetCommonKernelsData with the
// default execution mode / auto-tune index.
// NOTE(review): the return-type line and the `const Params& params,`
// parameter line are elided in this view.
76 ConvolutionKernel_imad_3x3::GetKernelsData(
78 const optional_params& options) const
80 return GetCommonKernelsData(params, options);
// Builds the JIT constant set handed to the OpenCL kernel: input/output
// spatial sizes and paddings, rounded-up feature depth, filter geometry,
// stride, SIMD/workgroup size, the per-work-item output tile, IMAD tiling
// factors, and signedness / swizzle control defines.
84 ConvolutionKernel_imad_3x3::GetJitConstants(
85 const convolution_params& params,
86 const DispatchData& kd) const
// Start from the base class's constants and append kernel-specific ones.
88 auto mem_consts = Parent::GetJitConstants(params, kd);
90 const auto& input = params.inputs[0];
91 const auto& output = params.output;
93 const auto& iDims = input.GetDims();
94 const auto& oDims = output.GetDims();
95 const auto& weights = params.weights;
96 const auto& wDims = weights.GetDims();
// Resolve per-layout channel positions so dims can be indexed by name.
97 const int iX = DataTensor::Channelndex(
98 input.GetLayout(), Tensor::DataChannelName::X);
99 const int iY = DataTensor::Channelndex(
100 input.GetLayout(), Tensor::DataChannelName::Y);
101 const int iB = DataTensor::Channelndex(
102 input.GetLayout(), Tensor::DataChannelName::BATCH);
103 const int iF = DataTensor::Channelndex(
104 input.GetLayout(), Tensor::DataChannelName::FEATURE);
105 const int wOD = WeightsTensor::Channelndex(
106 weights.GetLayout(), Tensor::WeightsChannelName::OFM);
107 const int oX = DataTensor::Channelndex(
108 output.GetLayout(), Tensor::DataChannelName::X);
109 const int oY = DataTensor::Channelndex(
110 output.GetLayout(), Tensor::DataChannelName::Y);
111 mem_consts.AddConstants({
112 MakeJitConstant("_IMAD_DEFINES", 1),
113 //MakeJitConstant("SCALE_FACTOR", m_ScaleFactor), //(255.0f / 700000.0f);
114 MakeJitConstant("_IW", iDims[iX].v),
115 MakeJitConstant("_IH", iDims[iY].v),
// Input depth rounded up to the IMAD packing factor of 4.
116 MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
117 MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
118 MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
119 MakeJitConstant("_OW", oDims[oX].v),
120 MakeJitConstant("_OH", oDims[oY].v),
// Output depth equals the weights' OFM count.
121 MakeJitConstant("_OD", wDims[wOD].v),
122 MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
123 MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
124 MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
// NOTE(review): weights dims are indexed with the *data*-tensor channel
// indices iY/iX rather than WeightsChannelName::Y/X indices — this is only
// correct if the spatial positions coincide for the layouts in use; verify.
125 MakeJitConstant("K_HEIGHT", wDims[iY].v),
126 MakeJitConstant("K_WIDTH", wDims[iX].v),
127 MakeJitConstant("K_STRIDE", params.stride.x), // X and Y must be equal
128 MakeJitConstant("BATCH_SIZE", iDims[iB].v),
129 MakeJitConstant("WORKGROUP_SIZE", "SIMD_SIZE"),
// Per-work-item output tile, shared with SetDefault so JIT constants and
// dispatch geometry stay consistent.
133 getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
135 mem_consts.AddConstants({
136 MakeJitConstant("OUT_BLOCK_WIDTH", obw),
137 MakeJitConstant("OUT_BLOCK_HEIGHT", obh)
140 // FM_TILE definition
141 mem_consts.AddConstants({
// IMAD multiplies 4 packed int8 values per instruction.
142 MakeJitConstant("IMAD_LENGTH", 4),
143 MakeJitConstant("SYSTOLIC_DEPTH", 1),
144 MakeJitConstant("FM_TILE", "(IMAD_LENGTH * SYSTOLIC_DEPTH)")
147 if (input.GetDType() == Datatype::UINT8) {
148 // For unsigned types IMAD convolution kernel should skip
149 // all negative values.
150 mem_consts.AddConstants({
151 MakeJitConstant("CONVO_UNSIGNED", 1)
// Non-fsv4 output layouts need the kernel to undo the internal swizzle.
155 if (params.output.GetLayout() != DataLayout::b_fs_yx_fsv4) {
156 mem_consts.AddConstants({
157 // Produce unswizzled results.
158 MakeJitConstant("TO_UNSWIZZLE", 1),
// Computes the dispatch geometry: global range covers the output-width and
// output-height tile grids and the (OFM x batch) depth, with a local group
// of SIMD_SIZE along the depth dimension.
167 ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault(
168 const convolution_params& params,
173 const auto& in = params.inputs[0];
174 const auto& weights = params.weights;
175 const auto& iDims = in.GetDims();
176 const auto& wDims = weights.GetDims();
// Per-layout channel positions for name-based dim indexing.
177 const int iX = DataTensor::Channelndex(
178 in.GetLayout(), Tensor::DataChannelName::X);
179 const int iY = DataTensor::Channelndex(
180 in.GetLayout(), Tensor::DataChannelName::Y);
181 const int iB = DataTensor::Channelndex(
182 in.GetLayout(), Tensor::DataChannelName::BATCH);
183 const int wOD = WeightsTensor::Channelndex(
184 weights.GetLayout(), Tensor::WeightsChannelName::OFM);
// Same tile computation as GetJitConstants, so grid and JIT agree.
187 getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
190 std::vector<size_t> global = {
191 //globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
192 // number of tiles needed to cover output width
193 (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw),
195 //globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
196 // number of tiles needed to cover output height
197 (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth),
199 // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
200 // round depth range up
// NOTE(review): `A + (A % SIMD_SIZE)` is NOT a round-up to a multiple of
// SIMD_SIZE (e.g. A=17, SIMD=16 gives 18). It only works when A is already
// a multiple, or by accident — confirm intended behavior / CheckWorkGroups.
201 ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE))
// One sub-group of SIMD_SIZE work-items along the depth axis.
204 std::vector<size_t> local = {1, 1, SIMD_SIZE};
214 kd.cldnnStyle = { 0 };
215 kd.gemmStyle = { 0 };
// (sic) "effiency" is the field's actual spelling in this codebase.
216 kd.effiency = FORCE_PRIORITY_1;
// Rejects parameter combinations this kernel cannot handle: unequal X/Y
// strides, a filter size other than m_FilterSizeX x m_FilterSizeY, and an
// input width that is not a multiple of OUT_BLOCK_THRESHOLD.
// NOTE(review): the `return false;` / `return true;` lines are elided in
// this view.
223 ConvolutionKernel_imad_3x3::Validate(
224 const Params& params,
225 const optional_params& options) const
// Base-class checks first (types, layouts, etc.).
227 if (!Parent::Validate(params, options))
232 KernelData kd = KernelData::Default<convolution_params>(params);
233 convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
235 if (newParams.stride.x != newParams.stride.y) {
236 // Strides must be equal
239 else if ((newParams.filterSize.x != m_FilterSizeX) ||
240 (newParams.filterSize.y != m_FilterSizeY)) {
241 // Kernel does not support such filter size
245 const auto& in = newParams.inputs[0];
246 const auto& iDims = in.GetDims();
247 const int iX = DataTensor::Channelndex(
248 in.GetLayout(), Tensor::DataChannelName::X);
249 if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) {
250 // Input size must be multiple of OUT_BLOCK_THRESHOLD
// Full kernel construction pipeline: validate, compute dispatch geometry,
// reorder weights to a supported layout, generate JIT constants and the
// entry point, and fill in the CL kernel data.
// NOTE(review): the return-type line, early-return bodies, and the
// `succeed` check after UpdateWeightsParams are elided in this view.
259 ConvolutionKernel_imad_3x3::GetCommonKernelsData(
260 const Params& params,
261 const optional_params& options,
262 const std::string exeMode,
263 int autoTuneIndex) const
265 if (!Validate(params, options))
270 KernelData kd = KernelData::Default<convolution_params>(params);
271 convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
272 DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
// Guard against an inconsistent global/local work-group computation.
273 if (!CheckWorkGroups(runInfo))
275 // Internal Error - wrong calculation of global/local work group sizes
// Reorder weights into one of this kernel's supported layouts if needed.
279 bool succeed = UpdateWeightsParams(
282 GetSupportedWeightLayouts(newParams),
283 kd.weightsReorderParams,
291 auto finalKernelName = GetKernelName(newParams);
292 auto cldnnJit = GetJitConstants(newParams, runInfo);
293 auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
294 auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
296 auto& kernel = kd.kernels[0];
297 FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration);
// (sic) "effiency" matches the DispatchData field's spelling.
299 kd.estimatedTime = runInfo.effiency;
300 kd.autoTuneIndex = autoTuneIndex;
304 } // GetCommonKernelsData