2 // Copyright (c) 2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h"
18 #include "kernel_selector_utils.h"
20 namespace kernel_selector {
// Tiling parameters shared by the dispatch setup and the JIT constants below.
static constexpr size_t _SG_TILE_M = 32;      // sub-group tile extent along M
static constexpr size_t _SG_TILE_N = 32;      // sub-group tile extent along N
static constexpr size_t _SG_SIZE = 8;         // sub-group size
static constexpr size_t _TILES_PER_SG_X = 1;  // tiles per sub-group in X (persistent threads)
static constexpr size_t _TILES_PER_SG_Y = 1;  // tiles per sub-group in Y (persistent threads)
28 ParamsKey ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const
31 k.EnableInputDataType(Datatype::INT8);
32 k.EnableOutputDataType(Datatype::INT8);
33 k.EnableInputWeightsType(WeightsType::INT8);
34 k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
35 k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
36 k.EnableTensorOffset();
37 k.EnableTensorPitches();
38 k.EnableBiasPerFeature();
40 k.EnableInt8Quantization();
41 k.EnableOutputCalibration();
46 bool ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const
48 if (!ConvolutionKernelBase::Validate(p, o) ||
49 !CovolutionCheckInput(p, o))
54 const convolution_params& cp = static_cast<const convolution_params&>(p);
56 // make sure it's 1x1 conv
57 if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
60 // make sure stride is 1x1
61 if (cp.stride.x != 1 || cp.stride.y != 1)
64 // input padding not supported
65 if (cp.inputs[0].X().pad.Total() != 0 ||
66 cp.inputs[0].Y().pad.Total() != 0 ||
67 cp.inputs[0].Feature().pad.Total() != 0 ||
68 cp.inputs[0].Batch().pad.Total() != 0)
71 // input and output spatial sizes must match
72 if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v))
75 const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v ;
76 const auto k = cp.inputs[0].Feature().v;
77 const auto n = cp.output.Feature().v ;
79 if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128
82 if (k % 32 != 0) // Matrix size K, Must be mutliple of 32
85 if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128
92 ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault(const convolution_params& arg, int) const
94 DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
96 runInfo.effiency = FORCE_PRIORITY_1;
98 size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v;
99 size_t mat_n = arg.output.Feature().v;
101 size_t _MATRIX_M = mat_m;
102 size_t _MATRIX_N = mat_n;
104 size_t _WG_TILE_M = 224;
105 size_t _WG_TILE_N = 128;
107 // Calculate number of threads needed
108 const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X;
109 const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y ;
111 // Define execution setup for kernel:
112 size_t globalWorkSize[3] = { threadsX, threadsY, 1 };
113 size_t localWorkSize[3] = { _SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1 };
115 runInfo.gws0 = globalWorkSize[0];
116 runInfo.gws1 = globalWorkSize[1];
117 runInfo.gws2 = globalWorkSize[2];
119 runInfo.lws0 = localWorkSize[0];
120 runInfo.lws1 = localWorkSize[1];
121 runInfo.lws2 = localWorkSize[2];
126 JitConstants ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
128 auto jit = Parent::GetJitConstants(params, runInfo);
130 jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, Must be mutliple of 32
131 jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be mutliple of 32
132 jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X));
133 jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y));
135 // Do not change values below
136 jit.AddConstant(MakeJitConstant("DIM_X", 0));
137 jit.AddConstant(MakeJitConstant("DIM_Y", 1));
138 jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32));
139 jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16));
140 jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M));
141 jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N));
142 jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE));
143 jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M"));
144 jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)"));
145 jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)"));
147 jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", ""));
148 jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", ""));
149 jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", ""));
151 const auto& input = params.inputs[0];
152 const auto& output = params.output;
154 auto m = output.X().v * output.Y().v * output.Batch().v;
155 auto k = input.Feature().v;
156 auto n = output.Feature().v;
158 jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M
159 jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be mutliple of 32
160 jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N
162 const size_t out_x_pitch = 32 * 4;
163 const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
164 const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded();
165 const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4);
166 const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before;
168 jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
169 jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
170 jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch));
171 jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch));
172 jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset));
174 bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0;
175 jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding));
180 KernelsData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData(const Params& params, const optional_params& options) const
182 KernelsData kd = GetCommonKernelsData(params, options);
184 kd[0].estimatedTime = FORCE_PRIORITY_1; //_3