// Copyright (c) 2018-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"

#define FSV 16
#define SIMD 16
#if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
# define WEIGHTS_IS_PITCH                   (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (FSV * FSV)
# define WEIGHTS_IS_PITCH                   (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)

#elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x)   GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH        (FSV * FSV)
# define WEIGHTS_IS_PITCH                   (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
#endif
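// Layout note (an informal sketch, not normative): os_is_yx_osv16_isv16 stores
// weights in blocks of 16 output features (osv) by 16 input features (isv).
// WEIGHTS_IS_PITCH is the distance between consecutive isv-blocks of one
// osv-block, and WEIGHTS_FEATURE_BLOCK_PITCH is the distance between
// consecutive osv-blocks. E.g. for a 1x1 filter with FILTER_IFM_NUM = 32:
//   WEIGHTS_FEATURE_BLOCK_PITCH = ALIGN(32, 16) * 1 * 1 * 16 = 512
//   WEIGHTS_IS_PITCH            = 16 * 16 * 1 * 1            = 256
// so the main loop below steps filter_idx by WEIGHTS_IS_PITCH once per
// 16-input-channel slice.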
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
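// A minimal illustration (assuming INPUT0_TYPE is char): AS_INPUT0_TYPE_4(x)
// expands to as_char4(x), a bit-reinterpretation of one 32-bit uint as four
// signed 8-bit values; on a little-endian GPU as_char4(0x04030201u) yields
// (char4)(1, 2, 3, 4). No conversion or saturation is performed.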
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
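// Example arithmetic: CEIL_DIV(20, 16) = (20 + 15) / 16 = 2 and
// ALIGN(20, 16) = 2 * 16 = 32, i.e. round a up to the next multiple of b.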
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
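// Work-group geometry (informal): each work-group is FEATURE_SLM_SPLIT
// sub-groups of SIMD work-items. get_group_id(0) walks OUT_BLOCK_SPATIAL-sized
// spatial blocks, get_group_id(1) walks blocks of OUT_BLOCK_FEATURES * SIMD
// output features, and get_group_id(2) walks the batch.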
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
    const __global INPUT0_TYPE  *conv_input,
    __global OUTPUT_TYPE        *output,
    const __global FILTER_TYPE  *weights,
#if BIAS_TERM
    const __global BIAS_TYPE    *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx) {
    // Use group ids to make it easier for the compiler to prove these values sub-group uniform
    const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
    uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
    const uint out_b = (uint)get_group_id(2);
    uint out_f = out_fg + get_sub_group_local_id();

    const uint sglid = get_sub_group_local_id();

    uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
    uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
        uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
                            ? out_yx_shuffle
                            : min(out_yx_shuffle, max_local_yx - 1);
        out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
        out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
    }
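    // How the shuffle registers work (informal sketch): the OUT_BLOCK_SPATIAL
    // x/y coordinates of this sub-group's output block are spread across the
    // lanes, one coordinate per lane per register, and recovered later with
    // intel_sub_group_shuffle(out_x_shuffle[os / SIMD], os % SIMD). E.g. with
    // SIMD = 16, OUT_BLOCK_SPATIAL = 14 and OUTPUT_SIZE_X = 56, lane 3 of
    // out_x_shuffle[0] holds (out_yx_sg + 3) % 56. Out-of-range lanes are
    // clamped to the last valid yx so the address computation stays in bounds.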
    const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
    const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
    const uint ifm_per_sg = ifm_blocks_per_sg * FSV;

    uint feature_offset = 0;
    uint feature_blocks = ifm_blocks_per_sg;
#if FEATURE_SLM_SPLIT != 1
    feature_offset = get_sub_group_id() * ifm_per_sg;

    if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
        bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
        feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
        feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
    }
#endif
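    // Remainder distribution example (informal): with ifm_blocks = 10 and
    // FEATURE_SLM_SPLIT = 4, ifm_blocks_per_sg = 2 and 10 % 4 = 2, so
    // sub-groups 0..1 each take 3 input-feature blocks and sub-groups 2..3
    // take 2; feature_offset then becomes 0, 48, 96, 128 input features
    // (FSV = 16), covering all 160 features with no overlap.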
    uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);

    uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
    __attribute__((opencl_unroll_hint))
    for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
        uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
        uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
        input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
    }
    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < feature_blocks; ++k) {
        uint4 weights_val[OUT_BLOCK_FEATURES] = { };
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
        }

        uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
        }
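        // Load note (informal): each lane's vload4 brings in 16 bytes, i.e. a
        // full FSV = 16 input-feature slice for one spatial position packed as
        // 4 uints of four 8-bit values each, matching the isv16 weight block
        // loaded above.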
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
        // In some cases the compiler spills registers here due to the loop order.
        // Use a suboptimal order to avoid this, at the cost of instruction dispatch delays.
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            __attribute__((opencl_unroll_hint))
            for (uint ive = 0; ive < 4; ++ive) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#else
        __attribute__((opencl_unroll_hint))
        for (uint ive = 0; ive < 4; ++ive) {
            __attribute__((opencl_unroll_hint))
            for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
                    dotProd[ofb][os] = IMAD(dotProd[ofb][os],
                                            AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD)),
                                            AS_FILTER_TYPE_4(weights_val[ofb][ive]));
                }
            }
        }
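        // IMAD semantics (informal): IMAD(acc, a, b) accumulates a 4-way 8-bit
        // dot product into a 32-bit accumulator,
        //   acc + a.s0*b.s0 + a.s1*b.s1 + a.s2*b.s2 + a.s3*b.s3,
        // mapping to the hardware integer dot-product instruction where
        // available. The sub-group shuffle broadcasts spatial position os's
        // packed inputs from the lane that loaded them, so every lane (one
        // output feature each) reuses the same input bytes.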
        filter_idx += WEIGHTS_IS_PITCH;
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
        }
    }
#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature-split mode
# if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
# error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less than or equal to FEATURE_SLM_SPLIT
# endif

    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
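    // SLM sizing sketch (informal): partial_acc holds FEATURE_SLM_SPLIT - 1
    // copies of the per-sub-group partial sums; the sub-group that will reduce
    // a feature block keeps its own contribution in registers, so one copy is
    // never spilled. E.g. FEATURE_SLM_SPLIT = 4, OUT_BLOCK_FEATURES = 2,
    // SIMD = 16, OUT_BLOCK_SPATIAL = 8 gives 3 * 2 * 16 * 8 = 768 accumulators.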
    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
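    // Write phase of the split reduction (informal sketch): sub-group i with
    // i < OUT_BLOCK_FEATURES keeps feature block i in registers (moved into
    // dotProd[0]) and writes only its other blocks to SLM, while the remaining
    // sub-groups write every block. sgid_start_idx skips the reader's own
    // slot, which is why only FEATURE_SLM_SPLIT - 1 copies are allocated.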
    if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
            if (get_sub_group_id() == wg) {
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                    dotProd[0][os] = dotProd[wg][os];
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                        const uint partial_acc_ptr_idx =
                            ((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
                            ofb * OUT_BLOCK_SPATIAL * SIMD +
                            os * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
                    }
                }
            }
        }
    } else {
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                const uint partial_acc_ptr_idx =
                    ofb * OUT_BLOCK_SPATIAL * SIMD +
                    os * SIMD;
                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            const uint partial_acc_ptr_idx =
                wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
                os * SIMD;
            dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
        }
    }
#endif
#if FEATURE_SLM_SPLIT == 1
# define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
#else
# define FINAL_OUT_BLOCK_FEATURES 1
    out_f += get_sub_group_id() * SIMD;
    out_fg += get_sub_group_id() * SIMD;
#endif

    if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
        return;
#if BIAS_TERM
    BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        bias_val[ofb] = biases[out_f + ofb * SIMD];
    }
#endif
    // Convert accumulator type to activation type
    ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
            dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
#if BIAS_TERM
            dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
#endif
        }
    }
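    // Dequantization note (informal): with int8 inputs dotProd is a raw int32
    // accumulator; TO_ACTIVATION_TYPE converts it to the activation type
    // (typically float) so the post-ops below operate in real arithmetic,
    // e.g. an accumulator of 1234 with a fused scale of 0.02f becomes 24.68f.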
    // Fused ops/activation
    OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
            FUSED_OPS_CALC_SCALAR;
#else
            FUSED_OPS_SCALAR;
#endif
            result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
#else
            result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
#endif
        }
    }
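    // Note (informal): the FUSED_OPS_* macros are generated on the host by the
    // kernel selector and expand to the fused post-ops (eltwise, quantize,
    // activation, ...). The PRELOAD/CALC variants hoist loads that do not
    // depend on the spatial index out of the inner loop.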
    // Check if block writes can be used
    bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
    bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
    bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
    bool can_write_x = only_x_block || (at_least_one_x_block && full_x);

    bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
    bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
    bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
    bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);

    bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);

    bool can_use_full_block_write = full_feature_block && (can_write_x || can_write_spatial);
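    // Example (informal): OUTPUT_SIZE_X = 56 with OUT_BLOCK_SPATIAL = 14 gives
    // only_x_block = true, so every block lies inside one output row and block
    // writes are safe. With OUTPUT_SIZE_X = 20, a block starting at x = 10
    // would cross the row boundary (full_x is false since 10 > 20 - 14); it
    // may still block-write across rows via can_write_spatial when there is no
    // x padding, because fsv16 rows are then contiguous in memory.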
    if (can_use_full_block_write) {
        uint output_idx = OUTPUT_GET_INDEX(out_b,
                                           out_fg,
                                           intel_sub_group_shuffle(out_y_shuffle[0], 0),
                                           intel_sub_group_shuffle(out_x_shuffle[0], 0));
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);

            if (good_of_block) {
                uint os = 0;
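                // Write strategy (informal): emit the widest sub-group block
                // writes the remaining spatial count allows, narrowing
                // 8 -> 4 -> 2 -> 1. Each DT_OUTPUT_BLOCK_WRITEn stores n * SIMD
                // elements of one feature slice, hence output_idx advances by
                // n * SIMD; 8-wide writes are used only for 1-byte outputs and
                // 4-wide up to 2-byte outputs.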
#if OUTPUT_TYPE_SIZE == 1
                for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 8; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
                    output_idx += 8 * SIMD;
                }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 4; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
                    output_idx += 4 * SIMD;
                }
#endif
                for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                    __attribute__((opencl_unroll_hint))
                    for (uint i = 0; i < 2; ++i) {
                        result_val[i] = result[ofb][os + i];
                    }
                    DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
                    output_idx += 2 * SIMD;
                }
                if (OUT_BLOCK_SPATIAL % 2 == 1) {
                    OUTPUT_TYPE result_val = result[ofb][os];
                    DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
                    output_idx += 1 * SIMD;
                }
            }
            output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
        }
    } else {
        uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
        __attribute__((opencl_unroll_hint))
        for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
            output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
        }
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
                               || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
                               || (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);

            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
                bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
                if (!good_os || !good_of_block)
                    continue;

                uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
                bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);

                if (!good_of)
                    result[ofb][os] = (OUTPUT_TYPE)0;

                output[output_idx + sglid] = result[ofb][os];
            }
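            // Tail-handling note (informal): spatial positions past the end of
            // the output are skipped entirely, while out-of-range features
            // within the last 16-wide slice are written as zeros, keeping the
            // feature-padding bytes of the b_fs_yx_fsv16 layout zero-filled.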
            __attribute__((opencl_unroll_hint))
            for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
                output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
            }
        }
    }
#undef FINAL_OUT_BLOCK_FEATURES
}

#undef AS_INPUT0_TYPE_4
#undef AS_FILTER_TYPE_4