1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/common.cl"
17 #include "include/data_types.cl"
18 #include "include/fetch.cl"
19 #include "include/mmad.cl"
// Number of 4-deep input-feature slices (FILTER_IFM_NUM rounded up to a multiple of 4);
// matches the k*4 channel step used when indexing input and weights below.
#define FILTER_IFM_SLICES ((FILTER_IFM_NUM + 3) /4)
// Number of 8-wide X slices of the filter (FILTER_SIZE_X rounded up to a multiple of 8);
// matches the i*8 column step of the sub-group block reads below.
#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8)

// Output rows computed per work-item in one pass (the h loops below).
#define OUT_BLOCK_HEIGHT 4
#define WEIGHTS_PER_WORKITEM 4 // currently needs to be set to 4, check output stage and float4 on quantizations etc.
// QUANTIZATION emits one quantized output byte for weight-slot w, output row h,
// accumulator element i (used inside the w-loop of the output stage).
// NOTE(review): the #elif/#else/#endif lines separating these three variants were
// dropped in this excerpt (the embedded original line numbers 29..42 show the gaps);
// structure restored below — confirm against the upstream file.
#ifdef LIGHTWEIGHT_QUANTIZATION

// Lightweight path: single compile-time SCALE plus per-feature bias, saturating cast.
#define QUANTIZATION \
    out[w] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * SCALE + bias_f[w]);

#elif NO_QUANTIZATION

// No quantization: saturating cast of the raw int accumulator.
#define QUANTIZATION \
    out[w] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT + h][i]);

#else

// Full calibrated path: per-feature quantization factor, input quant factor I_QF,
// bias, per-feature calibration, then activation.
#define QUANTIZATION \
    out[w] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), NL_M, NL_N));

#endif
// Int8 convolution over byx8_f4 input producing fs_bs_yx_bsv4_fsv32 output.
// Each sub-group (size 8) computes an 8-wide x OUT_BLOCK_HEIGHT-tall output tile
// for WEIGHTS_PER_WORKITEM * 8 output features, accumulating with MMAD_8x8.
// NOTE(review): structural lines (braces, #if CALIBRATION_TERM, uint split_idx),
// the uchar4 out / QUANTIZATION statements and the WEIGHTS_PER_WORKITEM #else
// branch) were dropped in this excerpt; restored here guided by the gaps in the
// embedded original line numbers — confirm against the upstream file.
__attribute__((intel_reqd_sub_group_size(8)))
KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
    __global BIAS_TYPE* biases,
    __global float* quantizations,
#if CALIBRATION_TERM
    __global float* calibrations,
#endif
    uint split_idx)
{
    // Output tile origin: 8 columns per group in x, OUT_BLOCK_HEIGHT rows in y.
    const uint x = get_group_id(1) * 8;
    const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT;

    // Group dim 0 enumerates (batch, feature-block) pairs.
    const uint f = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM ) % OUTPUT_FEATURE_NUM;
    const uint b = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM) / OUTPUT_FEATURE_NUM;

    // Accumulators: one int8 vector per (weight slot, output row).
    int8 dotProd[OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };

    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // NOTE(review): these offsets are not used below (indexing goes through the
    // GET_DATA_*/GET_FILTER_* macros); kept for parity with the original.
    const uint filter_offset = f*FILTER_OFM_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET;

    // Loop over 4-channel input slices and the filter window.
    for (uint k = 0; k < FILTER_IFM_SLICES; ++k)
    {
        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;

            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES)))
            for(uint i = 0; i < FILTER_SIZE_X_SLICES; i++)
            {
                int8 act_reg[OUT_BLOCK_HEIGHT]; // activations for MMAD

                // preload spatial data
                __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                {
                    uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8);
                    int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx)));
                    int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8)));

                    // Build the 8 strided activation columns by shuffling across lanes.
                    act_reg[h][0] = _input_data_01[0];
                    act_reg[h][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1);
                    act_reg[h][2] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2);
                    act_reg[h][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3);
                    act_reg[h][4] = _input_data_01[1];
                    act_reg[h][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1);
                    act_reg[h][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2);
                    act_reg[h][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3);
                }

                __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
                for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights
                {
                    uint filter_idx = GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(FILTER, f + w * 8, k * 4, j, i * 8);
                    int8 _w = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx)));

                    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                    for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                    {
                        // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI
                        dotProd[w*OUT_BLOCK_HEIGHT + h] = MMAD_8x8(act_reg[h], _w, dotProd[w*OUT_BLOCK_HEIGHT + h]);
                    }
                }
            }
        }
    }

    // Output stage: per-feature quantization/bias (and calibration) factors.
    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + f) ));
    float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + f) ));
#if CALIBRATION_TERM
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + f) ));
#endif

    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
    for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
    {
        const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f + get_sub_group_local_id(), y + h, x);

        __attribute__((opencl_unroll_hint(8)))
        for(uint i = 0; i < 8; i++)
        {
#if WEIGHTS_PER_WORKITEM == 4
            // Fast path: pack 4 quantized bytes and store with one sub-group block write.
            uchar4 out;
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
                QUANTIZATION;
            }
            intel_sub_group_block_write_uc4((__global uchar*)(output + dst_index + 32 * 4 * i), out);
#else
            // Generic path: quantize and store each weight slot individually.
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
#if CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]);
#else // CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF);
#endif // CALIBRATION_TERM
                output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), NL_M, NL_N);
            }
#endif
        }
    }
}
// Undefine file-local helper macros so they don't leak into other kernels
// compiled into the same OpenCL program.
#undef OUT_BLOCK_HEIGHT
#undef WEIGHTS_PER_WORKITEM

#undef FILTER_SIZE_X_SLICES
#undef FILTER_IFM_SLICES