1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/fetch.cl"
16 #include "include/mmad.cl"
20 #ifdef LIGHTWEIGHT_QUANTIZATION
22 #define QUANTIZATION \
24 out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\
25 out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\
26 out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\
27 out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s3);
31 #define QUANTIZATION \
33 out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\
34 out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\
35 out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\
36 out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]);
40 #define QUANTIZATION \
42 out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), NL_M, NL_N);\
43 out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), NL_M, NL_N);\
44 out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), NL_M, NL_N);\
45 out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), NL_M, NL_N);
49 #define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
50 #define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
51 #define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
52 #define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
53 // input data is in blocks 4batch x 32 features
55 #define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1)
56 #define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1)
58 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
59 KERNEL(convolution_mmad_batched_block_1x1)(
60 __global INPUT0_TYPE* input,
61 __global OUTPUT_TYPE* output,
62 __global FILTER_TYPE* weights,
63 __global BIAS_TYPE* biases,
64 const __global float* quantizations,
66 const __global float* calibrations,
70 const uint x = get_global_id(0) * OUT_BLOCK_WIDTH;
71 const uint y = get_global_id(1) * OUT_BLOCK_HEIGHT;
73 #if WEIGHTS_PER_WORKITEM == 4
74 const uint f = (get_group_id(2) * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED;
76 const uint f = ((get_group_id(2) * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED;
78 const uint b_block = (get_group_id(2) * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED;
80 int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };
82 const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
83 const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
85 const uint filter_offset = ((get_group_id(2) * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
86 const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block;
88 uint filter_idx = filter_offset;
89 for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
91 ////// preloading input data //////
92 int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y];
93 for(int h = 0; h < NEEDED_INPUT_Y; h++)
95 for(int p = 0; p < NEEDED_INPUT_X; p++)
97 const int input_offset_y = input_y + h;
98 const int input_offset_x = input_x + p;
100 uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH;
101 preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
105 __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
106 for (uint j = 0; j < FILTER_SIZE_Y; ++j)
108 __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
109 for (uint i = 0; i < FILTER_SIZE_X; ++i)
111 ////// preloading weights data //////
112 int8 preloaded_weights[WEIGHTS_PER_WORKITEM];
113 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
114 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
116 preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) ));
119 ////// computing //////
120 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
121 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
123 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
124 for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++)
126 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
127 for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++)
129 const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT);
130 const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j);
131 dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]);
135 filter_idx += FILTER_X_PITCH;
141 #if WEIGHTS_PER_WORKITEM == 4
143 float4 quant_f = vload4(0, quantizations + f);
144 float4 bias_f = vload4(0, biases + f);
145 float4 calib_f = vload4(0, calibrations + f);
146 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
147 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
149 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
150 for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
152 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
155 __attribute__((opencl_unroll_hint(4)))
156 for(uint b = 0; b < 4; b++)
158 const uint out_idx = o + OUT_BLOCK_WIDTH * h;
161 to_output[b] = as_uint(out);
163 intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output);
167 #else // WEIGHTS_PER_WORKITEM ==4
169 ////// QUANTIZE & OUTPUT //////
170 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
171 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
173 float quant_f = quantizations[f + w * 8];
174 float bias_f = biases[f + w * 8];
176 float calib_f = calibrations[f + w * 8];
178 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
179 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
181 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
182 for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
184 const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
185 for(uint b = 0; b < 4; b++)
188 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f);
189 #else // CALIBRATION_TERM
190 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF);
191 #endif // CALIBRATION_TERM
197 ////// OUTPUT STAGE //////
198 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
199 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
201 __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
202 for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
204 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
206 __attribute__((opencl_unroll_hint(4)))
207 for(uint b = 0; b < 4; b++)
209 #if WEIGHTS_PER_WORKITEM == 2
211 const uint out_idx = o + OUT_BLOCK_WIDTH * h;
212 out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
213 out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), NL_M, NL_N);
215 intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out));
217 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
218 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
220 const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
221 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o);
222 char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
223 output[dst_index + b * 32] = char_val;
230 #endif // WEIGHTS_PER_WORKITEM ==4
234 #undef FILTER_IFM_MMAD_NUM
235 #undef FILTER_OFM_MMAD_NUM
236 #undef FILTER_IFM_ALIGNED
237 #undef FILTER_OFM_ALIGNED