Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / convolution_gpu_mmad_batched_block_1x1.cl
1 // Copyright (c) 2018 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "include/fetch.cl"
16 #include "include/mmad.cl"
17
18 #define SCALE 0.11f
19
20 #ifdef LIGHTWEIGHT_QUANTIZATION
21
22 #define QUANTIZATION \
23     uchar4 out;\
24     out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\
25     out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\
26     out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\
27     out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s3);
28
29 #elif NO_QUANTIZATION
30
31 #define QUANTIZATION \
32     uchar4 out;\
33     out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\
34     out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\
35     out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\
36     out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]);
37
38 #else
39
40 #define QUANTIZATION \
41     char4 out;\
42     out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]  * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), NL_M, NL_N);\
43     out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]  * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), NL_M, NL_N);\
44     out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]  * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), NL_M, NL_N);\
45     out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]  * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), NL_M, NL_N);
46
47 #endif
48
49 #define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
50 #define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
51 #define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
52 #define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
53 // input data is in blocks 4batch x 32 features
54
55 #define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1)
56 #define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1)
57
58 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
59 KERNEL(convolution_mmad_batched_block_1x1)(
60     __global INPUT0_TYPE* input,
61     __global OUTPUT_TYPE* output,
62     __global FILTER_TYPE* weights,
63     __global BIAS_TYPE* biases,
64     const __global float* quantizations,
65 #if CALIBRATION_TERM
66     const __global float* calibrations,
67 #endif
68     uint split_idx)
69 {
70     const uint x = get_global_id(0) * OUT_BLOCK_WIDTH;
71     const uint y = get_global_id(1) * OUT_BLOCK_HEIGHT;
72
73 #if WEIGHTS_PER_WORKITEM == 4
74     const uint f = (get_group_id(2) * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED;
75 #else
76     const uint f = ((get_group_id(2) * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED;
77 #endif
78     const uint b_block = (get_group_id(2) * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED;
79
80     int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };
81
82     const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
83     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
84
85     const uint filter_offset = ((get_group_id(2) * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
86     const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block;
87
88     uint filter_idx = filter_offset;
89     for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
90     {
91         ////// preloading input data //////
92         int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y];
93         for(int h = 0; h < NEEDED_INPUT_Y; h++)
94         {
95             for(int p = 0; p < NEEDED_INPUT_X; p++)
96             {
97                 const int input_offset_y = input_y + h;
98                 const int input_offset_x = input_x + p;
99
100                 uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH;
101                 preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
102             }
103         }
104
105         __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
106         for (uint j = 0; j < FILTER_SIZE_Y; ++j)
107         {
108             __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
109             for (uint i = 0; i < FILTER_SIZE_X; ++i)
110             {
111                 ////// preloading weights data //////
112                 int8 preloaded_weights[WEIGHTS_PER_WORKITEM];
113                 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
114                 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
115                 {
116                     preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) ));
117                 }
118
119                 ////// computing //////
120                 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
121                 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
122                 {
123                     __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
124                     for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++)
125                     {
126                         __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
127                         for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++)
128                         {
129                             const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT);
130                             const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j);
131                             dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]);
132                         }
133                     }
134                 }
135                 filter_idx += FILTER_X_PITCH;
136             }
137         }
138     }
139
140
141 #if WEIGHTS_PER_WORKITEM == 4
142
143 float4 quant_f = vload4(0, quantizations + f);
144 float4 bias_f = vload4(0, biases + f);
145 float4 calib_f = vload4(0, calibrations + f);
146 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
147 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
148 {
149     __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
150     for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
151     {
152         const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
153
154         uint4 to_output;
155         __attribute__((opencl_unroll_hint(4)))
156         for(uint b = 0; b < 4; b++)
157         {
158             const uint out_idx = o + OUT_BLOCK_WIDTH * h;
159
160             QUANTIZATION;
161             to_output[b] = as_uint(out);
162         }
163         intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output);
164     }
165 }
166
167 #else // WEIGHTS_PER_WORKITEM ==4
168
169 ////// QUANTIZE & OUTPUT //////
170 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
171 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
172 {
173     float quant_f = quantizations[f + w * 8];
174     float bias_f = biases[f + w * 8];
175 #if CALIBRATION_TERM
176     float calib_f = calibrations[f + w * 8];
177 #endif
178     __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
179     for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
180     {
181         __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
182         for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
183         {
184             const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
185             for(uint b = 0; b < 4; b++)
186             {
187             #if CALIBRATION_TERM
188                 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f);
189             #else  // CALIBRATION_TERM
190                 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF);
191             #endif // CALIBRATION_TERM
192             }
193         }
194     }
195 }
196
197 ////// OUTPUT STAGE //////
198 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
199 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
200 {
201     __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
202     for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
203     {
204         const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
205         
206         __attribute__((opencl_unroll_hint(4)))
207         for(uint b = 0; b < 4; b++)
208         {
209             #if WEIGHTS_PER_WORKITEM == 2
210                 char2 out;
211                 const uint out_idx = o + OUT_BLOCK_WIDTH * h;
212                 out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
213                 out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), NL_M, NL_N);
214
215                 intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out));
216             #else
217             __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
218             for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
219             {
220                 const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
221                 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o);
222                 char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
223                 output[dst_index + b * 32] = char_val;
224             }
225             #endif
226         }
227     }
228 }
229
230 #endif // WEIGHTS_PER_WORKITEM ==4
231
232 }
233
234 #undef FILTER_IFM_MMAD_NUM
235 #undef FILTER_OFM_MMAD_NUM
236 #undef FILTER_IFM_ALIGNED
237 #undef FILTER_OFM_ALIGNED
238
239
240 #undef SCALE
241 #undef QUANTIZATION