inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"

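// convolution_gpu_bfyx_1x1_opt: 1x1 convolution over bfyx-layout tensors.
// Each work-group computes an OUT_BLOCK_WIDTH x OUT_BLOCK_HEIGHT spatial tile for
// SIMD_SIZE * OUT_BLOCK_DEPTH output features. The work-group is assumed to be
// launched with two sub-groups; each sub-group accumulates over half of the input
// feature maps, and the two partial results are combined through local memory (SLM)
// before activation and the final store.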
#define SIMD_SIZE 8
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
KERNEL(convolution)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
    uint split_idx)
{
    const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
    const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
    const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
    const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;

    const uint ifm_part = get_sub_group_id();
    uint ifm_offset = ifm_part * OUT_BLOCK_DEPTH/2;

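    // Each sub-group accumulates over its own half of the input feature maps.
    // dotProd0 holds the partial sums for the first OUT_BLOCK_DEPTH/2 output-feature
    // sub-blocks, dotProd1 for the second half; each entry covers one (x, y, depth)
    // position of the output tile.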
    UNIT_TYPE in[OUT_BLOCK_HEIGHT];
    UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
    UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];

    for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
    {
        dotProd0[i] = 0;
        dotProd1[i] = 0;
    }

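    // For OUT_BLOCK_DEPTH of 8/4/2 the specialised offsets assume the host has
    // reordered the weights so that each input channel stores
    // SIMD_SIZE * OUT_BLOCK_DEPTH consecutive output-feature weights, which lets
    // the main loop fetch them with a single sub-group block read.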
#if OUT_BLOCK_DEPTH == 8
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
#elif OUT_BLOCK_DEPTH == 4
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
#elif OUT_BLOCK_DEPTH == 2
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
#else
    const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
#endif
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;

    //--------------------------------------------------------------------
    // main computation phase
    //--------------------------------------------------------------------

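    // For every input channel in this sub-group's half: each work-item loads one
    // input value per output row (work-items cover consecutive x positions), the
    // per-channel weights are fetched with a sub-group block read, and
    // intel_sub_group_shuffle broadcasts each input column so that every work-item
    // accumulates all OUT_BLOCK_WIDTH columns for its own output feature.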
    for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
    {
        for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
        {
            const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
            in[i] = input[in_offset];
        }

#if OUT_BLOCK_DEPTH == 8
        float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
#elif OUT_BLOCK_DEPTH == 4
        float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
#elif OUT_BLOCK_DEPTH == 2
        float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
#endif

        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
            {
                float _in = intel_sub_group_shuffle(in[br], bc);
                for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
                {
                    dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
                    dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
                }
            }
        }
    }

    __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
    __local float* slm_p = &slm_vals[0];
    //--------------------------------------------------------------------
    // task of the second sub-group in the work-group
    //--------------------------------------------------------------------

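    // The second sub-group publishes its partial sums for the first half of the
    // output-depth block to SLM and moves its second-half partials into dotProd0,
    // so after the barrier each sub-group finalises only the half selected by
    // ifm_offset.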
    if(ifm_part == 1)
    {
        for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
        {
            for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
            {
                for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
                {
                    slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * bd))] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                    dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                }
            }
        }
    }

    //--------------------------------------------------------------------
    // task of the first sub-group in the work-group
    //--------------------------------------------------------------------

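    // The first sub-group publishes its partial sums for the second half of the
    // output-depth block to SLM (using vstore4 for the first four columns when the
    // tile is wide enough) and keeps the first half in dotProd0.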
    if(ifm_part == 0)
    {
        for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
        {
            for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
            {
                uint width_offset = 0;
                #if (OUT_BLOCK_WIDTH) >= 4
                const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + OUT_BLOCK_DEPTH/2) ));
                float4 tmp = (float4)(dotProd1[width_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
                vstore4(tmp, 0, slm_p + slm_off);
                width_offset += 4;
                #endif
                for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
                {
                    slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) ))] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                }
            }
        }
    }

    //--------------------------------------------------------------------
    // add bias phase
    //--------------------------------------------------------------------

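    // Each sub-group adds the per-output-feature bias only to the output-depth half
    // it will write ((bd + ifm_offset) selects the bias), before waiting on the
    // barrier for the SLM exchange.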
    #if BIAS_TERM
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
            {
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
            }
        }
    }
    #endif

    barrier(CLK_LOCAL_MEM_FENCE); // the barrier is placed after the bias addition so that the latency of the long SLM writes is hidden behind it

    //--------------------------------------------------------------------
    // sum sub-group results + activation phase
    //--------------------------------------------------------------------

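    // Each sub-group reads the other sub-group's partial sums for its half of the
    // output depth from SLM (offset bd + ifm_offset), adds them to its own, and
    // applies the ACTIVATION macro.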
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            uint width_offset = 0;
            #if (OUT_BLOCK_WIDTH) >= 4
            const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ));
            float4 tmp = vload4(0, slm_p + slm_off);
            dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[0];
            dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[1];
            dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[2];
            dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[3];

            dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);

            width_offset += 4;
            #endif

            for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
            {
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ))];
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            }
        }
    }

    //--------------------------------------------------------------------
    // output phase
    //--------------------------------------------------------------------

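    // Each work-item stores its OUT_BLOCK_WIDTH results per row at output feature
    // f + (bd + ifm_offset) * SIMD_SIZE + local id, using vstore8/vstore4/vstore2
    // for the widest aligned chunks and scalar stores for any remainder.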
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
            uint out_vstore_offset = 0;
            #if (OUT_BLOCK_WIDTH >= 8)
            float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 8;
            #endif
            #if (OUT_BLOCK_WIDTH % 8) > 3
            // distinct name from the float8 tmp above so both branches can be enabled together
            float4 tmp4 = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore4(tmp4, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 4;
            #endif
            #if (OUT_BLOCK_WIDTH % 4) > 1
            float2 tmp2 = (float2)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore2(tmp2, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 2;
            #endif
            //dst_index += 4 * OUTPUT_X_PITCH;
            for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
            {
                output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
            }
        }
    }
}