Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl
1 // Copyright (c) 2016-2017 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "include/include_all.cl"
16
// Sub-group width required by this kernel (enforced through
// intel_reqd_sub_group_size below); it is also used as the per-lane stride in
// the bias, shared-local-memory and output-feature indexing.
17 #define SIMD_SIZE 8
//
// Fused convolution + element-wise kernel.
//
// The accumulation loop iterates over input features only (there is no loop
// over filter x/y positions), i.e. one weight per (input feature, output
// feature) pair is applied to each input position.
//
// Work split visible in this body:
//   * get_group_id(0) / get_group_id(1) select an output tile of
//     OUT_BLOCK_WIDTH x OUT_BLOCK_HEIGHT positions.
//   * get_group_id(2) selects SIMD_SIZE * OUT_BLOCK_DEPTH consecutive output
//     features (and, via division by OUTPUT_FEATURE_NUM, the batch).
//   * get_sub_group_id() (ifm_part) selects which half of the
//     FILTER_IFM_NUM input features a sub-group accumulates. Partial sums are
//     exchanged through __local memory, after which each sub-group finalizes
//     half of the OUT_BLOCK_DEPTH output-feature slices.
//   NOTE(review): only sub-group ids 0 and 1 are handled below, so the kernel
//   presumably expects exactly two sub-groups per work-group - confirm against
//   the host-side dispatch.
//
// Parameters:
//   input     - convolution source (read only in this body).
//   output    - destination buffer; when IN_OUT_OPT == 1 it is also read and
//               used as the second operand of the element-wise addition.
//   weights   - filter data, fetched with sub-group block reads.
//   biases    - present only when BIAS_TERM is set; one value per output feature.
//   split_idx - not referenced anywhere in this kernel body.
//   src3      - second operand of the element-wise addition; read only when
//               IN_OUT_OPT != 1.
//
// All upper-case identifiers other than SIMD_SIZE (OUT_BLOCK_*, INPUT0_*,
// FILTER_*, OUTPUT_*, ELTW_*, ACTIVATION*, GET_DATA_INDEX, ...) are defined
// outside this file.
18 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
19 KERNEL(fused_conv_eltwise_gpu_bfyx_1x1_opt)(
20     __global INPUT0_TYPE* input, 
21     __global OUTPUT_TYPE* output, 
22     __global FILTER_TYPE* weights, 
23 #if BIAS_TERM
24     __global BIAS_TYPE* biases,
25 #endif
26     uint split_idx,
27     const __global float* src3)
28 {
       // x / y coordinate of the first output position of this work-group's tile.
29    const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
30     const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
       // First output feature and batch index, both decoded from group id 2.
31     const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
32     const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;;
33
       // ifm_part: which half of the input features this sub-group accumulates.
       // ifm_offset: first output-feature slice (in units of SIMD_SIZE features)
       // that this sub-group finalizes and writes: 0 for sub-group 0,
       // OUT_BLOCK_DEPTH/2 for sub-group 1.
34     const uint ifm_part = get_sub_group_id();
35     uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2;
36
       // in[]: one input value per tile row, per work-item.
       // dotProd0 / dotProd1: accumulators for the first / second half of the
       // OUT_BLOCK_DEPTH slices, indexed as bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd).
37     UNIT_TYPE in[OUT_BLOCK_HEIGHT];
38     UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
39     UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
40
41     for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
42     {
43         dotProd0[i] = 0;
44         dotProd1[i] = 0;
45     }
46
       // Start offset into the weights for this work-group / sub-group.
       // The constants 64 / 32 / 16 equal SIMD_SIZE * OUT_BLOCK_DEPTH for depth
       // 8 / 4 / 2, matching the per-input-feature step (k * 64 / 32 / 16) used
       // by the block reads in the main loop; the second sub-group starts
       // FILTER_IFM_NUM/2 such steps further in.
       // NOTE(review): presumably the weights are pre-arranged in blocks of
       // SIMD_SIZE * OUT_BLOCK_DEPTH values per input feature - confirm against
       // the weight layout selected on the host.
       // NOTE(review): the generic #else branch below has no matching weight
       // load in the main loop (there is no #else there), so `w` would be
       // undeclared for any OUT_BLOCK_DEPTH other than 8, 4 or 2.
47 #if OUT_BLOCK_DEPTH == 8
48     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
49 #elif OUT_BLOCK_DEPTH == 4
50     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
51 #elif OUT_BLOCK_DEPTH == 2
52     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
53 #else
54     const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
55 #endif
       // Offset of the tile's first input element for this sub-group's first
       // input feature (the second sub-group starts FILTER_IFM_NUM/2 features in).
56     const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;
57
58     //--------------------------------------------------------------------
59     // main computation phase
60     //--------------------------------------------------------------------
61
       // Each sub-group walks over its half of the input features.
       // NOTE(review): FILTER_IFM_NUM/2 truncates, so an odd FILTER_IFM_NUM would
       // leave one input feature unprocessed - presumably excluded by the
       // kernel selector; confirm.
62     for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
63     {
           // Every work-item loads, for each tile row, the input element whose
           // offset along the row equals its sub-group local id (added
           // directly to the element offset, without INPUT0_X_PITCH scaling).
64         for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
65         {
66             const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
67             in[i] = input[in_offset];
68         }
69
           // Sub-group block read of the weights for input feature k; each
           // work-item ends up with OUT_BLOCK_DEPTH values in w, reinterpreted
           // from uint to float.
70 #if OUT_BLOCK_DEPTH == 8
71         float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
72 #elif OUT_BLOCK_DEPTH == 4
73         float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
74 #elif OUT_BLOCK_DEPTH == 2
75         float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
76 #endif
77
78         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
79         {
80             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
81             {
                   // Fetch the input value loaded by work-item bc, i.e. the
                   // element at tile column bc of row br.
82                 float _in = intel_sub_group_shuffle(in[br], bc);
83                 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
84                 {
                       // Lower half of w feeds dotProd0, upper half feeds dotProd1.
85                     dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
86                     dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
87                 }
88             }
89         }
90     }
91
       // __local exchange buffer: one float per (tile position, depth slice,
       // sub-group lane), used to hand partial sums to the other sub-group.
92     __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
93
94     //--------------------------------------------------------------------
95     // second sub_group in workgroup task
96     //--------------------------------------------------------------------
97     
       // Sub-group 1 publishes its partial sums for the first-half slices
       // (slots [0, OUT_BLOCK_DEPTH/2)) and then moves its second-half partial
       // sums into dotProd0, so that from here on dotProd0 holds the slices
       // this sub-group will finalize.
98     if(ifm_part == 1)
99     {
100         for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
101         {
102             for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
103             {
104                 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
105                 {
106                     slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)) + get_sub_group_local_id()] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
107                     dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
108                 }
109             }
110         }
111
112     }
113
114     //--------------------------------------------------------------------
115     // first sub_group in workgroup task
116     //--------------------------------------------------------------------
117     
        // Sub-group 0 publishes its partial sums for the second-half slices
        // (slots [OUT_BLOCK_DEPTH/2, OUT_BLOCK_DEPTH)) and keeps dotProd0
        // (first-half slices) to finalize itself.
118     if(ifm_part == 0)
119     {
120         for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
121         {
122             for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
123             {
124                 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
125                 {
126                     slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) )) + get_sub_group_local_id()] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
127                 }
128             }
129         }
130
131     }
132
133     //--------------------------------------------------------------------
134     // add bias phase
135     //--------------------------------------------------------------------
136     
        // The bias is added to dotProd0 only, using the feature index of the
        // slice this sub-group finalizes (bd + ifm_offset), so every output
        // value receives its bias exactly once.
137     #if BIAS_TERM
138     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
139     {
140         float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
141         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
142         {
143             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
144             {
145                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
146             }
147         }
148     }
149     #endif
150
        // The slm_vals writes above must be complete before the other
        // sub-group reads them in the next phase.
151     barrier(CLK_LOCAL_MEM_FENCE); // we want to add barrier after biases addition so that the long slm write part latency is shadowed by it
152
153     //--------------------------------------------------------------------
154     // sum sub-group results + activation phase
155     //--------------------------------------------------------------------
156     
        // Each sub-group adds the partial sums the other sub-group published
        // for the slices it owns (slots bd + ifm_offset), then applies the
        // convolution activation.
157     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
158     {
159         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
160         {
161             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
162             {
163                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd))] += slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) )) + get_sub_group_local_id()];
164                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
165             }
166         }
167     }
168
169     //--------------------------------------------------------------------
170     // eltwise with eltwise activation phase
171     //--------------------------------------------------------------------
        // Only when the second eltwise operand comes from src3 (IN_OUT_OPT != 1):
        // add the src3 element addressed with the ELTW_STRIDE_X / ELTW_STRIDE_Y
        // scaled coordinates, then apply the eltwise activation. The
        // IN_OUT_OPT == 1 variant is handled inside the output phase instead.
172     #if IN_OUT_OPT != 1
173     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
174     {
175         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
176         {
177             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
178             {
179                 uint src3_offset = GET_DATA_INDEX(INPUT1, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), (group_y + br) * ELTW_STRIDE_Y, (group_x + bc) * ELTW_STRIDE_X);
180                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += src3[src3_offset];
181                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
182             }
183         }
184     }
185     #endif
186
187     //--------------------------------------------------------------------
188     // output phase
189     //--------------------------------------------------------------------
190
        // Each work-item writes one tile row per owned slice: first with the
        // widest vector store that fits (8, then 4, then 2 elements), then a
        // scalar loop for whatever is left. When IN_OUT_OPT == 1 the current
        // contents of output are loaded, added and passed through the eltwise
        // activation before being overwritten.
        // NOTE(review): vloadN / vstoreN access consecutive elements, so the
        // vector paths appear to assume OUTPUT_X_PITCH == 1 - confirm.
191     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
192     {
193         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
194         {
                // Index of the row's first element (x = group_x) for this
                // work-item's output feature.
195             uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
                // Number of row elements already written by the vector paths.
196             uint out_vstore_offset = 0;
197             #if (OUT_BLOCK_WIDTH >= 8)
198             {
199                 float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
200                                       dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
201                                       dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
202                                       dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
203                                       dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
204                                       dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
205                                       dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
206                                       dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
207 #if IN_OUT_OPT == 1
208                 float8 tmp2 = vload8(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
209                 tmp += tmp2;
210                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
211 #endif
212                 vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
213                 out_vstore_offset += 8;
214             }
215             #endif
                // 4 to 7 elements remain after the optional 8-wide store.
216             #if (OUT_BLOCK_WIDTH % 8) > 3
217             {
218                 float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
219                                       dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
220                                       dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
221                                       dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
222 #if IN_OUT_OPT == 1
223                 float4 tmp2 = vload4(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
224                 tmp += tmp2;
225                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
226 #endif
227                 vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
228                 out_vstore_offset += 4;
229             }
230             #endif
                // 2 or 3 elements remain after the wider stores.
231             #if (OUT_BLOCK_WIDTH % 4) > 1
232             {
233                 float2 tmp = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
234                                        dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
235 #if IN_OUT_OPT == 1
236                 float2 tmp2 = vload2(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
237                 tmp += tmp2;
238                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
239 #endif
240                 vstore2(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
241                 out_vstore_offset += 2;
242             }
243             #endif
                // Scalar tail: every element not covered by a vector store.
244             for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
245             {
246 #if IN_OUT_OPT == 1
247                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += output[dst_index + bc * OUTPUT_X_PITCH];
248                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
249 #endif                
250                 output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
251             }
252         }
253     }
254 }
254 }