1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/include_all.cl"
18 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
19 KERNEL(fused_conv_eltwise_gpu_bfyx_1x1_opt)(
20 __global INPUT0_TYPE* input,
21 __global OUTPUT_TYPE* output,
22 __global FILTER_TYPE* weights,
24 __global BIAS_TYPE* biases,
27 const __global float* src3)
29 const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
30 const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
31 const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
32 const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;;
34 const uint ifm_part = get_sub_group_id();
35 uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2;
37 UNIT_TYPE in[OUT_BLOCK_HEIGHT];
38 UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
39 UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
41 for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
47 #if OUT_BLOCK_DEPTH == 8
48 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
49 #elif OUT_BLOCK_DEPTH == 4
50 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
51 #elif OUT_BLOCK_DEPTH == 2
52 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
54 const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
56 const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;
58 //--------------------------------------------------------------------
59 // main computation phase
60 //--------------------------------------------------------------------
62 for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
64 for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
66 const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
67 in[i] = input[in_offset];
70 #if OUT_BLOCK_DEPTH == 8
71 float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
72 #elif OUT_BLOCK_DEPTH == 4
73 float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
74 #elif OUT_BLOCK_DEPTH == 2
75 float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
78 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
80 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
82 float _in = intel_sub_group_shuffle(in[br], bc);
83 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
85 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
86 dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
92 __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
94 //--------------------------------------------------------------------
95 // second sub_group in workgroup task
96 //--------------------------------------------------------------------
100 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
102 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
104 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
106 slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)) + get_sub_group_local_id()] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
107 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
114 //--------------------------------------------------------------------
115 // first sub_group in workgroup task
116 //--------------------------------------------------------------------
120 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
122 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
124 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
126 slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) )) + get_sub_group_local_id()] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
133 //--------------------------------------------------------------------
134 // bias addition phase
135 //--------------------------------------------------------------------
138 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
140 float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
141 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
143 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
145 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
151 barrier(CLK_LOCAL_MEM_FENCE); // we want to add barrier after biases addition so that the long slm write part latency is shadowed by it
153 //--------------------------------------------------------------------
154 // sum sub-group results + activation phase
155 //--------------------------------------------------------------------
157 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
159 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
161 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
163 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) )) + get_sub_group_local_id()];
164 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
169 //--------------------------------------------------------------------
170 // eltwise with eltwise activation phase
171 //--------------------------------------------------------------------
173 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
175 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
177 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
179 uint src3_offset = GET_DATA_INDEX(INPUT1, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), (group_y + br) * ELTW_STRIDE_Y, (group_x + bc) * ELTW_STRIDE_X);
180 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += src3[src3_offset];
181 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
187 //--------------------------------------------------------------------
188 // output write phase
189 //--------------------------------------------------------------------
191 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
193 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
195 uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
196 uint out_vstore_offset = 0;
197 #if (OUT_BLOCK_WIDTH >= 8)
199 float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
200 dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
201 dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
202 dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
203 dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
204 dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
205 dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
206 dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
208 float8 tmp2 = vload8(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
210 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
212 vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
213 out_vstore_offset += 8;
216 #if (OUT_BLOCK_WIDTH % 8) > 3
218 float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
219 dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
220 dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
221 dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
223 float4 tmp2 = vload4(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
225 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
227 vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
228 out_vstore_offset += 4;
231 #if (OUT_BLOCK_WIDTH % 4) > 1
233 float2 tmp = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
234 dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
236 float2 tmp2 = vload2(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
238 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
240 vstore2(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
241 out_vstore_offset += 2;
244 for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
247 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += output[dst_index + bc * OUTPUT_X_PITCH];
248 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
250 output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];