1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/include_all.cl"
// Convolution kernel in which each sub-group accumulates over one half of the
// input features (FILTER_IFM_NUM/2) and the halves are then combined through
// local memory before bias, activation and the final store.
// NOTE(review): the kernel name line, any further arguments, and every brace /
// #else / #endif line are not visible in this view; the comments below describe
// only the statements that are visible.
//
// Requires compilation with a sub-group size of SIMD_SIZE.
18 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
// Buffers used by the visible code: `input`, `weights` and `biases` are read,
// `output` is written.
20 __global INPUT0_TYPE* input,
21 __global OUTPUT_TYPE* output,
22 __global FILTER_TYPE* weights,
24 __global BIAS_TYPE* biases,
// x / y origin of the output block handled by this work-group.
28 const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
29 const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
// Group id 2 is a flattened (batch, feature) index advancing in steps of
// SIMD_SIZE * OUT_BLOCK_DEPTH: the remainder is the first output feature (f),
// the quotient is the batch (b).
30 const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
31 const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;;
// ifm_part selects which half of the input features this sub-group accumulates;
// ifm_offset selects which half of the depth block it finalizes and stores.
// NOTE(review): the /2 splits and the "first"/"second" banners below suggest
// exactly two sub-groups per work-group - confirm against the host-side dispatch.
33 const uint ifm_part = get_sub_group_id();
34 uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2;
// in[]      : one input value per block row, reloaded for every input feature.
// dotProd0/1: per-work-item accumulators for the lower / upper half of the
//             depth block, indexed as bc + WIDTH * (br + HEIGHT * bd).
36 UNIT_TYPE in[OUT_BLOCK_HEIGHT];
37 UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
38 UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
// Visits every accumulator slot; the loop body is not visible in this view
// (presumably zero-initialization of dotProd0/dotProd1 - TODO confirm).
40 for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
// Start of this sub-group's weights. For depth 8/4/2 the constants 64/32/16 are
// the per-input-feature stride used by the block reads in the main loop
// (k * 64 / 32 / 16), so ifm_part skips FILTER_IFM_NUM/2 input features.
46 #if OUT_BLOCK_DEPTH == 8
47 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
48 #elif OUT_BLOCK_DEPTH == 4
49 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
50 #elif OUT_BLOCK_DEPTH == 2
51 const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
// Pitch-based form of the same offset (presumably the #else branch; the
// directive itself is not visible here).
53 const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
// First input element for this block: batch b, block origin (group_x, group_y),
// and the first input feature of this sub-group's half.
55 const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;
57 //--------------------------------------------------------------------
58 // main computation phase
59 //--------------------------------------------------------------------
// Accumulate over this sub-group's half of the input features.
61 for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
// Each work-item loads one input element per block row, offset by its
// sub-group local id.
// NOTE(review): the lane id is added with unit stride, so this assumes
// INPUT0_X_PITCH == 1 for lanes to map to consecutive x positions - confirm.
63 for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
65 const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
66 in[i] = input[in_offset];
// Sub-group block read of the weights for input feature k; each work-item ends
// up with OUT_BLOCK_DEPTH weights in w.
// NOTE(review): weights is reinterpreted as uint and then as float, which
// assumes FILTER_TYPE is a 32-bit float - confirm.
69 #if OUT_BLOCK_DEPTH == 8
70 float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
71 #elif OUT_BLOCK_DEPTH == 4
72 float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
73 #elif OUT_BLOCK_DEPTH == 2
74 float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
// For every (row, column) of the block, fetch the input value held by lane bc
// and multiply-accumulate it with this work-item's own weights: w[bd] feeds
// dotProd0 and w[bd + OUT_BLOCK_DEPTH/2] feeds dotProd1.
// NOTE(review): shuffling from lane bc requires OUT_BLOCK_WIDTH <= SIMD_SIZE.
77 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
79 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
81 float _in = intel_sub_group_shuffle(in[br], bc);
82 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
84 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
85 dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
// Local-memory staging buffer used to exchange partial sums between sub-groups.
// Layout: bc + WIDTH * (lane + SIMD_SIZE * (br + HEIGHT * depth_slot)).
91 __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
92 __local float* slm_p = &slm_vals[0];
93 //--------------------------------------------------------------------
94 // second sub_group in workgroup task
95 //--------------------------------------------------------------------
// Publishes the lower-half partial sums (dotProd0) to depth slots bd, then moves
// the upper-half partial sums (dotProd1) into dotProd0 so that the later phases
// can work on dotProd0 only.
// NOTE(review): the condition selecting this sub-group is not visible here.
99 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
101 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
103 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
105 slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * bd))] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
106 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
113 //--------------------------------------------------------------------
114 // first sub_group in workgroup task
115 //--------------------------------------------------------------------
// Publishes the upper-half partial sums (dotProd1) to depth slots
// bd + OUT_BLOCK_DEPTH/2 and keeps dotProd0 as is.
// NOTE(review): the condition selecting this sub-group is not visible here.
119 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
121 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
123 uint width_offset = 0;
// When the block is at least 4 wide, the first four columns are written with a
// single 4-wide vector store.
// NOTE(review): width_offset is presumably advanced by 4 after the store (that
// line is not visible); otherwise the scalar loop below rewrites the same values.
124 #if (OUT_BLOCK_WIDTH) >= 4
125 const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + OUT_BLOCK_DEPTH/2) ));
126 float4 tmp = (float4)(dotProd1[width_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
127 dotProd1[width_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
128 dotProd1[width_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
129 dotProd1[width_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
130 vstore4(tmp, 0, slm_p + slm_off);
// Scalar store of the remaining columns, starting at width_offset.
133 for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
135 slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) ))] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
142 //--------------------------------------------------------------------
144 //--------------------------------------------------------------------
// Adds the bias to every element of each depth slice. The bias index
// f + (bd + ifm_offset) * SIMD_SIZE + lane is the same output-feature index that
// is used for the final store below.
147 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
149 float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
150 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
152 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
154 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
// Work-group barrier with a local-memory fence: the slm_vals writes above must be
// complete before they are read in the summation phase below.
160 barrier(CLK_LOCAL_MEM_FENCE); // we want to add barrier after biases addition so that the long slm write part latency is shadowed by it
162 //--------------------------------------------------------------------
163 // sum sub-group results + activation phase
164 //--------------------------------------------------------------------
// Reads the partial sums stored in depth slots bd + ifm_offset (going by the
// banners above, the slots filled by the other sub-group), adds them to
// dotProd0 and applies ACTIVATION with NL_M / NL_N to the result.
166 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
168 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
170 uint width_offset = 0;
// Vector path: the first four columns are fetched with one 4-wide load.
// NOTE(review): as in the store path, the width_offset update is not visible.
171 #if (OUT_BLOCK_WIDTH) >= 4
172 const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ));
173 float4 tmp = vload4(0, slm_p + slm_off);
174 dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[0];
175 dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[1];
176 dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[2];
177 dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[3];
179 dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
180 dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
181 dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
182 dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
// Scalar path for the remaining columns: add the stored partial sum, then activate.
187 for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
189 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ))];
190 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
195 //--------------------------------------------------------------------
197 //--------------------------------------------------------------------
// Writes the finished block to `output`. Each work-item stores one row at a time
// for its own output feature, f + (bd + ifm_offset) * SIMD_SIZE + lane.
199 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
201 for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
// Index of the first element of the row: (batch, feature, y, x).
203 uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
204 uint out_vstore_offset = 0;
// The row is written with the widest vector stores that fit: 8, then 4, then 2
// elements, followed by a scalar loop for whatever is left.
// NOTE(review): vstoreN writes consecutive elements while the address is scaled
// by OUTPUT_X_PITCH, so the vector paths assume OUTPUT_X_PITCH == 1 - confirm.
205 #if (OUT_BLOCK_WIDTH >= 8)
206 float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
207 dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
208 dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
209 dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
210 dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
211 dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
212 dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
213 dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
214 vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
215 out_vstore_offset += 8;
// NOTE(review): this branch declares `tmp` again with a different type. If it
// shares a scope with the 8-wide branch (e.g. OUT_BLOCK_WIDTH in 12..15, where
// both conditions hold), that is a redeclaration - confirm the scoping.
217 #if (OUT_BLOCK_WIDTH % 8) > 3
218 float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
219 dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
220 dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
221 dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
222 vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
223 out_vstore_offset += 4;
225 #if (OUT_BLOCK_WIDTH % 4) > 1
226 float2 tmp2 = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
227 dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
228 vstore2(tmp2, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
229 out_vstore_offset += 2;
231 //dst_index += 4 * OUTPUT_X_PITCH;
// Scalar store of the columns not covered by the vector stores above.
232 for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
234 output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];