1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/include_all.cl"
16 #include "include/sub_group.cl"
18 __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1)))
// Convolution kernel for yxfb input layout with yxio filter layout, batch 1,
// computing X_PER_WORK_ITEM consecutive output x-positions per work item and
// OFM_PER_WORK_ITEM output feature maps per work item via sub-group block I/O.
// NOTE(review): this chunk is missing structural lines compared to the original
// file (braces, #if/#elif/#endif guards, the kernel's closing brace and the
// trailing #undef section). Comments below describe only what the visible
// lines establish; anything depending on the stripped lines is hedged.
19 KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
20     const __global float* input,
21     __global float* output,
22     const __global float* filter,
// NOTE(review): the parameter list is cut off here — the original presumably
// also takes a split_idx parameter (used below) before the opening brace.
24     const __global float* bias,
// The four VECTOR_FLOAT/BLOCK_READ/BLOCK_WRITE definition groups below are
// presumably mutually exclusive alternatives selected by a preprocessor
// conditional (e.g. on OFM_PER_WORK_ITEM or USE_VECTOR) that was stripped
// from this chunk — 8-, 4-, 2- and 1-wide sub-group block read/write forms.
29 #define VECTOR_FLOAT float8
30 #define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
31 #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
34 #define VECTOR_FLOAT float4
35 #define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
36 #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
39 #define VECTOR_FLOAT float2
40 #define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
41 #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
44 #define VECTOR_FLOAT float
45 #define BLOCK_READ(IN) as_float(intel_sub_group_block_read((const __global uint*)IN))
46 #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write((__global uint*)OUT, as_uint(DATA));
49     const uint batch_num = INPUT0_BATCH_NUM;
// Linear output position: group_id(1) walks x in steps of X_PER_WORK_ITEM,
// group_id(2) walks y (scaled by the output row width).
50     const uint linear_id_xy = (uint)get_group_id(1) * X_PER_WORK_ITEM + OUTPUT_SIZE_X * (uint)get_group_id(2);
// Flattened work id combining the ofm-block index (from group_id(0)) with the
// spatial position and split index; used to derive output offsets below.
51     uint global_id = (((uint)get_group_id(0) * LOCAL_WORK_GROUP_SIZE) / batch_num) * batch_num + ( linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num;
53     const uint out_batch_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
54     const uint out_x = (uint)get_group_id(1) * X_PER_WORK_ITEM;
55     const uint out_y = get_group_id(2);
// Precompute one output buffer offset per x-position handled by this work item.
57     uint out_id[X_PER_WORK_ITEM];
58     for(uint i = 0; i < X_PER_WORK_ITEM; i++)
60         out_id[i] = OUTPUT_OFFSET + ( (global_id + i * FILTER_ARRAY_NUM * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * INPUT0_BATCH_NUM) / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id;
// First output feature map handled by this work item (wraps within FILTER_OFM_NUM).
63     const uint ofm_offset = (global_id * (OFM_PER_WORK_ITEM / batch_num)) % FILTER_OFM_NUM;
65     const uint sub_group_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
// Per-x-position accumulators for the final results. NOTE(review): their
// initialization (presumably to zero or UNIT_VAL_ZERO) is on stripped lines.
67     VECTOR_FLOAT _data[X_PER_WORK_ITEM];
68     for(uint i = 0; i < X_PER_WORK_ITEM; i++)
// Top-left input coordinate for the first output x of this work item,
// shifted by stride and padding (may be negative — handled by the zero_* masks).
73     const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
74     const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
// Loop over the filter's spatial footprint.
76     for (uint i = 0; i < FILTER_SIZE_Y; i++)
78         const int input_offset_y = y + i * DILATION_SIZE_Y;
// True when this filter row falls outside the input (padding region).
79         const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
83         for (uint j = 0; j < FILTER_SIZE_X; j++)
85             const int input_offset_x = x + j * DILATION_SIZE_X;
// Out-of-bounds mask per x-position; each of the X_PER_WORK_ITEM outputs reads
// the input STRIDE_SIZE_X columns apart. NOTE(review): where zero_x/zero_y are
// consumed (masking or skipping the accumulation) is on stripped lines.
87             bool zero_x[X_PER_WORK_ITEM];
88             for(int z = 0; z < X_PER_WORK_ITEM; z++)
90                 zero_x[z] = (input_offset_x + z * STRIDE_SIZE_X) >= INPUT0_SIZE_X || (input_offset_x + z * STRIDE_SIZE_X) < 0;
// Per-tap partial accumulators; initialization is on stripped lines.
93             VECTOR_FLOAT _tmp[X_PER_WORK_ITEM];
94             for(uint t = 0; t < X_PER_WORK_ITEM; t++)
// Base offsets into input (by spatial position, split slice and batch lane)
// and filter (by ofm lane and spatial tap) for this (i, j) filter tap.
99             uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
100             input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
101             input_idx += out_batch_id;
103             uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;
// Fast path: process input feature maps 8 at a time using sub-group block
// reads plus an 8-lane transpose, with the inner product manually unrolled.
105 #if FILTER_IFM_NUM >= 8
106             for(uint h = 0; h < FILTER_IFM_NUM / 8; h++)
// One block-read per x-position; each sub-group lane holds one input feature.
108                 float _in[X_PER_WORK_ITEM];
109                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
111                     _in[a] = as_float(intel_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
// Transpose across the sub-group so each work item sees 8 consecutive
// input-feature values for its x-position.
113                 float8 _input[X_PER_WORK_ITEM];
114                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
116                     _input[a] = TRANSPOSE_BLOCK_8(_in[a]);
// Unrolled multiply-accumulate over the 8 transposed input features;
// each step block-reads the next ifm row of the filter (OFM-major layout).
119                 VECTOR_FLOAT _filter;
120                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
121                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
123                     _tmp[a] = mad(_input[a].s0, _filter, _tmp[a]);
126                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
127                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
129                     _tmp[a] = mad(_input[a].s1, _filter, _tmp[a]);
132                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
133                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
135                     _tmp[a] = mad(_input[a].s2, _filter, _tmp[a]);
138                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
139                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
141                     _tmp[a] = mad(_input[a].s3, _filter, _tmp[a]);
145                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
146                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
148                     _tmp[a] = mad(_input[a].s4, _filter, _tmp[a]);
151                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
152                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
154                     _tmp[a] = mad(_input[a].s5, _filter, _tmp[a]);
157                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
158                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
160                     _tmp[a] = mad(_input[a].s6, _filter, _tmp[a]);
163                 _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
164                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
166                     _tmp[a] = mad(_input[a].s7, _filter, _tmp[a]);
// Advance to the next group of 8 input feature maps.
169                 input_idx += 8 * INPUT0_FEATURE_PITCH;
// Scalar tail over the remaining ifms. NOTE(review): the two loop headers
// below are presumably alternative branches of a stripped #if/#else — the
// first handles the FILTER_IFM_NUM % 8 remainder after the fast path, the
// second is the full scalar loop used when FILTER_IFM_NUM < 8.
171             for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++)
173             for (uint h = 0; h < FILTER_IFM_NUM; h++)
176                 VECTOR_FLOAT _filter = BLOCK_READ(filter + filter_idx);
177                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
179                     _tmp[a] = mad(input[input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X], _filter, _tmp[a]);
181                 filter_idx += FILTER_IFM_PITCH;
182                 input_idx += INPUT0_FEATURE_PITCH;
// Fold this tap's partial sums into the running totals. NOTE(review): the
// accumulation statement itself (and any zero_x/zero_y masking) is on
// stripped lines following this loop header.
184             for(uint a = 0; a < X_PER_WORK_ITEM; a++)
// Add the per-ofm bias (same bias vector for every x-position).
194     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
196         _data[a] += BLOCK_READ(bias + ofm_offset);
// Apply the configured activation function.
199     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
201         _data[a] = ACTIVATION(_data[a], NL_M, NL_N);
// Write results. The first x-position is always valid; the remaining ones are
// guarded against running past the output row's right edge.
204     BLOCK_WRITE(output + out_id[0], _data[0]);
205     for(uint a = 1; a < X_PER_WORK_ITEM; a++)
207         if(out_x + a < OUTPUT_SIZE_X)
209             BLOCK_WRITE(output + out_id[a], _data[a]);
// NOTE(review): chunk ends mid-definition — the kernel's closing brace and the
// trailing #undef block guarded by this conditional are not visible here.
213 #if defined(USE_VECTOR)