// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/data_types.cl"

// ---------------------------------------------------------------------------------------------------------------------
// Just-in-time macro definitions:
// ---------------------------------------------------------------------------------------------------------------------
// Required JIT constants:
//  - INPUT            - [tensor] Input dimensions (batch, spatial and feature).
//  - OUTPUT           - [tensor] Output dimensions (batch, spatial and feature).
//  - STRIDE           - [tensor] Stride (spatial only). Step sizes, in the X and Y dimensions of the input,
//                       by which the position of the convolution filter moves when the next output value
//                       (one step in the X or Y dimension of the output) is computed.
//  - INPUT0_OFFSET    - [tensor] Offset of the first element: the initial input position at which the
//                       convolution filter is applied, and the corresponding output position.
//  - FP16_SUPPORTED   - [0/1] Value indicating whether the device supports the FP16 OpenCL extension (cl_khr_fp16).
//  - FP16_UNIT_USED   - [0/1] Value indicating that the current kernel should use FP16.
//  - UNIT_TYPE        - Type of a single element of input/output/weights/bias.
//  - UNIT_VAL_ZERO    - Literal of the current UNIT_TYPE that represents 0.
//  - RELU             - [0/1] Indicates that the ReLU activation function should be applied to the output.
//  - NEGATIVE_SLOPE   - [float] Factor for negative output values (required when RELU is specified).
//
//  - SUB_GROUP_SIZE   - [int] Size of the sub-group used (SIMD width).
//  - LEFTOVERS        - [int] Optional parameter, required only when the number of output feature maps (ofm)
//                       is not divisible by SUB_GROUP_SIZE; see the comment for FEATURES_THREADS_PER_BATCH
//                       for more information.
// Host-side code that supplies these JIT constants (kept here, commented out, for reference):
/*
gpu::make_jit_constant("OUTPUT_LIMIT",        output_size),
gpu::make_jit_constant("FILTER",              filter_mem.argument().size),
gpu::make_jit_constant("FILTER_ARRAY_NUM",    split),
gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH",  _kernel_data.block_width),
gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT", _kernel_data.block_height),
gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE", _kernel_data.input_block_array_size),
gpu::make_jit_constant("IN_BLOCK_WIDTH",      _kernel_data.input_block_width),
gpu::make_jit_constant("PREFETCH",            _kernel_data.prefetch),
if (_kernel_data.leftovers)
    gpu::make_jit_constant("LEFTOVERS",       _kernel_data.leftovers),
*/
// FEATURES_THREADS_PER_BATCH defines how many threads in the z-dimension process a single batch.
// Ideally, a z-dimension value of n would indicate processing of the n-th output feature. However, since
// threads are stacked in groups of SUB_GROUP_SIZE, when the number of ofm is not divisible by SUB_GROUP_SIZE,
// LEFTOVERS dummy threads are added in the z-dimension. We need to take them into account when
// calculating the batch id (see the batch_idx/feature_idx computation below). Values calculated by
// dummy threads are discarded at the LEFTOVERS check in the output phase.
#ifdef LEFTOVERS
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS)
#else
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
#endif
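
// Worked example (illustrative values, not taken from any particular topology): with
// FILTER_OFM_NUM = 24 and SUB_GROUP_SIZE = 16, the host would set LEFTOVERS = 8 so the
// z-dimension rounds up to a multiple of the sub-group size: FEATURES_THREADS_PER_BATCH
// = 24 + 8 = 32, and the last 8 threads of each batch are dummies whose results are discarded.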
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* weights,
#if BIAS_TERM
    const __global UNIT_TYPE* bias,
#endif
    uint split_idx) // TODO: removing this parameter causes a performance degradation... :)
{
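    // Implied dispatch geometry (host side, stated here only for orientation): judging by the
    // get_global_id() uses below, the global size is roughly
    //   (OUTPUT_SIZE_X / OUTPUT_BLOCK_WIDTH, OUTPUT_SIZE_Y / OUTPUT_BLOCK_HEIGHT,
    //    FEATURES_THREADS_PER_BATCH * number_of_batches), rounded up to block multiples,
    // with a local size of (1, 1, SUB_GROUP_SIZE) as required by the attributes above.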
    const uint oc  = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;  // oc = Output Column
    const uint or  = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row
    const uint fm  = get_global_id(2);                             // fm = Feature Map = od = Output Depth
    const uint lid = get_sub_group_local_id();

    uint batch_idx = fm / FEATURES_THREADS_PER_BATCH;
    uint feature_idx = fm % FEATURES_THREADS_PER_BATCH;
    uint fmg = feature_idx / SUB_GROUP_SIZE;
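
    // Worked example (using the illustrative numbers from the FEATURES_THREADS_PER_BATCH comment):
    // with FEATURES_THREADS_PER_BATCH = 32 and fm = 35, this thread belongs to batch_idx = 1,
    // computes feature_idx = 3, and sits in feature-map group fmg = 0 (sub-group 0 of that batch).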
    UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
    UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
    UNIT_TYPE w[PREFETCH];
    uint in_addr;
    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
    for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) {
        out[i] = UNIT_VAL_ZERO;
    }
    uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
    in_addr = batch_idx * INPUT0_BATCH_PITCH;
    in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + (or * STRIDE_SIZE_Y * INPUT0_Y_PITCH) + (oc * STRIDE_SIZE_X + lid) * INPUT0_X_PITCH;
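
    // Note on addressing (bfyx layout): a flat offset is composed as
    //   b * INPUT0_BATCH_PITCH + f * INPUT0_FEATURE_PITCH + y * INPUT0_Y_PITCH + x * INPUT0_X_PITCH,
    // so the lines above position this sub-group at the top-left input pixel of its output block,
    // with each lane (lid) shifted by one pixel in X.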
    for(int kd = 0; kd < FILTER_IFM_NUM; kd++)  // e.g. FILTER_IFM_NUM = 3 for an RGB input
    {
        uint tmp_in_addr = in_addr;
#if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0
        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
            // Horizontal position in input block after read.
            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;

            in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];

            // If we hit a row break, move to the next row.
            if (in_block_next_x_pos == IN_BLOCK_WIDTH)
                tmp_in_addr += INPUT0_Y_PITCH;
        }
#elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0
        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
            // Horizontal position in input block after read.
            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;

            if (in_block_next_x_pos <= IN_BLOCK_WIDTH) {
                in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];

                // If we hit a row break, move to the next row.
                if (in_block_next_x_pos == IN_BLOCK_WIDTH)
                    tmp_in_addr += INPUT0_Y_PITCH;
            }
            else {
                // TODO: Generalize this step to relax the IN_BLOCK_WIDTH restrictions.
                // Position in the sub-group at which the new row needs to be read.
                const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH;

                if (lid < sg_br_pos)
                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];
                // We have a row break inside the sub-group; move to the next line.
                tmp_in_addr += INPUT0_Y_PITCH;
                if (lid >= sg_br_pos)
                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - (sg_br_pos * INPUT0_X_PITCH)];

                // If we hit another row break, move to the next row.
                if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH)
                    tmp_in_addr += INPUT0_Y_PITCH;
            }
        }
#else
    #error IN_BLOCK_WIDTH must be a multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented.
#endif
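
        // Illustration (assumed values, not fixed by this kernel): with SUB_GROUP_SIZE = 16 and
        // IN_BLOCK_WIDTH = 32, each loop iteration reads 16 consecutive pixels (one per lane), so a
        // full input row takes exactly two iterations and row breaks land on iteration boundaries
        // (first branch). With IN_BLOCK_WIDTH = 8, half of every sub-group read already belongs to
        // the next row, which is the case the #elif branch handles.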
        // Move to the next input feature map.
        in_addr += INPUT0_FEATURE_PITCH;

        for(int pf=0; pf<PREFETCH; pf++) {
            w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
        }
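
        // w[] acts as a small rolling prefetch window: it is pre-filled with the first PREFETCH
        // weights here, and inside the unrolled loops below each consumed entry (w[wi % PREFETCH])
        // is immediately refilled with the next weight, hiding load latency behind the MADs.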
        uint wi = 0;
        uint kr = 0; // kr = Kernel Row
        LOOP(FILTER_SIZE_Y, kr,  // LOOP is a macro that unrolls the loop.
        {
            uint kc = 0; // kc = Kernel Column
            LOOP(FILTER_SIZE_X, kc,
            {
                for(uint br=0; br<OUTPUT_BLOCK_HEIGHT; br++) {
                    for(uint bc=0; bc<OUTPUT_BLOCK_WIDTH; bc++) {
#if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
                        // If we fix the programming model, then we could use a nice simple 2D array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
                        UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
                                                                 (((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
#else
                        UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
#endif
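                        // intel_sub_group_shuffle(v, c) returns v as held by lane c of the sub-group,
                        // letting every work-item read pixels loaded by other lanes: the pixel at flat
                        // block position p lives in register in[p / SUB_GROUP_SIZE] of lane
                        // p % SUB_GROUP_SIZE, which is exactly the index pair computed above.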
                        out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);
                    }
                }
                w[wi % PREFETCH] = weights[weight_addr];
                weight_addr += SUB_GROUP_SIZE; // Weights must be stored in just the right SIMD-swizzled format for this to work; see host code for details.
                wi++;
            });
        });
        // The address went beyond the end due to prefetch, so move it back to the correct location.
        weight_addr -= PREFETCH * SUB_GROUP_SIZE;
    }
    uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
    uint out_addr = OUTPUT_OFFSET;
    out_addr += batch_idx * OUTPUT_BATCH_PITCH;
    out_addr += out_split_offset + feature_idx * OUTPUT_FEATURE_PITCH; // out_addr indexes the start of the 16 feature maps handled by this sub-group.
    out_addr += or * OUTPUT_Y_PITCH + oc; // Offset of the OUTPUT_BLOCK_WIDTH x OUTPUT_BLOCK_HEIGHT block this work-item is working on.
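
    // Each lane of the sub-group carries its own fm, hence its own feature_idx, so the stores below
    // scatter across 16 adjacent feature maps at once while keeping the data within each single map
    // contiguous (see the comments in the output loops).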
#if BIAS_TERM
    for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
        for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
#if BIAS_PER_OUTPUT
            const unsigned bias_index = feature_idx*OUTPUT_SIZE_X*OUTPUT_SIZE_Y + (or + r)*OUTPUT_SIZE_X + (oc + c);
#else
            const unsigned bias_index = feature_idx;
#endif
            out[r * OUTPUT_BLOCK_WIDTH + c] += bias[bias_index];
        }
    }
#endif
    for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
        for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
            out[r * OUTPUT_BLOCK_WIDTH + c] = ACTIVATION(out[r * OUTPUT_BLOCK_WIDTH + c], NL_M, NL_N);
        }
    }
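
    // ACTIVATION is supplied via the JIT constants; with RELU enabled it amounts to
    // out = (out >= 0) ? out : NEGATIVE_SLOPE * out (see the JIT constant list at the top of this file).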
    //--------------------------------------------------------------------
    // output phase
    //--------------------------------------------------------------------

#ifdef LEFTOVERS
    // Discard the values computed by the LEFTOVERS dummy threads (see the FEATURES_THREADS_PER_BATCH comment above).
    if (feature_idx < OUTPUT_FEATURE_NUM)
#endif
    for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
        if (or + r < OUTPUT_SIZE_Y)
        {
#if (OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH) == 0 // In this case we don't need to check if we're outside the X boundaries.
            uint out_vstore_offset = 0;
#if (OUTPUT_BLOCK_WIDTH % 8) > 3
            MAKE_VECTOR_TYPE(UNIT_TYPE, 4) tmp = MAKE_VECTOR_TYPE(UNIT_TYPE, 4)(
                out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH],
                out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH],
                out[out_vstore_offset + 2 + r * OUTPUT_BLOCK_WIDTH],
                out[out_vstore_offset + 3 + r * OUTPUT_BLOCK_WIDTH]
            );

            vstore4(tmp, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 4;
#endif
#if (OUTPUT_BLOCK_WIDTH % 4) > 1
            MAKE_VECTOR_TYPE(UNIT_TYPE, 2) tmp2 = MAKE_VECTOR_TYPE(UNIT_TYPE, 2)(
                out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH],
                out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH]
            );

            vstore2(tmp2, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 2;
#endif
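            // Example of the store decomposition (illustrative): for OUTPUT_BLOCK_WIDTH = 7, each row
            // is written as one vstore4 (7 % 8 = 7 > 3), then one vstore2 (7 % 4 = 3 > 1), and one
            // scalar store in the loop below, covering all 7 elements.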
            for(uint c = out_vstore_offset; c < OUTPUT_BLOCK_WIDTH; c++) {
                // This is a scattered write across 16 different feature maps, so that the data within one map stays contiguous and thus ready as input for the next layer.
                output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
            }
#else
            for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
                // This is a scattered write across 16 different feature maps, so that the data within one map stays contiguous and thus ready as input for the next layer.
                if (oc + c < OUTPUT_SIZE_X)
                    output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
            }
#endif
        }
    }
}

#undef FEATURES_THREADS_PER_BATCH