Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / convolution_gpu_yxfb_yxio_b16_fp16.cl
1 // Copyright (c) 2016-2017 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15
16 #include "include/include_all.cl"
17 #include "include/sub_group.cl"
18
// FP16 convolution kernel (kernel name: yxfb input / yxio filter, "b16" variant).
//
// Dispatch geometry expected by this kernel:
//   - Work-group size is fixed to 16x1x1 and the sub-group size to 16 (see attributes).
//   - get_group_id(1) / get_group_id(2) select the output x / y position.
//   - Dimension 0 enumerates the work items covering all features and batches
//     of that single output (x, y) point.
//
// JIT constants used for indexing:
//   WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS -> number of work items that cover one whole batch for
//                                             OFM_PER_WORK_ITEM output features at one spatial point.
//   FILTER_ARRAY_NUM                       -> number of filter groups (split size).
//
// Arguments:
//   input     - source tensor (read only).
//   output    - destination tensor.
//   filter    - filter weights (read only).
//   bias      - per-feature bias, present only when BIAS_TERM is set.
//   split_idx - index of the filter group (split) handled by this launch.
__attribute__((intel_reqd_sub_group_size(16)))
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(convolution_gpu_yxfb_yxio_b16)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* filter,
#if BIAS_TERM
    const __global UNIT_TYPE* bias,
#endif
    uint split_idx)
{
    const uint out_x = get_group_id(1);
    const uint out_y = get_group_id(2);
    const uint lid   = get_local_id(0);

    // Number of UNIT_TYPE elements each work item receives per uint when block reads are used.
#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1)
    const uint pack_width = sizeof(uint)/sizeof(UNIT_TYPE);
#else
    const uint pack_width = 1;
#endif

    // Padded output extents and the linear (x, y) position inside the padded output plane.
    const uint padded_f  = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
    const uint padded_x  = OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X;
    const uint xy_linear = OUTPUT_PAD_BEFORE_SIZE_X + out_x + padded_x * (out_y + OUTPUT_PAD_BEFORE_SIZE_Y);

    // Flattened id combining the dimension-0 block, the spatial position and the split.
    const uint id0_block = (uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS;
    const uint f_blocks  = padded_f / OFM_PER_WORK_ITEM;
    const uint flat_id   = (id0_block + (xy_linear * FILTER_ARRAY_NUM + split_idx) * f_blocks) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS;

    // First batch element handled by this work item.
    const uint batch_base = pack_width * lid + LOCAL_WORK_GROUP_SIZE * BATCHES_PER_WORK_ITEM * ((uint)get_group_id(0) % LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS);

    // Offset of the first output element written by this work item (OUTPUT_VIEW_OFFSET is added at store time).
    const uint out_base = (flat_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * OUTPUT_FEATURE_PITCH
                        + OUTPUT_PAD_BEFORE_FEATURE_NUM * OUTPUT_FEATURE_PITCH
                        + OUTPUT_PAD_BEFORE_BATCH_NUM
                        + batch_base;

    const uint ofm_base = ((flat_id * OFM_PER_WORK_ITEM) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) % padded_f;

    // Every work item fetches a different pair of halfs (packed in one uint) from filter / bias,
    // hence the factor 2 on the local id.
    const uint pair_base = ofm_base + 2 * lid;

    // One half16 accumulator per batch element; each vector component holds a separate output feature.
    half16 acc[BATCHES_PER_WORK_ITEM];
    for (uint b = 0; b < BATCHES_PER_WORK_ITEM; ++b)
    {
        acc[b] = UNIT_VAL_ZERO;
    }

    // Input coordinates hit by the first filter tap (may be negative because of padding).
    const int in_x0 = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int in_y0 = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    for (uint ky = 0; ky < FILTER_SIZE_Y; ++ky)
    {
        const int in_y = in_y0 + ky * DILATION_SIZE_Y;
        if (in_y < 0 || in_y >= INPUT0_SIZE_Y)
        {
            // Tap row lies outside the input: it contributes nothing.
            continue;
        }

        for (uint kx = 0; kx < FILTER_SIZE_X; ++kx)
        {
            const int in_x = in_x0 + kx * DILATION_SIZE_X;
            if (in_x < 0 || in_x >= INPUT0_SIZE_X)
            {
                // Tap column lies outside the input: it contributes nothing.
                continue;
            }

            uint src = in_x*INPUT0_X_PITCH + in_y*INPUT0_Y_PITCH;
            src += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
            src += batch_base;

            uint flt = pair_base + ky*FILTER_Y_PITCH + kx*FILTER_X_PITCH;

            for (uint ifm = 0; ifm < FILTER_IFM_NUM; ++ifm)
            {
                // Each work item loads its own pair of filter values; TRANSPOSE_BLOCK_16_FP16 then
                // turns the per-work-item pairs into a half16 of filter values.
                // NOTE(review): the macro is defined in include/sub_group.cl; presumably it shuffles
                // the pairs across the sub-group - confirm there.
                uint w_pair = *(const __global uint*)(filter + flt);
                half16 w = TRANSPOSE_BLOCK_16_FP16(w_pair);

#if defined(USE_BLOCK_READ_2)
                const half4 in4 = as_half4(intel_sub_group_block_read2((const __global uint*)(input + src)));
                acc[0] = fma(in4.s0, w, acc[0]);
                acc[1] = fma(in4.s1, w, acc[1]);
                acc[2] = fma(in4.s2, w, acc[2]);
                acc[3] = fma(in4.s3, w, acc[3]);
#elif defined(USE_BLOCK_READ_1)
                const half2 in2 = as_half2(intel_sub_group_block_read((const __global uint*)(input + src)));
                acc[0] = fma(in2.s0, w, acc[0]);
                acc[1] = fma(in2.s1, w, acc[1]);
#else
                // Scalar path: consecutive batch elements of one work item are LOCAL_WORK_GROUP_SIZE apart.
                for (uint s = 0; s < BATCHES_PER_WORK_ITEM; ++s)
                {
                    acc[s] = fma(input[src + s * LOCAL_WORK_GROUP_SIZE], w, acc[s]);
                }
#endif
                // Advance to the next input feature map.
                src += INPUT0_FEATURE_PITCH;
                flt += FILTER_IFM_PITCH;
            }
        }
    }

#if BIAS_TERM
    uint bias_pair = *(const __global uint*)(bias + pair_base);
    for (uint s = 0; s < BATCHES_PER_WORK_ITEM; ++s)
    {
        ADD_BIAS_16_FP16(acc[s], bias_pair);
    }
#endif
    for (uint s = 0; s < BATCHES_PER_WORK_ITEM; ++s)
    {
        acc[s] = ACTIVATION(acc[s], NL_M, NL_N);
    }

    // Store phase: the 16 components of every accumulator go to 16 consecutive features,
    // OUTPUT_FEATURE_PITCH elements apart.
#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1)
    #if BATCHES_PER_WORK_ITEM == 4
    // Two batch elements are packed into one uint per store.
    // NOTE(review): the fixed offset of 32 elements presumably equals
    // sub-group size (16) * 2 halfs per uint, mirroring the second uint of the block read - confirm.
    uint dst = OUTPUT_VIEW_OFFSET + out_base;
    for (uint f = 0; f < 16; ++f)
    {
        *(__global uint*)(output + dst) = as_uint((half2)(acc[0][f], acc[1][f]));
        *(__global uint*)(output + dst + 32) = as_uint((half2)(acc[2][f], acc[3][f]));
        dst += OUTPUT_FEATURE_PITCH;
    }
    #else
    for (uint s = 0; s < BATCHES_PER_WORK_ITEM / 2; ++s)
    {
        // Pair of accumulators whose matching components share one packed uint store.
        const half16 even = acc[pack_width * s];
        const half16 odd  = acc[pack_width * s + 1];
        uint dst = OUTPUT_VIEW_OFFSET + out_base + pack_width * s * LOCAL_WORK_GROUP_SIZE;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s0, odd.s0)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s1, odd.s1)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s2, odd.s2)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s3, odd.s3)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s4, odd.s4)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s5, odd.s5)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s6, odd.s6)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s7, odd.s7)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s8, odd.s8)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.s9, odd.s9)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.sa, odd.sa)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.sb, odd.sb)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.sc, odd.sc)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.sd, odd.sd)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.se, odd.se)); dst += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + dst) = as_uint((half2)(even.sf, odd.sf));
    }
    #endif
#else
    for (uint s = 0; s < BATCHES_PER_WORK_ITEM; ++s)
    {
        const half16 v = acc[s];
        uint dst = OUTPUT_VIEW_OFFSET + out_base + s * LOCAL_WORK_GROUP_SIZE;
        output[dst] = v.s0; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s1; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s2; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s3; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s4; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s5; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s6; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s7; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s8; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.s9; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.sa; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.sb; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.sc; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.sd; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.se; dst += OUTPUT_FEATURE_PITCH;
        output[dst] = v.sf;
    }
#endif
}