inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_b8_f8_vload.cl
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/include_all.cl"
#include "include/sub_group.cl"

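// MULTIPLY_BLOCKS_8x8 accumulates an 8x8 tile that is distributed across the
// 8 lanes of the sub-group: TRANSPOSE_BLOCK_8_COL(_blockA, k) (provided by
// include/sub_group.cl, built on sub-group shuffles) fetches lane k's copy of
// _blockA, and each such column is multiplied by component k of _blockB and
// added into _result. Per lane this amounts to
//     _result += sum_k( _blockB.sk * blockA_held_by_lane_k )
// The FP16 variant uses fma(), the FP32 variant uses mad().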
#if FP16_UNIT_USED
    #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB)  \
    {   \
        const half8 acol0 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 0 ); \
        const half8 acol1 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 1 ); \
        const half8 acol2 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 2 ); \
        const half8 acol3 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 3 ); \
        const half8 acol4 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 4 ); \
        const half8 acol5 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 5 ); \
        const half8 acol6 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 6 ); \
        const half8 acol7 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 7 ); \
        _result = fma( _blockB.s0, acol0, _result ); \
        _result = fma( _blockB.s1, acol1, _result ); \
        _result = fma( _blockB.s2, acol2, _result ); \
        _result = fma( _blockB.s3, acol3, _result ); \
        _result = fma( _blockB.s4, acol4, _result ); \
        _result = fma( _blockB.s5, acol5, _result ); \
        _result = fma( _blockB.s6, acol6, _result ); \
        _result = fma( _blockB.s7, acol7, _result ); \
    }
#else
    #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB)  \
    {   \
        const float8 acol0 = TRANSPOSE_BLOCK_8_COL( _blockA, 0 ); \
        const float8 acol1 = TRANSPOSE_BLOCK_8_COL( _blockA, 1 ); \
        const float8 acol2 = TRANSPOSE_BLOCK_8_COL( _blockA, 2 ); \
        const float8 acol3 = TRANSPOSE_BLOCK_8_COL( _blockA, 3 ); \
        const float8 acol4 = TRANSPOSE_BLOCK_8_COL( _blockA, 4 ); \
        const float8 acol5 = TRANSPOSE_BLOCK_8_COL( _blockA, 5 ); \
        const float8 acol6 = TRANSPOSE_BLOCK_8_COL( _blockA, 6 ); \
        const float8 acol7 = TRANSPOSE_BLOCK_8_COL( _blockA, 7 ); \
        _result = mad( _blockB.s0, acol0, _result ); \
        _result = mad( _blockB.s1, acol1, _result ); \
        _result = mad( _blockB.s2, acol2, _result ); \
        _result = mad( _blockB.s3, acol3, _result ); \
        _result = mad( _blockB.s4, acol4, _result ); \
        _result = mad( _blockB.s5, acol5, _result ); \
        _result = mad( _blockB.s6, acol6, _result ); \
        _result = mad( _blockB.s7, acol7, _result ); \
    }
#endif

#define SUB_GROUP_SIZE 8

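// Fully connected kernel for fb/io ("xb") layouts: input and output keep the
// batch as the innermost (contiguous) dimension, and weights keep the output
// feature innermost (consecutive input features are FILTER_OFM_NUM apart).
// The constants below are expected to come in as JIT defines from the kernel
// selector:
//   UNIT_TYPE, FP16_UNIT_USED     - element type (float or half)
//   BATCHES_PER_WORK_ITEM         - batches per work item (8, 16 or 32 as written)
//   NEURONS_PER_WORK_ITEM         - output neurons per work item (1 or 2 as written)
//   FILTER_OFM_NUM                - number of output features
//   INPUT0_BATCH_NUM, INPUT0_ELEMENTS_COUNT, BIAS_TERM, NL_M, NL_N
// The index math below implies a dispatch of roughly
//   global = (FILTER_OFM_NUM / NEURONS_PER_WORK_ITEM, batch / BATCHES_PER_WORK_ITEM, 1)
// with a sub-group / work-group size of 8 in dimension 0; this is a reading of
// the kernel itself, not a statement taken from the host code.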
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL (fully_connected_gpu_xb_xb_b8_x8_vload)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* weight
#if BIAS_TERM
    , __global UNIT_TYPE* bias)
#else
    )
#endif
{
    const uint global_id = get_global_id(0);
    const uint group_id = get_global_id(1); // which part of the batch this work item computes; e.g. with batch 64 and BATCHES_PER_WORK_ITEM == 32, group_id == 0 covers batches 0..31 and group_id == 1 covers batches 32..63
    uint sub_group_idx = (uint)get_local_id(0) % 8;

    const uint out_id = (sub_group_idx * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1)) / 8 + (global_id / 8) * BATCHES_PER_WORK_ITEM * NEURONS_PER_WORK_ITEM * (uint)get_global_size(1) + (BATCHES_PER_WORK_ITEM * group_id) / 8;
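    // out_id is an index in vstore8 units into the batch-innermost output, so
    // the element offset it addresses works out to
    //   out_id * 8 == neuronIdx * BATCH_NUM + group_id * BATCHES_PER_WORK_ITEM,
    // where BATCH_NUM == BATCHES_PER_WORK_ITEM * get_global_size(1) (the total
    // batch implied by the dispatch) and neuronIdx is computed just below.
    // Illustrative check with hypothetical values BATCHES_PER_WORK_ITEM == 8,
    // NEURONS_PER_WORK_ITEM == 2, get_global_size(1) == 4, global_id == 10
    // (sub_group_idx == 2), group_id == 3:
    //   out_id = 8 + 64 + 3 = 75  ->  element 600 == 18 * 32 + 24,
    // i.e. neuron 18, first batch 24.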

    uint neuronIdx = sub_group_idx + (global_id / 8) * 8 * NEURONS_PER_WORK_ITEM;

    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC00 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC10 = UNIT_VAL_ZERO;

#if BATCHES_PER_WORK_ITEM >= 16
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC01 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC11 = UNIT_VAL_ZERO;
#endif

#if BATCHES_PER_WORK_ITEM >= 32
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC02 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC12 = UNIT_VAL_ZERO;

    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC03 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC13 = UNIT_VAL_ZERO;
#endif

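    // Weights are laid out "io": all FILTER_OFM_NUM output features for one
    // input feature are contiguous, so stepping by FILTER_OFM_NUM moves to the
    // next input feature of the same output neuron. Each lane tracks its own
    // neuron (neuronIdx) and, when NEURONS_PER_WORK_ITEM > 1, a second neuron
    // at neuronIdx + 8.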
    uint weight_offset = neuronIdx;
#if NEURONS_PER_WORK_ITEM > 1

    uint weight_offset2 = neuronIdx + 8;

#endif // #if NEURONS_PER_WORK_ITEM > 1

    uint input_idx = sub_group_idx * (BATCHES_PER_WORK_ITEM / 8) * (uint)get_global_size(1) + (group_id * BATCHES_PER_WORK_ITEM) / 8;
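    // input_idx is a vload8 index into the batch-innermost input: lane i of the
    // sub-group starts at input feature i and this group's first batch, and each
    // loop iteration below advances by INPUT0_BATCH_NUM vload8 units, i.e. by
    // 8 input features.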
    for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
    {
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = vload8(input_idx, input);

#if BATCHES_PER_WORK_ITEM >= 16
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA01 = vload8(input_idx + 1, input);
#endif

#if BATCHES_PER_WORK_ITEM >= 32
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA02 = vload8(input_idx + 2, input);
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA03 = vload8(input_idx + 3, input);
#endif
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
        blockB00.s0 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s1 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s2 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s3 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s4 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s5 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s6 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s7 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        MULTIPLY_BLOCKS_8x8(blockC00, blockA00, blockB00)

#if BATCHES_PER_WORK_ITEM >= 16
        MULTIPLY_BLOCKS_8x8(blockC01, blockA01, blockB00)
#endif

#if BATCHES_PER_WORK_ITEM >= 32
        MULTIPLY_BLOCKS_8x8(blockC02, blockA02, blockB00)
        MULTIPLY_BLOCKS_8x8(blockC03, blockA03, blockB00)
#endif

#if NEURONS_PER_WORK_ITEM > 1

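        // Second neuron handled by this lane (neuronIdx + 8): gather its weight
        // column for the same 8 input features and accumulate into blockC1x.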
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB10;
        blockB10.s0 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s1 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s2 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s3 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s4 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s5 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s6 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s7 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        MULTIPLY_BLOCKS_8x8(blockC10, blockA00, blockB10)

#if BATCHES_PER_WORK_ITEM >= 16
        MULTIPLY_BLOCKS_8x8(blockC11, blockA01, blockB10)
#endif
#if BATCHES_PER_WORK_ITEM >= 32
        MULTIPLY_BLOCKS_8x8(blockC12, blockA02, blockB10)
        MULTIPLY_BLOCKS_8x8(blockC13, blockA03, blockB10)
#endif

#endif // #if NEURONS_PER_WORK_ITEM > 1
        input_idx += INPUT0_BATCH_NUM; // advance by 8 input features; input_idx counts vload8 blocks, so no extra *8 is needed
    }

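    // Add the per-neuron bias, broadcast across the 8 batches held in each
    // accumulator.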
#if BIAS_TERM
    blockC00 += bias[neuronIdx];
#if BATCHES_PER_WORK_ITEM >= 16
    blockC01 += bias[neuronIdx];
#endif

#if BATCHES_PER_WORK_ITEM >= 32
    blockC02 += bias[neuronIdx];
    blockC03 += bias[neuronIdx];
#endif

#if NEURONS_PER_WORK_ITEM > 1

    blockC10 += bias[neuronIdx+8];
#if BATCHES_PER_WORK_ITEM >= 16
    blockC11 += bias[neuronIdx+8];
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC12 += bias[neuronIdx+8];
    blockC13 += bias[neuronIdx+8];
#endif

#endif // #if NEURONS_PER_WORK_ITEM > 1
#endif // #if BIAS_TERM (the activation and stores below run regardless of bias)

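    // ACTIVATION applies the activation function chosen at JIT time; NL_M and
    // NL_N are its JIT-provided parameters (e.g. a negative slope for ReLU-like
    // activations).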
    blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
#if BATCHES_PER_WORK_ITEM >= 16
    blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC02 = ACTIVATION(blockC02, NL_M, NL_N);
    blockC03 = ACTIVATION(blockC03, NL_M, NL_N);
#endif

#if NEURONS_PER_WORK_ITEM > 1

    blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
#if BATCHES_PER_WORK_ITEM >= 16
    blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC12 = ACTIVATION(blockC12, NL_M, NL_N);
    blockC13 = ACTIVATION(blockC13, NL_M, NL_N);
#endif

#endif // #if NEURONS_PER_WORK_ITEM > 1

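    // Store results to the batch-innermost output. Each vstore8 writes 8
    // consecutive batches of one neuron; out_id + 1 is the next 8 batches of
    // the same neuron, while out_id + INPUT0_BATCH_NUM (in vstore8 units, i.e.
    // 8 full output rows) lands on neuron neuronIdx + 8.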
    vstore8(blockC00, out_id, output);
#if BATCHES_PER_WORK_ITEM >= 16
    vstore8(blockC01, out_id + 1, output);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    vstore8(blockC02, out_id + 2, output);
    vstore8(blockC03, out_id + 3, output);
#endif
#if NEURONS_PER_WORK_ITEM > 1

    vstore8(blockC10, out_id+INPUT0_BATCH_NUM, output);

#if BATCHES_PER_WORK_ITEM >= 16
    vstore8(blockC11, out_id+INPUT0_BATCH_NUM+1, output);
#endif

#if BATCHES_PER_WORK_ITEM >= 32
    vstore8(blockC12, out_id+INPUT0_BATCH_NUM+2, output);
    vstore8(blockC13, out_id+INPUT0_BATCH_NUM+3, output);
#endif

#endif // #if NEURONS_PER_WORK_ITEM > 1
}

#undef SUB_GROUP_SIZE
#undef MULTIPLY_BLOCKS_8x8