43789cedaaf99bbd1c97db8b71494a3b0e5b8794
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / fully_connected_gpu_MMAD.cl
1 // Copyright (c) 2016-2020 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15
16 #include "include/common.cl"
17
18 #include "include/data_types.cl"
19 #include "include/fetch.cl"
20 #include "include/mmad.cl"
21
22 #define INPUT_PACKED_TYPE_8  CAT(INPUT_PACKED_TYPE, 8)
23 #define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8)
24
25 #define AS_TYPE(type, val) CAT(as_, type)(val)
26
27 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
28 KERNEL(fully_connected_gpu_MMAD)(
29     const __global INPUT0_TYPE* input,
30     __global OUTPUT_TYPE* output,
31     const __global FILTER_TYPE* weights
32 #if BIAS_TERM
33     , const __global BIAS_TYPE* biases
34 #endif
35 #if HAS_FUSED_OPS_DECLS
36     , FUSED_OPS_DECLS
37 #endif
38     )
39 {
40 #if OUTPUT_BATCH_NUM == 1
41     const uint f = (uint)get_global_id(0);
42     const uint b = 0;
43 #else
44     const uint f = (uint)get_global_id(0);
45     const uint b = (uint)get_global_id(1);
46 #endif
47
48     int dotProd = 0;
49
50     const uint filter_offset = FILTER_GET_OFFSET(f);
51 #if INPUT0_DIMS == 5
52     const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0, 0);
53 #else
54     const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0);
55 #endif
56
57 #if SPATIAL_MAJOR
58     for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
59 #   if !SPLIT_SPATIAL
60         for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
61 #   else
62         for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
63         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
64         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
65             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
66 #endif
67 #else  // SPATIAL_MAJOR
68 #   if !SPLIT_SPATIAL
69     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
70 #   else
71     for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
72     for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
73     for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
74         const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
75 #   endif
76         for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
77 #endif
78 #if !SPLIT_SPATIAL
79             uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
80 #else
81             uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
82 #endif
83             uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
84
85             uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
86             INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
87
88             INPUT_PACKED_TYPE_8 activations;  //activations of all lanes
89             activations.s0 = sub_group_broadcast(input_data, 0);
90             activations.s1 = sub_group_broadcast(input_data, 1);
91             activations.s2 = sub_group_broadcast(input_data, 2);
92             activations.s3 = sub_group_broadcast(input_data, 3);
93             activations.s4 = sub_group_broadcast(input_data, 4);
94             activations.s5 = sub_group_broadcast(input_data, 5);
95             activations.s6 = sub_group_broadcast(input_data, 6);
96             activations.s7 = sub_group_broadcast(input_data, 7);
97
98             uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
99             FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
100
101             dotProd = MMAD_8(activations, weights_data, dotProd);
102         }
103     }
104
105 #if HAS_FEATURE_LEFTOVERS
106         const uint lid = get_sub_group_local_id();
107 #if SPATIAL_MAJOR
108 #if !SPLIT_SPATIAL
109         for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
110 #else
111         for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
112         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
113         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
114             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
115 #endif  // !SPLIT_SPATIAL
116
117 #else  // SPATIAL_MAJOR
118 #if !SPLIT_SPATIAL
119     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
120 #else  // !SPLIT_SPATIAL
121     for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
122         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
123         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
124             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
125 #endif  // !SPLIT_SPATIAL
126 #endif  // SPATIAL_MAJOR
127
128 #if !SPLIT_SPATIAL
129             uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
130 #else  // !SPLIT_SPATIAL
131             uint input_idx = input_offset + FEATURE_BLOCK_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
132 #endif  // !SPLIT_SPATIAL
133             uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
134
135             MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
136             for (uint i = 0; i < 4; i++) {
137                 if (FEATURE_BLOCKS_COUNT*32 + lid*4 + i < INPUT0_FEATURE_NUM) {
138                     input_data_u[i] = input[input_idx + (lid*4 + i)*INPUT0_FEATURE_PITCH];
139                 }
140             }
141             INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
142
143             INPUT_PACKED_TYPE_8 activations;  //activations of all lanes
144             activations.s0 = sub_group_broadcast(input_data, 0);
145             activations.s1 = sub_group_broadcast(input_data, 1);
146             activations.s2 = sub_group_broadcast(input_data, 2);
147             activations.s3 = sub_group_broadcast(input_data, 3);
148             activations.s4 = sub_group_broadcast(input_data, 4);
149             activations.s5 = sub_group_broadcast(input_data, 5);
150             activations.s6 = sub_group_broadcast(input_data, 6);
151             activations.s7 = sub_group_broadcast(input_data, 7);
152
153             uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
154             FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
155
156             dotProd = MMAD_8(activations, weights_data, dotProd);
157         }
158 #endif  // HAS_FEATURE_LEFTOVERS
159
160     if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && f >= OUTPUT_FEATURE_NUM)
161         return;
162
163 #if BIAS_TERM
164 #if   BIAS_PER_OUTPUT
165     const uint bias_index = GET_DATA_INDEX(BIAS, b, f, 0, 0);
166 #elif BIAS_PER_OFM
167     const uint bias_index = f;
168 #endif
169
170     float dequantized = (float)dotProd + biases[bias_index];
171 #else  // BIAS_TERM
172     float dequantized = (float)dotProd;
173 #endif
174
175     const uint out_idx = OUTPUT_GET_INDEX(b, f, 0, 0);
176
177 #if HAS_FUSED_OPS
178     FUSED_OPS;
179     OUTPUT_TYPE res = FUSED_OPS_RESULT;
180
181     output[out_idx] = res;
182 #else
183     output[out_idx] = TO_OUTPUT_TYPE(dequantized);
184 #endif
185 }
186
187 #undef INPUT_PACKED_TYPE_8
188 #undef FILTER_PACKED_TYPE_8
189 #undef AS_TYPE