Publishing 2019 R3 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / binary_convolution_gpu_1x1.cl
1 // Copyright (c) 2019 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15
16 #include "include/include_all.cl"
17
18 #define OC_BLOCK_SIZE 32
19
20 #define GET_WEI(data, id) intel_sub_group_shuffle(data, id)
21 #define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
22 #define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
23 #define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
24
25 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
26 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
27 KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
28                                      __global OUTPUT_TYPE* output,
29                                const __global FILTER_TYPE* weights,
30 #if HAS_FUSED_OPS_DECLS
31                                FUSED_OPS_DECLS,
32 #endif
33                                uint split_idx)
34 {
35     const int xy = get_group_id(0);
36     const int f_block = get_global_id(1);
37     const int b = get_global_id(2);
38     const int lid = get_sub_group_local_id();
39 #if PADDED_INPUT
40     const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
41     const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
42     const uint input_offset = INPUT0_OFFSET
43                             + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH
44                             + y*INPUT0_Y_PITCH;
45 #else
46     const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
47     const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
48     const uint input_offset = INPUT0_OFFSET
49                             + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH
50                             + xy*XY_BLOCK_SIZE;
51 #endif
52     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, 2) wei_t;
53
54 #if BINARY_PACKED_OUTPUT
55     const uint dst_index = OUTPUT_OFFSET
56                          + b*OUTPUT_FEATURE_NUM_PACKED*OUTPUT_FEATURE_PITCH
57                          + f_block*OUTPUT_FEATURE_PITCH;
58 #else
59     const uint dst_index = OUTPUT_OFFSET
60                          + b*OUTPUT_BATCH_PITCH
61                          + f_block*OC_BLOCK_SIZE*OUTPUT_FEATURE_PITCH;
62 #endif
63     const uint filter_offset = f_block*OC_BLOCK_SIZE*INPUT0_FEATURE_NUM_PACKED;
64
65     int dst_buf[OC_BLOCK_SIZE] = { 0 }; // 32 OC
66
67     for (int k = 0; k < INPUT0_FEATURE_NUM_PACKED; ++k)
68     {
69         // Load 16 input elements from feature map by subgroup
70 #if PADDED_INPUT
71         INPUT0_TYPE src = input[input_offset + k*INPUT0_FEATURE_PITCH + x];
72 #else
73         INPUT0_TYPE src = ALIGNED_BLOCK_READ(input, input_offset + k*INPUT0_FEATURE_PITCH);
74 #endif
75
76         // Load 32 OC x 32 ICP. Each WI has lid-th and (lid+16)-th channels
77         wei_t wei = ALIGNED_BLOCK_READ2(weights, filter_offset + k * OC_BLOCK_SIZE);
78
79         // Shuffle 32 OC x 32 ICP of weights in each WI
80         const wei_t wei0  = GET_WEI(wei, 0);
81         const wei_t wei1  = GET_WEI(wei, 1);
82         const wei_t wei2  = GET_WEI(wei, 2);
83         const wei_t wei3  = GET_WEI(wei, 3);
84         const wei_t wei4  = GET_WEI(wei, 4);
85         const wei_t wei5  = GET_WEI(wei, 5);
86         const wei_t wei6  = GET_WEI(wei, 6);
87         const wei_t wei7  = GET_WEI(wei, 7);
88         const wei_t wei8  = GET_WEI(wei, 8);
89         const wei_t wei9  = GET_WEI(wei, 9);
90         const wei_t wei10 = GET_WEI(wei, 10);
91         const wei_t wei11 = GET_WEI(wei, 11);
92         const wei_t wei12 = GET_WEI(wei, 12);
93         const wei_t wei13 = GET_WEI(wei, 13);
94         const wei_t wei14 = GET_WEI(wei, 14);
95         const wei_t wei15 = GET_WEI(wei, 15);
96
97 #if LEFTOVERS_IC
98         if (k == INPUT0_FEATURE_NUM_PACKED - 1)
99         {
100             dst_buf[0]  += popcount((wei0.s0 ^ src) & FILTER_MASK);
101             dst_buf[1]  += popcount((wei1.s0 ^ src) & FILTER_MASK);
102             dst_buf[2]  += popcount((wei2.s0 ^ src) & FILTER_MASK);
103             dst_buf[3]  += popcount((wei3.s0 ^ src) & FILTER_MASK);
104             dst_buf[4]  += popcount((wei4.s0 ^ src) & FILTER_MASK);
105             dst_buf[5]  += popcount((wei5.s0 ^ src) & FILTER_MASK);
106             dst_buf[6]  += popcount((wei6.s0 ^ src) & FILTER_MASK);
107             dst_buf[7]  += popcount((wei7.s0 ^ src) & FILTER_MASK);
108             dst_buf[8]  += popcount((wei8.s0 ^ src) & FILTER_MASK);
109             dst_buf[9]  += popcount((wei9.s0 ^ src) & FILTER_MASK);
110             dst_buf[10] += popcount((wei10.s0 ^ src) & FILTER_MASK);
111             dst_buf[11] += popcount((wei11.s0 ^ src) & FILTER_MASK);
112             dst_buf[12] += popcount((wei12.s0 ^ src) & FILTER_MASK);
113             dst_buf[13] += popcount((wei13.s0 ^ src) & FILTER_MASK);
114             dst_buf[14] += popcount((wei14.s0 ^ src) & FILTER_MASK);
115             dst_buf[15] += popcount((wei15.s0 ^ src) & FILTER_MASK);
116
117 #if OUTPUT_FEATURE_NUM > 16
118             dst_buf[16] += popcount((wei0.s1 ^ src) & FILTER_MASK);
119             dst_buf[17] += popcount((wei1.s1 ^ src) & FILTER_MASK);
120             dst_buf[18] += popcount((wei2.s1 ^ src) & FILTER_MASK);
121             dst_buf[19] += popcount((wei3.s1 ^ src) & FILTER_MASK);
122             dst_buf[20] += popcount((wei4.s1 ^ src) & FILTER_MASK);
123             dst_buf[21] += popcount((wei5.s1 ^ src) & FILTER_MASK);
124             dst_buf[22] += popcount((wei6.s1 ^ src) & FILTER_MASK);
125             dst_buf[23] += popcount((wei7.s1 ^ src) & FILTER_MASK);
126             dst_buf[24] += popcount((wei8.s1 ^ src) & FILTER_MASK);
127             dst_buf[25] += popcount((wei9.s1 ^ src) & FILTER_MASK);
128             dst_buf[26] += popcount((wei10.s1 ^ src) & FILTER_MASK);
129             dst_buf[27] += popcount((wei11.s1 ^ src) & FILTER_MASK);
130             dst_buf[28] += popcount((wei12.s1 ^ src) & FILTER_MASK);
131             dst_buf[29] += popcount((wei13.s1 ^ src) & FILTER_MASK);
132             dst_buf[30] += popcount((wei14.s1 ^ src) & FILTER_MASK);
133             dst_buf[31] += popcount((wei15.s1 ^ src) & FILTER_MASK);
134 #endif
135             break;
136         }
137 #endif
138         dst_buf[0]  += popcount(wei0.s0 ^ src);
139         dst_buf[1]  += popcount(wei1.s0 ^ src);
140         dst_buf[2]  += popcount(wei2.s0 ^ src);
141         dst_buf[3]  += popcount(wei3.s0 ^ src);
142         dst_buf[4]  += popcount(wei4.s0 ^ src);
143         dst_buf[5]  += popcount(wei5.s0 ^ src);
144         dst_buf[6]  += popcount(wei6.s0 ^ src);
145         dst_buf[7]  += popcount(wei7.s0 ^ src);
146         dst_buf[8]  += popcount(wei8.s0 ^ src);
147         dst_buf[9]  += popcount(wei9.s0 ^ src);
148         dst_buf[10] += popcount(wei10.s0 ^ src);
149         dst_buf[11] += popcount(wei11.s0 ^ src);
150         dst_buf[12] += popcount(wei12.s0 ^ src);
151         dst_buf[13] += popcount(wei13.s0 ^ src);
152         dst_buf[14] += popcount(wei14.s0 ^ src);
153         dst_buf[15] += popcount(wei15.s0 ^ src);
154
155 #if OUTPUT_FEATURE_NUM > 16
156         dst_buf[16] += popcount(wei0.s1 ^ src);
157         dst_buf[17] += popcount(wei1.s1 ^ src);
158         dst_buf[18] += popcount(wei2.s1 ^ src);
159         dst_buf[19] += popcount(wei3.s1 ^ src);
160         dst_buf[20] += popcount(wei4.s1 ^ src);
161         dst_buf[21] += popcount(wei5.s1 ^ src);
162         dst_buf[22] += popcount(wei6.s1 ^ src);
163         dst_buf[23] += popcount(wei7.s1 ^ src);
164         dst_buf[24] += popcount(wei8.s1 ^ src);
165         dst_buf[25] += popcount(wei9.s1 ^ src);
166         dst_buf[26] += popcount(wei10.s1 ^ src);
167         dst_buf[27] += popcount(wei11.s1 ^ src);
168         dst_buf[28] += popcount(wei12.s1 ^ src);
169         dst_buf[29] += popcount(wei13.s1 ^ src);
170         dst_buf[30] += popcount(wei14.s1 ^ src);
171         dst_buf[31] += popcount(wei15.s1 ^ src);
172 #endif
173     }
174
175     // Load data for fused operations (scales, biases, quantization thresholds, etc)
176 #if CUSTOM_FUSED_OPS
177     FUSED_OPS_PREPARE_DATA;
178 #endif
179
180     UNIT_TYPE dst[OC_BLOCK_SIZE];
181     for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
182     {
183         CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM - 2*dst_buf[oc]);
184 #if CUSTOM_FUSED_OPS
185         DO_ELTWISE_FUSED_OPS;
186 // Don't save floating-point intermediate result, since packed one is already computed
187 #if !BINARY_PACKED_OUTPUT
188         dst[oc] = res;
189 #endif
190 #elif HAS_FUSED_OPS
191         FUSED_OPS;
192         dst[oc] = FINAL_NAME;
193 #endif
194
195     }
196
197     bool in_x = x < OUTPUT_SIZE_X;
198     bool in_y = y < OUTPUT_SIZE_Y;
199 #if BINARY_PACKED_OUTPUT
200
201 #if PADDED_OUTPUT
202     if (in_x && in_y)
203         output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);
204 #else
205     if (xy * XY_BLOCK_SIZE < OUTPUT_SIZE_X*OUTPUT_SIZE_Y)
206         ALIGNED_BLOCK_WRITE(output, dst_index + xy*XY_BLOCK_SIZE, TO_OUTPUT_TYPE(packed_res));
207     else if (in_x && in_y)
208         output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);
209
210 #endif
211
212 #else
213
214     for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
215     {
216         bool in_fm = f_block*OC_BLOCK_SIZE + oc < OUTPUT_FEATURE_NUM;
217         if (in_x && in_y && in_fm)
218             output[dst_index + oc*OUTPUT_FEATURE_PITCH + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(dst[oc]);
219     }
220
221 #endif
222
223 }