1 // Copyright (c) 2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
// Number of output channels handled per work-group: one packed 32-bit word.
#define OC_BLOCK_SIZE 32

// Broadcast `data` from sub-group lane `id` to every lane in the sub-group.
#define GET_WEI(data, id) intel_sub_group_shuffle(data, id)

// Sub-group block read/write of one (or two) 32-bit word(s) per work-item.
// NOTE: the offset parameter is measured in uint ELEMENTS, not bytes — the
// pointer arithmetic below is on `uint*`, and every call site passes element
// pitches (e.g. k*INPUT0_FEATURE_PITCH). The former name `byte_offset` was
// misleading; renamed to `elem_offset` (macro names and arity unchanged).
#define ALIGNED_BLOCK_READ(ptr, elem_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (elem_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, elem_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (elem_offset), as_uint(val))
#define ALIGNED_BLOCK_READ2(ptr, elem_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (elem_offset)))
/*
 * binary_convolution_1x1: 1x1 convolution over bit-packed binary feature
 * maps, computed cooperatively by one sub-group of SUB_GROUP_SIZE work-items.
 *
 * Work decomposition (as visible from the indexing code):
 *   get_group_id(0)  -> xy      : block of XY_BLOCK_SIZE output pixels
 *   get_global_id(1) -> f_block : block of OC_BLOCK_SIZE (=32) output channels
 *   get_global_id(2) -> b       : batch index
 * Each work-item owns one (x, y) output position; the packed weights for all
 * 32 output channels are exchanged across lanes via intel_sub_group_shuffle.
 *
 * Binary dot products are computed as popcount(src ^ wei) (number of
 * differing bits between the packed inputs and weights) and mapped back to a
 * +/-1 dot product via INPUT0_FEATURE_NUM - 2*popcount (see `res` below).
 *
 * NOTE(review): this chunk appears to have lost several lines in extraction
 * (#else/#endif directives, fused-ops declarations, braces, and the
 * `packed_res` computation) — several declarations below are duplicated,
 * which only makes sense as alternative #if branches. Comments were added;
 * all code tokens are untouched.
 */
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
                               __global OUTPUT_TYPE* output,
                               const __global FILTER_TYPE* weights,
#if HAS_FUSED_OPS_DECLS
    // NOTE(review): the fused-ops parameter declarations, the matching
    // #endif, and the closing ')' '{' of the kernel signature are not
    // visible in this chunk.
    const int xy = get_group_id(0);        // pixel-block index
    const int f_block = get_global_id(1);  // 32-output-channel block index
    const int b = get_global_id(2);        // batch index
    const int lid = get_sub_group_local_id();

    // Output position handled by this lane (row-major linearization).
    const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
    const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
    const uint input_offset = INPUT0_OFFSET
        + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH
    // NOTE(review): the statement above is unterminated and the three lines
    // below duplicate it — presumably two #if/#else branches whose
    // directives were lost.
    const int x = (xy * XY_BLOCK_SIZE + lid) % OUTPUT_SIZE_X;
    const int y = (xy * XY_BLOCK_SIZE + lid) / OUTPUT_SIZE_X;
    const uint input_offset = INPUT0_OFFSET
        + b*INPUT0_FEATURE_NUM_PACKED*INPUT0_FEATURE_PITCH

    // Pair of packed 32-bit weight words: .s0 feeds output channels [0..15],
    // .s1 feeds [16..31] (see the dst_buf updates below).
    typedef MAKE_VECTOR_TYPE(FILTER_TYPE, 2) wei_t;

#if BINARY_PACKED_OUTPUT
    // Packed-output layout: f_block selects one packed feature word.
    const uint dst_index = OUTPUT_OFFSET
        + b*OUTPUT_FEATURE_NUM_PACKED*OUTPUT_FEATURE_PITCH
        + f_block*OUTPUT_FEATURE_PITCH;
    // NOTE(review): a #else appears to be missing here — the lines below are
    // the unpacked-output addressing.
    const uint dst_index = OUTPUT_OFFSET
        + b*OUTPUT_BATCH_PITCH
        + f_block*OC_BLOCK_SIZE*OUTPUT_FEATURE_PITCH;

    // Weight layout: OC_BLOCK_SIZE packed words per packed input-channel slice.
    const uint filter_offset = f_block*OC_BLOCK_SIZE*INPUT0_FEATURE_NUM_PACKED;

    int dst_buf[OC_BLOCK_SIZE] = { 0 }; // 32 OC

    // Accumulate XOR-popcounts over every packed input-channel word.
    for (int k = 0; k < INPUT0_FEATURE_NUM_PACKED; ++k)
        // Load 16 input elements from feature map by subgroup
        INPUT0_TYPE src = input[input_offset + k*INPUT0_FEATURE_PITCH + x];
        // NOTE(review): duplicate `src` load below — likely a lost #if/#else
        // choosing scalar vs. sub-group block read.
        INPUT0_TYPE src = ALIGNED_BLOCK_READ(input, input_offset + k*INPUT0_FEATURE_PITCH);

        // Load 32 OC x 32 ICP. Each WI has lid-th and (lid+16)-th channels
        wei_t wei = ALIGNED_BLOCK_READ2(weights, filter_offset + k * OC_BLOCK_SIZE);

        // Shuffle 32 OC x 32 ICP of weights in each WI
        const wei_t wei0 = GET_WEI(wei, 0);
        const wei_t wei1 = GET_WEI(wei, 1);
        const wei_t wei2 = GET_WEI(wei, 2);
        const wei_t wei3 = GET_WEI(wei, 3);
        const wei_t wei4 = GET_WEI(wei, 4);
        const wei_t wei5 = GET_WEI(wei, 5);
        const wei_t wei6 = GET_WEI(wei, 6);
        const wei_t wei7 = GET_WEI(wei, 7);
        const wei_t wei8 = GET_WEI(wei, 8);
        const wei_t wei9 = GET_WEI(wei, 9);
        const wei_t wei10 = GET_WEI(wei, 10);
        const wei_t wei11 = GET_WEI(wei, 11);
        const wei_t wei12 = GET_WEI(wei, 12);
        const wei_t wei13 = GET_WEI(wei, 13);
        const wei_t wei14 = GET_WEI(wei, 14);
        const wei_t wei15 = GET_WEI(wei, 15);

        // Last packed word: FILTER_MASK zeroes padding bits beyond the real
        // channel count so they don't contribute to the popcount.
        if (k == INPUT0_FEATURE_NUM_PACKED - 1)
            dst_buf[0] += popcount((wei0.s0 ^ src) & FILTER_MASK);
            dst_buf[1] += popcount((wei1.s0 ^ src) & FILTER_MASK);
            dst_buf[2] += popcount((wei2.s0 ^ src) & FILTER_MASK);
            dst_buf[3] += popcount((wei3.s0 ^ src) & FILTER_MASK);
            dst_buf[4] += popcount((wei4.s0 ^ src) & FILTER_MASK);
            dst_buf[5] += popcount((wei5.s0 ^ src) & FILTER_MASK);
            dst_buf[6] += popcount((wei6.s0 ^ src) & FILTER_MASK);
            dst_buf[7] += popcount((wei7.s0 ^ src) & FILTER_MASK);
            dst_buf[8] += popcount((wei8.s0 ^ src) & FILTER_MASK);
            dst_buf[9] += popcount((wei9.s0 ^ src) & FILTER_MASK);
            dst_buf[10] += popcount((wei10.s0 ^ src) & FILTER_MASK);
            dst_buf[11] += popcount((wei11.s0 ^ src) & FILTER_MASK);
            dst_buf[12] += popcount((wei12.s0 ^ src) & FILTER_MASK);
            dst_buf[13] += popcount((wei13.s0 ^ src) & FILTER_MASK);
            dst_buf[14] += popcount((wei14.s0 ^ src) & FILTER_MASK);
            dst_buf[15] += popcount((wei15.s0 ^ src) & FILTER_MASK);

#if OUTPUT_FEATURE_NUM > 16
            // Second half of the channel block lives in .s1.
            dst_buf[16] += popcount((wei0.s1 ^ src) & FILTER_MASK);
            dst_buf[17] += popcount((wei1.s1 ^ src) & FILTER_MASK);
            dst_buf[18] += popcount((wei2.s1 ^ src) & FILTER_MASK);
            dst_buf[19] += popcount((wei3.s1 ^ src) & FILTER_MASK);
            dst_buf[20] += popcount((wei4.s1 ^ src) & FILTER_MASK);
            dst_buf[21] += popcount((wei5.s1 ^ src) & FILTER_MASK);
            dst_buf[22] += popcount((wei6.s1 ^ src) & FILTER_MASK);
            dst_buf[23] += popcount((wei7.s1 ^ src) & FILTER_MASK);
            dst_buf[24] += popcount((wei8.s1 ^ src) & FILTER_MASK);
            dst_buf[25] += popcount((wei9.s1 ^ src) & FILTER_MASK);
            dst_buf[26] += popcount((wei10.s1 ^ src) & FILTER_MASK);
            dst_buf[27] += popcount((wei11.s1 ^ src) & FILTER_MASK);
            dst_buf[28] += popcount((wei12.s1 ^ src) & FILTER_MASK);
            dst_buf[29] += popcount((wei13.s1 ^ src) & FILTER_MASK);
            dst_buf[30] += popcount((wei14.s1 ^ src) & FILTER_MASK);
            dst_buf[31] += popcount((wei15.s1 ^ src) & FILTER_MASK);

            // NOTE(review): #endif and the `else` header appear to be missing
            // here; the unmasked accumulation below is the path for all
            // non-final iterations (every bit of the word is valid).
            dst_buf[0] += popcount(wei0.s0 ^ src);
            dst_buf[1] += popcount(wei1.s0 ^ src);
            dst_buf[2] += popcount(wei2.s0 ^ src);
            dst_buf[3] += popcount(wei3.s0 ^ src);
            dst_buf[4] += popcount(wei4.s0 ^ src);
            dst_buf[5] += popcount(wei5.s0 ^ src);
            dst_buf[6] += popcount(wei6.s0 ^ src);
            dst_buf[7] += popcount(wei7.s0 ^ src);
            dst_buf[8] += popcount(wei8.s0 ^ src);
            dst_buf[9] += popcount(wei9.s0 ^ src);
            dst_buf[10] += popcount(wei10.s0 ^ src);
            dst_buf[11] += popcount(wei11.s0 ^ src);
            dst_buf[12] += popcount(wei12.s0 ^ src);
            dst_buf[13] += popcount(wei13.s0 ^ src);
            dst_buf[14] += popcount(wei14.s0 ^ src);
            dst_buf[15] += popcount(wei15.s0 ^ src);

#if OUTPUT_FEATURE_NUM > 16
            dst_buf[16] += popcount(wei0.s1 ^ src);
            dst_buf[17] += popcount(wei1.s1 ^ src);
            dst_buf[18] += popcount(wei2.s1 ^ src);
            dst_buf[19] += popcount(wei3.s1 ^ src);
            dst_buf[20] += popcount(wei4.s1 ^ src);
            dst_buf[21] += popcount(wei5.s1 ^ src);
            dst_buf[22] += popcount(wei6.s1 ^ src);
            dst_buf[23] += popcount(wei7.s1 ^ src);
            dst_buf[24] += popcount(wei8.s1 ^ src);
            dst_buf[25] += popcount(wei9.s1 ^ src);
            dst_buf[26] += popcount(wei10.s1 ^ src);
            dst_buf[27] += popcount(wei11.s1 ^ src);
            dst_buf[28] += popcount(wei12.s1 ^ src);
            dst_buf[29] += popcount(wei13.s1 ^ src);
            dst_buf[30] += popcount(wei14.s1 ^ src);
            dst_buf[31] += popcount(wei15.s1 ^ src);

    // Load data for fused operations (scales, biases, quantization thresholds, etc)
    FUSED_OPS_PREPARE_DATA;

    UNIT_TYPE dst[OC_BLOCK_SIZE];
    for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
        // Map Hamming distance to a +/-1 dot product:
        // matches - mismatches = total - 2 * mismatches.
        CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM - 2*dst_buf[oc]);

        DO_ELTWISE_FUSED_OPS;
        // Don't save floating-point intermediate result, since packed one is already computed
#if !BINARY_PACKED_OUTPUT
        dst[oc] = FINAL_NAME;

    // Tail guards: the last pixel block may extend past the output plane.
    bool in_x = x < OUTPUT_SIZE_X;
    bool in_y = y < OUTPUT_SIZE_Y;
#if BINARY_PACKED_OUTPUT
    // NOTE(review): `packed_res` is produced by lines missing from this
    // chunk (presumably a sub-group bit-pack of the per-lane results).
    output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);
    // Fast path: sub-group block write; fallback: guarded scalar store.
    // NOTE(review): the condition only checks the block START against the
    // plane size — verify tail handling against the full original file.
    if (xy * XY_BLOCK_SIZE < OUTPUT_SIZE_X*OUTPUT_SIZE_Y)
        ALIGNED_BLOCK_WRITE(output, dst_index + xy*XY_BLOCK_SIZE, TO_OUTPUT_TYPE(packed_res));
    else if (in_x && in_y)
        output[dst_index + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(packed_res);

    // Unpacked path: one scalar store per in-range output channel.
    for (int oc = 0; oc < OC_BLOCK_SIZE; oc++)
        bool in_fm = f_block*OC_BLOCK_SIZE + oc < OUTPUT_FEATURE_NUM;
        if (in_x && in_y && in_fm)
            output[dst_index + oc*OUTPUT_FEATURE_PITCH + y*OUTPUT_Y_PITCH + x] = TO_OUTPUT_TYPE(dst[oc]);