// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/sub_group.cl"

// MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB):
// Accumulates an 8x8 block product into the 8-wide accumulator _result.
// The TRANSPOSE_BLOCK_8_COL* helpers (from sub_group.cl) gather the eight
// column vectors of the tile held in _blockA across the sub-group; each
// column is then scaled by the matching scalar lane of _blockB and added
// into _result. Exactly one variant is selected at compile time:
// half8/fma when FP16_UNIT_USED is set, float8/mad otherwise.
#if FP16_UNIT_USED

#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
    const half8 acol0 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 0 ); \
    const half8 acol1 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 1 ); \
    const half8 acol2 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 2 ); \
    const half8 acol3 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 3 ); \
    const half8 acol4 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 4 ); \
    const half8 acol5 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 5 ); \
    const half8 acol6 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 6 ); \
    const half8 acol7 = TRANSPOSE_BLOCK_8_COL_FP16( _blockA, 7 ); \
    _result = fma( _blockB.s0, acol0, _result ); \
    _result = fma( _blockB.s1, acol1, _result ); \
    _result = fma( _blockB.s2, acol2, _result ); \
    _result = fma( _blockB.s3, acol3, _result ); \
    _result = fma( _blockB.s4, acol4, _result ); \
    _result = fma( _blockB.s5, acol5, _result ); \
    _result = fma( _blockB.s6, acol6, _result ); \
    _result = fma( _blockB.s7, acol7, _result ); \
}

#else

#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
    const float8 acol0 = TRANSPOSE_BLOCK_8_COL( _blockA, 0 ); \
    const float8 acol1 = TRANSPOSE_BLOCK_8_COL( _blockA, 1 ); \
    const float8 acol2 = TRANSPOSE_BLOCK_8_COL( _blockA, 2 ); \
    const float8 acol3 = TRANSPOSE_BLOCK_8_COL( _blockA, 3 ); \
    const float8 acol4 = TRANSPOSE_BLOCK_8_COL( _blockA, 4 ); \
    const float8 acol5 = TRANSPOSE_BLOCK_8_COL( _blockA, 5 ); \
    const float8 acol6 = TRANSPOSE_BLOCK_8_COL( _blockA, 6 ); \
    const float8 acol7 = TRANSPOSE_BLOCK_8_COL( _blockA, 7 ); \
    _result = mad( _blockB.s0, acol0, _result ); \
    _result = mad( _blockB.s1, acol1, _result ); \
    _result = mad( _blockB.s2, acol2, _result ); \
    _result = mad( _blockB.s3, acol3, _result ); \
    _result = mad( _blockB.s4, acol4, _result ); \
    _result = mad( _blockB.s5, acol5, _result ); \
    _result = mad( _blockB.s6, acol6, _result ); \
    _result = mad( _blockB.s7, acol7, _result ); \
}

#endif // FP16_UNIT_USED
#define SUB_GROUP_SIZE 8

// Fully connected layer, xb-layout input -> xb-layout output, batched,
// using vload8/vstore8 for batch-vectorized memory access.
//
// Work decomposition (as established by the index math below):
//  - get_global_id(0): selects the neuron block; every 8 consecutive
//    work-items form one sub-group handling 8 * NEURONS_PER_WORK_ITEM
//    output neurons.
//  - get_global_id(1): selects which slice of the batch this work-item
//    computes (BATCHES_PER_WORK_ITEM batches per work-item).
//
// Weight layout assumption: consecutive input features for one neuron are
// FILTER_OFM_NUM elements apart (see the weight_offset strides) — i.e.
// weights are stored output-feature-major. TODO(review): confirm against
// the host-side weight reorder.
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL (fully_connected_gpu_xb_xb_b8_x8_vload)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* weight
#if BIAS_TERM
    , __global UNIT_TYPE* bias)
#else
    )
#endif
{
    const uint global_id = get_global_id(0);
    const uint group_id = get_global_id(1); // which part of batches we are computing, for example for batch 64 we compute batches 0..31 for group_id == 0 and batches 32..65 for group_id == 1
    uint sub_group_idx = (uint)get_local_id(0) % 8;

    // vstore8 index (in 8-element units) of this work-item's first output block.
    const uint out_id = (sub_group_idx * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1)) / 8 + (global_id / 8) * BATCHES_PER_WORK_ITEM * NEURONS_PER_WORK_ITEM * (uint)get_global_size(1) + (BATCHES_PER_WORK_ITEM * group_id) / 8;

    // First output neuron handled by this work-item; the second (optional)
    // neuron block is neuronIdx + 8.
    uint neuronIdx = sub_group_idx + (global_id / 8) * 8 * NEURONS_PER_WORK_ITEM;

    // Accumulators: blockC<neuron-block><batch-block>, 8 batches per vector.
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC00 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC10 = UNIT_VAL_ZERO;

#if BATCHES_PER_WORK_ITEM >= 16
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC01 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC11 = UNIT_VAL_ZERO;
#endif

#if BATCHES_PER_WORK_ITEM >= 32
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC02 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC12 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC03 = UNIT_VAL_ZERO;
    MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC13 = UNIT_VAL_ZERO;
#endif

    uint weight_offset = neuronIdx;
#if NEURONS_PER_WORK_ITEM > 1
    uint weight_offset2 = neuronIdx + 8;
#endif // #if NEURONS_PER_WORK_ITEM > 1

    // vload8 index of this work-item's batch slice within the current input feature.
    uint input_idx = sub_group_idx * (BATCHES_PER_WORK_ITEM / 8) * (uint)get_global_size(1) + (group_id * BATCHES_PER_WORK_ITEM) / 8;
    for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
    {
        // Load up to 4 x 8-batch input blocks for this feature chunk.
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = vload8(input_idx, input);
#if BATCHES_PER_WORK_ITEM >= 16
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA01 = vload8(input_idx + 1, input);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA02 = vload8(input_idx + 2, input);
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA03 = vload8(input_idx + 3, input);
#endif
        // Gather 8 weights (one per input feature of this chunk) for neuron block 0.
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
        blockB00.s0 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s1 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s2 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s3 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s4 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s5 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s6 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        blockB00.s7 = weight[weight_offset]; weight_offset += FILTER_OFM_NUM;
        MULTIPLY_BLOCKS_8x8(blockC00, blockA00, blockB00)

#if BATCHES_PER_WORK_ITEM >= 16
        MULTIPLY_BLOCKS_8x8(blockC01, blockA01, blockB00)
#endif
#if BATCHES_PER_WORK_ITEM >= 32
        MULTIPLY_BLOCKS_8x8(blockC02, blockA02, blockB00)
        MULTIPLY_BLOCKS_8x8(blockC03, blockA03, blockB00)
#endif

#if NEURONS_PER_WORK_ITEM > 1
        // Same feature chunk, second neuron block (neuronIdx + 8).
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB10;
        blockB10.s0 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s1 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s2 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s3 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s4 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s5 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s6 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        blockB10.s7 = weight[weight_offset2]; weight_offset2 += FILTER_OFM_NUM;
        MULTIPLY_BLOCKS_8x8(blockC10, blockA00, blockB10)

#if BATCHES_PER_WORK_ITEM >= 16
        MULTIPLY_BLOCKS_8x8(blockC11, blockA01, blockB10)
#endif
#if BATCHES_PER_WORK_ITEM >= 32
        MULTIPLY_BLOCKS_8x8(blockC12, blockA02, blockB10)
        MULTIPLY_BLOCKS_8x8(blockC13, blockA03, blockB10)
#endif
#endif // #if NEURONS_PER_WORK_ITEM > 1
        input_idx += INPUT0_BATCH_NUM; // we don't need to multiply by 8 because of vload8
    }

#if BIAS_TERM
    // One bias value per neuron, broadcast across all 8 batch lanes.
    blockC00 += bias[neuronIdx];
#if BATCHES_PER_WORK_ITEM >= 16
    blockC01 += bias[neuronIdx];
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC02 += bias[neuronIdx];
    blockC03 += bias[neuronIdx];
#endif

#if NEURONS_PER_WORK_ITEM > 1
    blockC10 += bias[neuronIdx+8];
#if BATCHES_PER_WORK_ITEM >= 16
    blockC11 += bias[neuronIdx+8];
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC12 += bias[neuronIdx+8];
    blockC13 += bias[neuronIdx+8];
#endif
#endif // #if NEURONS_PER_WORK_ITEM > 1
#endif // #if BIAS_TERM

    blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
#if BATCHES_PER_WORK_ITEM >= 16
    blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC02 = ACTIVATION(blockC02, NL_M, NL_N);
    blockC03 = ACTIVATION(blockC03, NL_M, NL_N);
#endif

#if NEURONS_PER_WORK_ITEM > 1
    blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
#if BATCHES_PER_WORK_ITEM >= 16
    blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    blockC12 = ACTIVATION(blockC12, NL_M, NL_N);
    blockC13 = ACTIVATION(blockC13, NL_M, NL_N);
#endif
#endif // #if NEURONS_PER_WORK_ITEM > 1

    vstore8(blockC00, out_id, output);
#if BATCHES_PER_WORK_ITEM >= 16
    vstore8(blockC01, out_id + 1, output);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    vstore8(blockC02, out_id + 2, output);
    vstore8(blockC03, out_id + 3, output);
#endif // #if BATCHES_PER_WORK_ITEM >= 32 (was mislabeled "#if BIAS_TERM")

#if NEURONS_PER_WORK_ITEM > 1
    vstore8(blockC10, out_id+INPUT0_BATCH_NUM, output);
#if BATCHES_PER_WORK_ITEM >= 16
    vstore8(blockC11, out_id+INPUT0_BATCH_NUM+1, output);
#endif
#if BATCHES_PER_WORK_ITEM >= 32
    vstore8(blockC12, out_id+INPUT0_BATCH_NUM+2, output);
    vstore8(blockC13, out_id+INPUT0_BATCH_NUM+3, output);
#endif
#endif // #if NEURONS_PER_WORK_ITEM > 1
}
// Clean up file-local macros so later kernels compiled into the same
// program source do not see them. Undef'ing a name that is not defined
// is harmless, so this list may be a superset of what this file defines.
#undef SUB_GROUP_SIZE
#undef ALIGNED_BLOCK_READ8
#undef MAKE_VECTOR_TYPE
#undef CONCAT_TOKEN
#undef CONCAT_TOKEN_HANDLER1
#undef MULTIPLY_BLOCKS_8x8