1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
17 #include "include/sub_group.cl"
19 // Block read - currently block is 4 bytes aligned.
20 #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
22 #define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \
24 const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
25 const half16 acol1 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s1 ); \
26 const half16 acol2 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s2 ); \
27 const half16 acol3 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s3 ); \
28 const half16 acol4 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s4 ); \
29 const half16 acol5 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s5 ); \
30 const half16 acol6 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s6 ); \
31 const half16 acol7 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s7 ); \
32 _result = fma( _blockB.s0, acol0, _result ); \
33 _result = fma( _blockB.s1, acol1, _result ); \
34 _result = fma( _blockB.s2, acol2, _result ); \
35 _result = fma( _blockB.s3, acol3, _result ); \
36 _result = fma( _blockB.s4, acol4, _result ); \
37 _result = fma( _blockB.s5, acol5, _result ); \
38 _result = fma( _blockB.s6, acol6, _result ); \
39 _result = fma( _blockB.s7, acol7, _result ); \
42 #define SUB_GROUP_SIZE 16
44 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
45 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
46 KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)(
47 const __global UNIT_TYPE* input,
48 __global UNIT_TYPE* output,
49 const __global UNIT_TYPE* weight
51 , __global UNIT_TYPE* bias)
56 const uint global_id = get_global_id(0);
57 const uint group_id = get_group_id(0);
58 const uint batch_group_id = get_global_id(1); // which part of batches we are computing, for example for batch 64 we compute batches 0..31 for batch_group_id == 0 and batches 32..65 for batch_group_id == 1
59 const uint id_in_sub_group = get_sub_group_local_id();
61 const uint out_id = (id_in_sub_group * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1)) / SUB_GROUP_SIZE + group_id * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1) + (BATCHES_PER_WORK_ITEM * batch_group_id) / SUB_GROUP_SIZE;
63 uint neuronIdx = id_in_sub_group + group_id * SUB_GROUP_SIZE;
65 MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
67 uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * INPUT0_ELEMENTS_COUNT;
69 uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
70 for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
72 // read input data in blocks ( 16 batch * 8 x )
73 MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
74 MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset);
76 MULTIPLY_BLOCKS_16x8(blockC00, blockA00, blockB00)
79 input_idx += 128; // 128 = 16x8 - because of input format which have blocks of 128 elements
83 blockC00 += bias[neuronIdx];
84 #endif // #if BIAS_TERM
86 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
88 vstore16(blockC00, out_id, output);
93 #undef ALIGNED_BLOCK_READ8
94 #undef MULTIPLY_BLOCKS_16x8