inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv16_af8_vload.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17 #include "include/sub_group.cl"
  18
  19 // Block read - currently block is 4 bytes aligned.
  20 #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
  21
  22 #define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB)  \
  23 {   \
  24     const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
  25     const half16 acol1 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s1 ); \
  26     const half16 acol2 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s2 ); \
  27     const half16 acol3 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s3 ); \
  28     const half16 acol4 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s4 ); \
  29     const half16 acol5 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s5 ); \
  30     const half16 acol6 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s6 ); \
  31     const half16 acol7 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s7 ); \
  32     _result = fma( _blockB.s0, acol0, _result ); \
  33     _result = fma( _blockB.s1, acol1, _result ); \
  34     _result = fma( _blockB.s2, acol2, _result ); \
  35     _result = fma( _blockB.s3, acol3, _result ); \
  36     _result = fma( _blockB.s4, acol4, _result ); \
  37     _result = fma( _blockB.s5, acol5, _result ); \
  38     _result = fma( _blockB.s6, acol6, _result ); \
  39     _result = fma( _blockB.s7, acol7, _result ); \
  40 }
  41
  42 #define SUB_GROUP_SIZE 16
  43
  44 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
  45 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  46 KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)(
  47     const __global UNIT_TYPE* input,
  48     __global UNIT_TYPE* output,
  49     const __global UNIT_TYPE* weight
  50 #if BIAS_TERM
  51     , __global UNIT_TYPE* bias)
  52 #else
  53     )
  54 #endif
  55 {
  56     const uint global_id = get_global_id(0);
  57     const uint group_id = get_group_id(0);
  58     const uint batch_group_id = get_global_id(1); // which part of batches we are computing, for example for batch 64 we compute batches 0..31 for batch_group_id == 0 and batches 32..65 for batch_group_id == 1
  59     const uint id_in_sub_group = get_sub_group_local_id();
  60
  61     const uint out_id = (id_in_sub_group * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1)) / SUB_GROUP_SIZE + group_id * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1) + (BATCHES_PER_WORK_ITEM * batch_group_id) / SUB_GROUP_SIZE;
  62
  63     uint neuronIdx = id_in_sub_group + group_id * SUB_GROUP_SIZE;
  64
  65     MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
  66
  67     uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * INPUT0_ELEMENTS_COUNT;
  68
  69     uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
  70     for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
  71     {
  72         // read input data in blocks ( 16 batch * 8 x )
  73         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
  74         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset);
  75
  76         MULTIPLY_BLOCKS_16x8(blockC00, blockA00, blockB00)
  77
  78         weight_offset += 128;
  79         input_idx     += 128; // 128 = 16x8 - because of input format which have blocks of 128 elements
  80     }
  81
  82 #if BIAS_TERM
  83     blockC00 += bias[neuronIdx];
  84 #endif // #if BIAS_TERM
  85
  86     blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
  87
  88     vstore16(blockC00, out_id, output);
  89
  90 }
  91
  92 #undef SUB_GROUP_SIZE
  93 #undef ALIGNED_BLOCK_READ8
  94 #undef MULTIPLY_BLOCKS_16x8