inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_bs_f_bsv8_af8_vload.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17 #include "include/sub_group.cl"
  18
  19 #if FP16_UNIT_USED
  20     // Block read - currently block is 4 bytes aligned.
  21     #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
  22
  23     #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB)  \
  24     {   \
  25         const half8 acol0 = TRANSPOSE_BLOCK_8_FP16( _blockA.s0 ); \
  26         const half8 acol1 = TRANSPOSE_BLOCK_8_FP16( _blockA.s1 ); \
  27         const half8 acol2 = TRANSPOSE_BLOCK_8_FP16( _blockA.s2 ); \
  28         const half8 acol3 = TRANSPOSE_BLOCK_8_FP16( _blockA.s3 ); \
  29         const half8 acol4 = TRANSPOSE_BLOCK_8_FP16( _blockA.s4 ); \
  30         const half8 acol5 = TRANSPOSE_BLOCK_8_FP16( _blockA.s5 ); \
  31         const half8 acol6 = TRANSPOSE_BLOCK_8_FP16( _blockA.s6 ); \
  32         const half8 acol7 = TRANSPOSE_BLOCK_8_FP16( _blockA.s7 ); \
  33         _result = fma( _blockB.s0, acol0, _result ); \
  34         _result = fma( _blockB.s1, acol1, _result ); \
  35         _result = fma( _blockB.s2, acol2, _result ); \
  36         _result = fma( _blockB.s3, acol3, _result ); \
  37         _result = fma( _blockB.s4, acol4, _result ); \
  38         _result = fma( _blockB.s5, acol5, _result ); \
  39         _result = fma( _blockB.s6, acol6, _result ); \
  40         _result = fma( _blockB.s7, acol7, _result ); \
  41     }
  42 #else
  43     // Block read - currently block is 4 bytes aligned.
  44     #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
  45
  46     #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB)  \
  47     {   \
  48         const float8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0 ); \
  49         const float8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1 ); \
  50         const float8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2 ); \
  51         const float8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3 ); \
  52         const float8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4 ); \
  53         const float8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5 ); \
  54         const float8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6 ); \
  55         const float8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7 ); \
  56         _result = mad( _blockB.s0, acol0, _result ); \
  57         _result = mad( _blockB.s1, acol1, _result ); \
  58         _result = mad( _blockB.s2, acol2, _result ); \
  59         _result = mad( _blockB.s3, acol3, _result ); \
  60         _result = mad( _blockB.s4, acol4, _result ); \
  61         _result = mad( _blockB.s5, acol5, _result ); \
  62         _result = mad( _blockB.s6, acol6, _result ); \
  63         _result = mad( _blockB.s7, acol7, _result ); \
  64     }
  65 #endif
  66
  67 #define SUB_GROUP_SIZE 8
  68
  69 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
  70 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  71 KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv8_vload)(
  72     const __global UNIT_TYPE* input,
  73     __global UNIT_TYPE* output,
  74     const __global UNIT_TYPE* weight
  75 #if BIAS_TERM
  76     , __global UNIT_TYPE* bias)
  77 #else
  78     )
  79 #endif
  80 {
  81     const uint global_id = get_global_id(0);
  82     const uint group_id = get_group_id(0);
  83     const uint batch_group_id = get_global_id(1); // which part of batches we are computing, for example for batch 64 we compute batches 0..31 for batch_group_id == 0 and batches 32..65 for batch_group_id == 1
  84     const uint id_in_sub_group = get_sub_group_local_id();
  85
  86     const uint out_id = (id_in_sub_group * BATCHES_PER_WORK_ITEM * (uint)get_global_size(1)) / SUB_GROUP_SIZE + group_id * BATCHES_PER_WORK_ITEM * NEURONS_PER_WORK_ITEM * (uint)get_global_size(1) + (BATCHES_PER_WORK_ITEM * batch_group_id) / SUB_GROUP_SIZE;
  87
  88     uint neuronIdx = id_in_sub_group + group_id * SUB_GROUP_SIZE * NEURONS_PER_WORK_ITEM;
  89
  90     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC00 = UNIT_VAL_ZERO;
  91     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC10 = UNIT_VAL_ZERO;
  92
  93 #if BATCHES_PER_WORK_ITEM >= 16
  94     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC01 = UNIT_VAL_ZERO;
  95     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC11 = UNIT_VAL_ZERO;
  96 #if BATCHES_PER_WORK_ITEM >= 32
  97     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC02 = UNIT_VAL_ZERO;
  98     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC12 = UNIT_VAL_ZERO;
  99     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC03 = UNIT_VAL_ZERO;
 100     MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockC13 = UNIT_VAL_ZERO;
 101 #endif
 102 #endif
 103
 104     uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * NEURONS_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
 105 #if NEURONS_PER_WORK_ITEM > 1
 106
 107     uint weight_offset2 = weight_offset + SUB_GROUP_SIZE * INPUT0_ELEMENTS_COUNT;
 108
 109 #endif // #if NEURONS_PER_WORK_ITEM > 1
 110
 111     uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
 112     for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
 113     {
 114         // read input data in blocks ( 8 batch * 8 x )
 115         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
 116 #if BATCHES_PER_WORK_ITEM >= 16
 117         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA01 = ALIGNED_BLOCK_READ8(input, input_idx + (INPUT0_ELEMENTS_COUNT*8));
 118 #if BATCHES_PER_WORK_ITEM >= 32
 119         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA02 = ALIGNED_BLOCK_READ8(input, input_idx + (INPUT0_ELEMENTS_COUNT*16));
 120         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA03 = ALIGNED_BLOCK_READ8(input, input_idx + (INPUT0_ELEMENTS_COUNT*24));
 121 #endif
 122 #endif
 123
 124         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset); weight_offset += 64;
 125
 126         MULTIPLY_BLOCKS_8x8(blockC00, blockA00, blockB00)
 127 #if BATCHES_PER_WORK_ITEM >= 16
 128         MULTIPLY_BLOCKS_8x8(blockC01, blockA01, blockB00)
 129 #if BATCHES_PER_WORK_ITEM >= 32
 130         MULTIPLY_BLOCKS_8x8(blockC02, blockA02, blockB00)
 131         MULTIPLY_BLOCKS_8x8(blockC03, blockA03, blockB00)
 132 #endif
 133 #endif
 134
 135 #if NEURONS_PER_WORK_ITEM > 1
 136
 137         MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB10 = ALIGNED_BLOCK_READ8(weight, weight_offset2); weight_offset2 += 64;
 138
 139         MULTIPLY_BLOCKS_8x8(blockC10, blockA00, blockB10)
 140 #if BATCHES_PER_WORK_ITEM >= 16
 141         MULTIPLY_BLOCKS_8x8(blockC11, blockA01, blockB10)
 142 #if BATCHES_PER_WORK_ITEM >= 32
 143         MULTIPLY_BLOCKS_8x8(blockC12, blockA02, blockB10)
 144         MULTIPLY_BLOCKS_8x8(blockC13, blockA03, blockB10)
 145 #endif
 146 #endif
 147
 148 #endif // #if NEURONS_PER_WORK_ITEM > 1
 149         input_idx += 64; // 64 because of input format which have blocks of 64 elements
 150     }
 151
 152 #if BIAS_TERM
 153     blockC00 += bias[neuronIdx];
 154 #if BATCHES_PER_WORK_ITEM >= 16
 155     blockC01 += bias[neuronIdx];
 156 #if BATCHES_PER_WORK_ITEM >= 32
 157     blockC02 += bias[neuronIdx];
 158     blockC03 += bias[neuronIdx];
 159 #endif
 160 #endif
 161
 162 #if NEURONS_PER_WORK_ITEM > 1
 163
 164     blockC10 += bias[neuronIdx+8];
 165 #if BATCHES_PER_WORK_ITEM >= 16
 166     blockC11 += bias[neuronIdx+8];
 167 #if BATCHES_PER_WORK_ITEM >= 32
 168     blockC12 += bias[neuronIdx+8];
 169     blockC13 += bias[neuronIdx+8];
 170 #endif
 171 #endif
 172
 173 #endif // #if NEURONS_PER_WORK_ITEM > 1
 174 #endif // #if BIAS_TERM
 175     blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
 176 #if BATCHES_PER_WORK_ITEM >= 16
 177     blockC01 = ACTIVATION(blockC01, NL_M, NL_N);
 178 #if BATCHES_PER_WORK_ITEM >= 32
 179     blockC02 = ACTIVATION(blockC02, NL_M, NL_N);
 180     blockC03 = ACTIVATION(blockC03, NL_M, NL_N);
 181 #endif
 182 #endif
 183
 184 #if NEURONS_PER_WORK_ITEM > 1
 185
 186     blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
 187 #if BATCHES_PER_WORK_ITEM >= 16
 188     blockC11 = ACTIVATION(blockC11, NL_M, NL_N);
 189 #if BATCHES_PER_WORK_ITEM >= 32
 190     blockC12 = ACTIVATION(blockC12, NL_M, NL_N);
 191     blockC13 = ACTIVATION(blockC13, NL_M, NL_N);
 192 #endif
 193 #endif
 194
 195 #endif // #if NEURONS_PER_WORK_ITEM > 1
 196
 197     if(neuronIdx >= OUTPUT_ELEMENTS_COUNT)
 198         return;
 199
 200     vstore8(blockC00, out_id, output);
 201 #if BATCHES_PER_WORK_ITEM >= 16
 202     vstore8(blockC01, out_id + 1, output);
 203 #if BATCHES_PER_WORK_ITEM >= 32
 204     vstore8(blockC02, out_id + 2, output);
 205     vstore8(blockC03, out_id + 3, output);
 206 #endif
 207 #endif
 208
 209 #if NEURONS_PER_WORK_ITEM > 1
 210
 211     if(neuronIdx + 8 >= OUTPUT_ELEMENTS_COUNT)
 212         return;
 213
 214     vstore8(blockC10, out_id+INPUT0_BATCH_NUM, output);
 215 #if BATCHES_PER_WORK_ITEM >= 16
 216     vstore8(blockC11, out_id+INPUT0_BATCH_NUM+1, output);
 217 #if BATCHES_PER_WORK_ITEM >= 32
 218     vstore8(blockC12, out_id+INPUT0_BATCH_NUM+2, output);
 219     vstore8(blockC13, out_id+INPUT0_BATCH_NUM+3, output);
 220 #endif
 221 #endif
 222
 223 #endif // #if NEURONS_PER_WORK_ITEM > 1
 224 }
 225
 226 #undef SUB_GROUP_SIZE
 227 #undef ALIGNED_BLOCK_READ8
 228 #undef MULTIPLY_BLOCKS_8x8