inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl

   1 // Copyright (c) 2016-2020 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/common.cl"
  17
  18 #include "include/data_types.cl"
  19 #include "include/fetch.cl"
  20 #include "include/mmad.cl"
  21
  22 #define INPUT_PACKED_TYPE_8  CAT(INPUT_PACKED_TYPE, 8)
  23 #define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8)
  24
  25 #define AS_TYPE(type, val) CAT(as_, type)(val)
  26
  27 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  28 KERNEL(fully_connected_gpu_MMAD)(
  29     const __global INPUT0_TYPE* input,
  30     __global OUTPUT_TYPE* output,
  31     const __global FILTER_TYPE* weights
  32 #if BIAS_TERM
  33     , const __global BIAS_TYPE* biases
  34 #endif
  35 #if HAS_FUSED_OPS_DECLS
  36     , FUSED_OPS_DECLS
  37 #endif
  38     )
  39 {
  40 #if OUTPUT_BATCH_NUM == 1
  41     const uint f = (uint)get_global_id(0);
  42     const uint b = 0;
  43 #else
  44     const uint f = (uint)get_global_id(0);
  45     const uint b = (uint)get_global_id(1);
  46 #endif
  47
  48     int dotProd = 0;
  49
  50     const uint filter_offset = FILTER_GET_OFFSET(f);
  51 #if INPUT0_DIMS == 5
  52     const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0, 0);
  53 #else
  54     const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0);
  55 #endif
  56
  57 #if SPATIAL_MAJOR
  58     for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
  59 #   if !SPLIT_SPATIAL
  60         for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
  61 #   else
  62         for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
  63         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
  64         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
  65             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
  66 #endif
  67 #else  // SPATIAL_MAJOR
  68 #   if !SPLIT_SPATIAL
  69     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
  70 #   else
  71     for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
  72     for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
  73     for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
  74         const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
  75 #   endif
  76         for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
  77 #endif
  78 #if !SPLIT_SPATIAL
  79             uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
  80 #else
  81             uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
  82 #endif
  83             uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
  84
  85             uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
  86             INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
  87
  88             INPUT_PACKED_TYPE_8 activations;  //activations of all lanes
  89             activations.s0 = sub_group_broadcast(input_data, 0);
  90             activations.s1 = sub_group_broadcast(input_data, 1);
  91             activations.s2 = sub_group_broadcast(input_data, 2);
  92             activations.s3 = sub_group_broadcast(input_data, 3);
  93             activations.s4 = sub_group_broadcast(input_data, 4);
  94             activations.s5 = sub_group_broadcast(input_data, 5);
  95             activations.s6 = sub_group_broadcast(input_data, 6);
  96             activations.s7 = sub_group_broadcast(input_data, 7);
  97
  98             uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
  99             FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
 100
 101             dotProd = MMAD_8(activations, weights_data, dotProd);
 102         }
 103     }
 104
 105 #if HAS_FEATURE_LEFTOVERS
 106         const uint lid = get_sub_group_local_id();
 107 #if SPATIAL_MAJOR
 108 #if !SPLIT_SPATIAL
 109         for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
 110 #else
 111         for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
 112         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
 113         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
 114             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
 115 #endif  // !SPLIT_SPATIAL
 116
 117 #else  // SPATIAL_MAJOR
 118 #if !SPLIT_SPATIAL
 119     for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
 120 #else  // !SPLIT_SPATIAL
 121     for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
 122         for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
 123         for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
 124             const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
 125 #endif  // !SPLIT_SPATIAL
 126 #endif  // SPATIAL_MAJOR
 127
 128 #if !SPLIT_SPATIAL
 129             uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
 130 #else  // !SPLIT_SPATIAL
 131             uint input_idx = input_offset + FEATURE_BLOCK_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
 132 #endif  // !SPLIT_SPATIAL
 133             uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
 134
 135             MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
 136             for (uint i = 0; i < 4; i++) {
 137                 if (FEATURE_BLOCKS_COUNT*32 + lid*4 + i < INPUT0_FEATURE_NUM) {
 138                     input_data_u[i] = input[input_idx + (lid*4 + i)*INPUT0_FEATURE_PITCH];
 139                 }
 140             }
 141             INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
 142
 143             INPUT_PACKED_TYPE_8 activations;  //activations of all lanes
 144             activations.s0 = sub_group_broadcast(input_data, 0);
 145             activations.s1 = sub_group_broadcast(input_data, 1);
 146             activations.s2 = sub_group_broadcast(input_data, 2);
 147             activations.s3 = sub_group_broadcast(input_data, 3);
 148             activations.s4 = sub_group_broadcast(input_data, 4);
 149             activations.s5 = sub_group_broadcast(input_data, 5);
 150             activations.s6 = sub_group_broadcast(input_data, 6);
 151             activations.s7 = sub_group_broadcast(input_data, 7);
 152
 153             uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
 154             FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
 155
 156             dotProd = MMAD_8(activations, weights_data, dotProd);
 157         }
 158 #endif  // HAS_FEATURE_LEFTOVERS
 159
 160     if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && f >= OUTPUT_FEATURE_NUM)
 161         return;
 162
 163 #if BIAS_TERM
 164 #if   BIAS_PER_OUTPUT
 165     const uint bias_index = GET_DATA_INDEX(BIAS, b, f, 0, 0);
 166 #elif BIAS_PER_OFM
 167     const uint bias_index = f;
 168 #endif
 169
 170     float dequantized = (float)dotProd + biases[bias_index];
 171 #else  // BIAS_TERM
 172     float dequantized = (float)dotProd;
 173 #endif
 174
 175     const uint out_idx = OUTPUT_GET_INDEX(b, f, 0, 0);
 176
 177 #if HAS_FUSED_OPS
 178     FUSED_OPS;
 179     OUTPUT_TYPE res = FUSED_OPS_RESULT;
 180
 181     output[out_idx] = res;
 182 #else
 183     output[out_idx] = TO_OUTPUT_TYPE(dequantized);
 184 #endif
 185 }
 186
 187 #undef INPUT_PACKED_TYPE_8
 188 #undef FILTER_PACKED_TYPE_8
 189 #undef AS_TYPE