inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_imad.cl

   1 // Copyright (c) 2019 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/common.cl"
  17
  18 #include "include/data_types.cl"
  19 #include "include/fetch.cl"
  20 #include "include/imad.cl"
  21
  22 #define SIMD_SIZE         16
  23 #define BYTES_PER_READ    (sizeof(int))
  24 #define BYTES_PER_READ8   (8 * BYTES_PER_READ)
  25
  26 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
  27 KERNEL(fully_connected_gpu_IMAD)(
  28     const __global INPUT0_TYPE* input,
  29     __global OUTPUT_TYPE* output,
  30     const __global FILTER_TYPE* weights
  31 #if BIAS_TERM
  32     , const __global BIAS_TYPE* biases
  33 #endif
  34 #if QUANTIZATION_TERM
  35     ,const __global float* quantizations
  36 #endif
  37 #if CALIBRATION_TERM
  38     ,const __global float* calibrations
  39 #endif
  40     )
  41 {
  42     // This kernel works with linearized data w/o strides and padding
  43     // so only one dimension 'F' is required
  44     const uint f = get_global_id(0);
  45     const uint b = get_global_id(1);
  46
  47     if (f >= OUTPUT_FEATURE_NUM) {
  48         return;
  49     }
  50
  51     int dotProd = 0;
  52
  53     uint idx_w = ((f / SIMD_SIZE) * SIMD_SIZE) * INPUT0_FEATURE_NUM;
  54     const __global INPUT0_TYPE* current_input = &input[GET_DATA_INDEX(INPUT0, b, 0, 0, 0)];
  55
  56     for (uint idx_i = 0; idx_i < INPUT0_FEATURE_NUM; idx_i += BYTES_PER_READ8) {
  57         int input_data = as_int(intel_sub_group_block_read((const __global uint*)(current_input + idx_i)));
  58         int8 activations;  //activations of all lanes
  59         activations.s0 = sub_group_broadcast(input_data, 0);
  60         activations.s1 = sub_group_broadcast(input_data, 1);
  61         activations.s2 = sub_group_broadcast(input_data, 2);
  62         activations.s3 = sub_group_broadcast(input_data, 3);
  63         activations.s4 = sub_group_broadcast(input_data, 4);
  64         activations.s5 = sub_group_broadcast(input_data, 5);
  65         activations.s6 = sub_group_broadcast(input_data, 6);
  66         activations.s7 = sub_group_broadcast(input_data, 7);
  67
  68         int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + idx_w)));
  69         idx_w += SIMD_SIZE * BYTES_PER_READ8;
  70
  71         for (int i = 0; i < 8; i++) {
  72             dotProd = IMAD(dotProd, as_char4(activations[i]), as_char4(weights_data[i]));
  73         }
  74     }
  75
  76 #if BIAS_TERM
  77 #if   BIAS_PER_OUTPUT
  78     const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
  79 #elif BIAS_PER_OFM
  80     const uint bias_index = f;
  81 #endif
  82 #if CALIBRATION_TERM
  83     dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]);
  84 #else  // CALIBRATION_TERM
  85     dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF);
  86 #endif // CALIBRATION_TERM
  87 #endif // BIAS_TERM
  88
  89     const uint out_index = GET_DATA_INDEX(OUTPUT, b, f, 0, 0);
  90     output[out_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
  91 }
  92
  93 #undef SIMD_SIZE
  94 #undef BYTES_PER_READ
  95 #undef BYTES_PER_READ8