inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl

// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14
  15 #include "include/include_all.cl"
  16
  17 #define OBS 8
  18 __attribute__((intel_reqd_sub_group_size(8)))
  19 KERNEL(convolution)(
  20     __global INPUT0_TYPE* input,
  21     __global OUTPUT_TYPE* output,
  22     __global FILTER_TYPE* weights,
  23 #if BIAS_TERM
  24     __global BIAS_TYPE* biases,
  25 #endif
  26 #if QUANTIZATION_TERM
  27     __global float* quantizations,
  28 #endif
  29 #if CALIBRATION_TERM
  30     __global float* calibrations,
  31 #endif
  32     uint split_idx)
  33 {
  34     const uint f_pack = (get_group_id(0) * 32) % OUTPUT_FEATURE_NUM;
  35     const uint b = (get_group_id(0) * 32) / OUTPUT_FEATURE_NUM;
  36
  37     const uint x = get_group_id(1) * OBS;
  38     const uint y = get_group_id(2);
  39
  40     int4 dotProd[OBS] = { 0 };
  41
  42     const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
  43     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  44
  45     const uint filter_offset = f_pack*FILTER_OFM_PITCH;
  46     const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET;
  47
  48     for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
  49     {
  50         const int input_offset_y = input_y + j;
  51         for (uint i = 0; i < FILTER_SIZE_X ; ++i)
  52         {
  53             const int input_offset_x = input_x + i + STRIDE_SIZE_X * get_sub_group_local_id();
  54             uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH;
  55             uint filter_idx = filter_offset + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
  56
  57             char input_data[3];
  58             char2 _i = vload2(0, input + input_idx);
  59             input_data[0] = _i.s0;
  60             input_data[1] = _i.s1;
  61             input_data[2] = input[input_idx + 2];
  62
  63             for (uint k = 0; k < FILTER_IFM_NUM; ++k)
  64             {
  65                 char4 w_data = as_char4(intel_sub_group_block_read((const __global uint*)(weights + filter_idx)));
  66                 for(uint r = 0; r < OBS; r++)
  67                 {
  68                     char in = intel_sub_group_shuffle(input_data[k], r);
  69                     for(uint c = 0; c < 4; c++)
  70                     {
  71                         dotProd[r][c] += (int)in * (int)w_data[c];
  72                     }
  73                 }
  74                 filter_idx += FILTER_IFM_PITCH;
  75             }
  76         }
  77     }
  78
  79
  80 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f_pack, y, x + get_sub_group_local_id());
  81 const uint _f_idx = f_pack + get_sub_group_local_id() * 4;
  82 float4 quants = vload4(0, quantizations + _f_idx );
  83 float4 calibs = vload4(0, calibrations + _f_idx );
  84 float4 bias = vload4(0, biases + _f_idx );
  85 for(uint r = 0; r < OBS; r++)
  86 {
  87     char4 char_output;
  88     for(uint c = 0; c < 4; c++)
  89     {
  90         const uint f_idx = f_pack + get_sub_group_local_id() * 4 + c;
  91     #if BIAS_TERM
  92         const uint bias_index = f_idx;
  93     #if CALIBRATION_TERM
  94         dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * calibs[c]);
  95     #else  // CALIBRATION_TERM
  96         dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * O_QF);
  97     #endif // CALIBRATION_TERM
  98     #endif
  99         char_output[c] = ACTIVATION(convert_char(dotProd[r][c]), NL_M, NL_N);
 100     }
 101     const uint out_idx = intel_sub_group_shuffle(dst_index, r);
 102     intel_sub_group_block_write( (__global uint*)(output + out_idx) , as_uint(char_output));
 103 }
 104
 105 }