inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b8_fp32.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17 #include "include/sub_group.cl"
  18
  19 __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1)))
  20 KERNEL(convolution_gpu_yxfb_yxio_b8)(
  21     const __global float* input,
  22     __global float* output,
  23     const __global float* filter,
  24 #if BIAS_TERM
  25     const __global float* bias,
  26 #endif
  27     uint split_idx)
  28 {
  29     const uint batch_num = INPUT0_BATCH_NUM;
  30
  31     const uint linear_id_xy = get_global_id(1) + get_global_size(1) * get_global_id(2);
  32     // we're computing 8 OUTPUT_FEATURE_MAP so we must divide by 8, but we got 8 batches, so no division is needed.
  33     uint global_id = ((uint)get_global_id(0) / batch_num) * batch_num + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num;
  34
  35     const uint out_batch_id = get_local_id(0);
  36     const uint out_x = get_global_id(1);
  37     const uint out_y = get_global_id(2);
  38
  39     const uint out_id = (global_id / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id;
  40
  41     const uint ofm_offset = (global_id * OFM_PER_WORK_ITEM) / batch_num % FILTER_OFM_NUM;
  42
  43     const uint sub_group_id = get_local_id(0);
  44
  45     float8 _data0 = 0.f;
  46 #if OFM_PER_WORK_ITEM == 16
  47     float8 _data1 = 0.f;
  48 #endif
  49
  50     const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
  51     const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  52
  53     for (uint i = 0; i < FILTER_SIZE_Y; i++)
  54     {
  55         const int input_offset_y = y + i * DILATION_SIZE_Y;
  56         const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  57
  58         if(!zero_y)
  59         {
  60             for (uint j = 0; j < FILTER_SIZE_X; j++)
  61             {
  62                 const int input_offset_x = x + j * DILATION_SIZE_X;
  63                 const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
  64
  65                 if(!zero)
  66                 {
  67                     uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
  68                     input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
  69                     input_idx += out_batch_id;
  70
  71                     //sub_group_id used as offset to make each workitem load different filter, and then shuffle it
  72                     uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;
  73 #if OFM_PER_WORK_ITEM == 16
  74                     uint filter_idx2 = filter_idx + 8;
  75 #endif
  76                     for (uint h = 0; h < FILTER_IFM_NUM / 8; h++)
  77                     {
  78                         float8 _input = as_float8(intel_sub_group_block_read8((const __global uint*)input + input_idx));
  79
  80                         DOT_PRODUCT_8(_data0, _input.s0, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
  81 #if OFM_PER_WORK_ITEM == 16
  82                         DOT_PRODUCT_8(_data1, _input.s0, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
  83 #endif
  84                         DOT_PRODUCT_8(_data0, _input.s1, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
  85 #if OFM_PER_WORK_ITEM == 16
  86                         DOT_PRODUCT_8(_data1, _input.s1, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
  87 #endif
  88                         DOT_PRODUCT_8(_data0, _input.s2, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
  89 #if OFM_PER_WORK_ITEM == 16
  90                         DOT_PRODUCT_8(_data1, _input.s2, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
  91 #endif
  92                         DOT_PRODUCT_8(_data0, _input.s3, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
  93 #if OFM_PER_WORK_ITEM == 16
  94                         DOT_PRODUCT_8(_data1, _input.s3, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
  95 #endif
  96                         DOT_PRODUCT_8(_data0, _input.s4, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
  97 #if OFM_PER_WORK_ITEM == 16
  98                         DOT_PRODUCT_8(_data1, _input.s4, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
  99 #endif
 100                         DOT_PRODUCT_8(_data0, _input.s5, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
 101 #if OFM_PER_WORK_ITEM == 16
 102                         DOT_PRODUCT_8(_data1, _input.s5, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
 103 #endif
 104                         DOT_PRODUCT_8(_data0, _input.s6, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
 105 #if OFM_PER_WORK_ITEM == 16
 106                         DOT_PRODUCT_8(_data1, _input.s6, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
 107 #endif
 108                         DOT_PRODUCT_8(_data0, _input.s7, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
 109 #if OFM_PER_WORK_ITEM == 16
 110                         DOT_PRODUCT_8(_data1, _input.s7, filter[filter_idx2]) filter_idx2 += FILTER_OFM_NUM;
 111 #endif
 112                         input_idx += 8 * INPUT0_FEATURE_PITCH;
 113                     }
 114                     for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++)
 115                     {
 116                         float8 _filter = TRANSPOSE_BLOCK_8(filter[filter_idx]); filter_idx += FILTER_OFM_NUM;
 117                         _data0 = mad(input[input_idx], _filter, _data0);
 118 #if OFM_PER_WORK_ITEM == 16
 119                         float8 _filter2 = TRANSPOSE_BLOCK_8(filter[filter_idx2]); filter_idx2 += FILTER_OFM_NUM;
 120                         _data1 = mad(input[input_idx], _filter2, _data1);
 121 #endif
 122                         input_idx += INPUT0_FEATURE_PITCH;
 123                     }
 124                 }
 125             }
 126         }
 127     }
 128
 129 #if BIAS_TERM
 130     ADD_BIAS_8(_data0, bias[ofm_offset + sub_group_id]);
 131 #if OFM_PER_WORK_ITEM == 16
 132     ADD_BIAS_8(_data1, bias[ofm_offset + sub_group_id + 8]);
 133 #endif
 134 #endif // #if BIAS_TERM
 135     _data0 = ACTIVATION(_data0, NL_M, NL_N);
 136 #if OFM_PER_WORK_ITEM == 16
 137     _data1 = ACTIVATION(_data1, NL_M, NL_N);
 138 #endif
 139
 140     const uint _out_id = OUTPUT_OFFSET + out_id;
 141     intel_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
 142 #if OFM_PER_WORK_ITEM == 16
 143     intel_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
 144 #endif
 145 }