inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b16_fp32.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/include_all.cl"
  17 #include "include/sub_group.cl"
  18
  19 KERNEL(convolution_gpu_yxfb_yxio_b16)(
  20     const __global float* input,
  21     __global UNIT_TYPE* output,
  22     const __global float* filter,
  23 #if BIAS_TERM
  24     const __global float* bias,
  25 #endif
  26     uint split_idx)
  27 {
  28     // get_global_size(0) -> Number of work items needed to compute all features and all batches for single output spatial position
  29     //                       (single (x, y) point in output).
  30     // get_global_size(1) -> Output size in X-dimension.
  31     // get_global_size(2) -> Output size in Y-dimension.
  32     // get_global_id(0)   -> Id of work item computing single spatial point of output indicated by get_global_id(1), get_global_id(2).
  33     // get_global_id(1)   -> Current x-position in output.
  34     // get_global_id(2)   -> Current y-position in output.
  35     //
  36     // WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS -> Number of work items needed to compute entire one batch for at least one feature and one spatial point.
  37     //                                           (this number in current implementation computes also OFM_PER_WORK_ITEM output features at the same time).
  38     // FILTER_ARRAY_NUM                       -> Number of filters groups (split size).
  39
  40     const uint out_x = get_global_id(1);
  41     const uint out_y = get_global_id(2);
  42
  43     const uint output_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
  44     const uint output_x_size = OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X;
  45     const uint linear_id_xy = OUTPUT_PAD_BEFORE_SIZE_X + out_x + output_x_size * (out_y + OUTPUT_PAD_BEFORE_SIZE_Y);
  46     uint global_id = (((uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (output_f_size / OFM_PER_WORK_ITEM)) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS;
  47
  48     const uint sub_group_id = get_local_id(0);
  49
  50 #if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1)
  51     const uint chunk_size = sizeof(uint)/sizeof(UNIT_TYPE);
  52 #else
  53     const uint chunk_size = 1;
  54 #endif
  55
  56     const uint out_batch_id = chunk_size * sub_group_id + LOCAL_WORK_GROUP_SIZE * BATCHES_PER_WORK_ITEM * ((uint)get_group_id(0) % LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS);
  57
  58     const uint out_id = (global_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_FEATURE_NUM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_BATCH_NUM + out_batch_id;
  59
  60     const uint ofm_offset = ((global_id * OFM_PER_WORK_ITEM) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) % output_f_size;
  61
  62     // Each component of vector element contains computation for separate output feature.
  63     float8 _data[BATCHES_PER_WORK_ITEM];
  64     for(uint i = 0; i < BATCHES_PER_WORK_ITEM; i++)
  65     {
  66         _data[i] = UNIT_VAL_ZERO;
  67     }
  68
  69     const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
  70     const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  71
  72     for (uint i = 0; i < FILTER_SIZE_Y; i++)
  73     {
  74         const int input_offset_y = y + i * DILATION_SIZE_Y;
  75         const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  76
  77         if(!zero_y)
  78         {
  79             for (uint j = 0; j < FILTER_SIZE_X; j++)
  80             {
  81                 const int input_offset_x = x + j * DILATION_SIZE_X;
  82                 const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
  83
  84                 if(!zero)
  85                 {
  86                     uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
  87                     input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
  88                     input_idx += out_batch_id;
  89
  90                     //sub_group_id used as offset to make each workitem load different filter, and then shuffle it
  91                     uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;
  92
  93                     for (uint h = 0; h < FILTER_IFM_NUM; h++)
  94                     {
  95 #ifdef USE_BLOCK_READ_2
  96                         float2 _input = as_float2(intel_sub_group_block_read2((const __global uint*)input + input_idx));
  97                         float8 filter_transp = TRANSPOSE_BLOCK_8(filter[filter_idx]);
  98                         _data[0] = fma(_input.s0, filter_transp, _data[0]);
  99                         _data[1] = fma(_input.s1, filter_transp, _data[1]);
 100                         input_idx += INPUT0_FEATURE_PITCH;
 101 #else
 102                         float8 filter_transp = TRANSPOSE_BLOCK_8(filter[filter_idx]);
 103                         for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
 104                         {
 105                             _data[s] = fma(input[input_idx], filter_transp, _data[s]);
 106                             input_idx += LOCAL_WORK_GROUP_SIZE;
 107                         }
 108                         input_idx += INPUT0_FEATURE_PITCH - BATCHES_PER_WORK_ITEM * LOCAL_WORK_GROUP_SIZE;
 109 #endif
 110                         filter_idx += FILTER_IFM_PITCH;
 111                     }
 112                 }
 113             }
 114         }
 115     }
 116
 117 #if BIAS_TERM
 118     float bias_val = bias[ofm_offset + sub_group_id];
 119     for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
 120     {
 121         ADD_BIAS_8(_data[s], bias_val);
 122     }
 123 #endif
 124     for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
 125     {
 126         _data[s] = ACTIVATION(_data[s], NL_M, NL_N);
 127     }
 128
 129     for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
 130     {
 131         uint _out_id = OUTPUT_VIEW_OFFSET + out_id + s * LOCAL_WORK_GROUP_SIZE;
 132         output[_out_id] = _data[s].s0; _out_id += OUTPUT_FEATURE_PITCH;
 133         output[_out_id] = _data[s].s1; _out_id += OUTPUT_FEATURE_PITCH;
 134         output[_out_id] = _data[s].s2; _out_id += OUTPUT_FEATURE_PITCH;
 135         output[_out_id] = _data[s].s3; _out_id += OUTPUT_FEATURE_PITCH;
 136         output[_out_id] = _data[s].s4; _out_id += OUTPUT_FEATURE_PITCH;
 137         output[_out_id] = _data[s].s5; _out_id += OUTPUT_FEATURE_PITCH;
 138         output[_out_id] = _data[s].s6; _out_id += OUTPUT_FEATURE_PITCH;
 139         output[_out_id] = _data[s].s7; _out_id += OUTPUT_FEATURE_PITCH;
 140     }
 141 }