inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/include_all.cl"
  16 #include "include/sub_group.cl"
  17
  18 __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1)))
  19 KERNEL(convolution_gpu_yxfb_yxio_b1_block)(
  20     const __global float* input,
  21     __global float* output,
  22     const __global float* filter,
  23 #if BIAS_TERM
  24     const __global float* bias,
  25 #endif
  26     uint split_idx)
  27 {
  28 #ifdef USE_VECTOR_8
  29     #define VECTOR_FLOAT float8
  30     #define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
  31     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
  32 #endif
  33 #ifdef USE_VECTOR_4
  34     #define VECTOR_FLOAT float4
  35     #define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
  36     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
  37 #endif
  38 #ifdef USE_VECTOR_2
  39     #define VECTOR_FLOAT float2
  40     #define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
  41     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
  42 #endif
  43
  44     const uint batch_num = INPUT0_BATCH_NUM;
  45     const uint linear_id_xy = get_group_id(1) + get_global_size(1) * get_group_id(2);
  46     uint global_id = (((uint)get_group_id(0) * LOCAL_WORK_GROUP_SIZE) / batch_num) * batch_num + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num;
  47
  48     const uint out_batch_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
  49     const uint out_x = get_group_id(1);
  50     const uint out_y = get_group_id(2);
  51
  52     const uint out_id = (global_id / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id;
  53
  54     const uint ofm_offset = (global_id * (OFM_PER_WORK_ITEM / batch_num)) % FILTER_OFM_NUM;
  55
  56     const uint sub_group_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
  57
  58     VECTOR_FLOAT _data0 = 0.f;
  59
  60     const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
  61     const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  62
  63     for (uint i = 0; i < FILTER_SIZE_Y; i++)
  64     {
  65         const int input_offset_y = y + i * DILATION_SIZE_Y;
  66         const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  67
  68         if(!zero_y)
  69         {
  70             for (uint j = 0; j < FILTER_SIZE_X; j++)
  71             {
  72                 const int input_offset_x = x + j * DILATION_SIZE_X;
  73                 const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
  74
  75                 if(!zero)
  76                 {
  77                     uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
  78                     input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
  79                     input_idx += out_batch_id;
  80
  81                     uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;
  82
  83 #if INPUT0_BATCH_NUM == 1
  84                     for(uint h = 0; h < FILTER_IFM_NUM / 8; h++)
  85                     {
  86                         float _in = as_float(intel_sub_group_block_read((const __global uint*)input + input_idx));
  87                         float8 _input = TRANSPOSE_BLOCK_8(_in);
  88
  89                         VECTOR_FLOAT _filter;
  90                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
  91                         _data0 = mad(_input.s0, _filter, _data0);
  92
  93                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
  94                         _data0 = mad(_input.s1, _filter, _data0);
  95
  96                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
  97                         _data0 = mad(_input.s2, _filter, _data0);
  98
  99                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 100                         _data0 = mad(_input.s3, _filter, _data0);
 101
 102                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 103                         _data0 = mad(_input.s4, _filter, _data0);
 104
 105                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 106                         _data0 = mad(_input.s5, _filter, _data0);
 107
 108                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 109                         _data0 = mad(_input.s6, _filter, _data0);
 110
 111                         _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 112                         _data0 = mad(_input.s7, _filter, _data0);
 113
 114                         input_idx += 8 * INPUT0_FEATURE_PITCH;
 115                     }
 116                     for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++)
 117 #else
 118                     for (uint h = 0; h < FILTER_IFM_NUM; h++)
 119 #endif
 120                     {
 121                         VECTOR_FLOAT _filter = BLOCK_READ(filter + filter_idx);
 122                         _data0 = mad(input[input_idx], _filter, _data0);
 123                         filter_idx += FILTER_IFM_PITCH;
 124                         input_idx += INPUT0_FEATURE_PITCH;
 125                     }
 126                 }
 127             }
 128         }
 129     }
 130
 131 #if BIAS_TERM
 132     _data0 += BLOCK_READ(bias + ofm_offset);
 133 #endif
 134     _data0 = ACTIVATION(_data0, NL_M, NL_N);
 135
 136     uint _out_id = OUTPUT_OFFSET + out_id;
 137     BLOCK_WRITE(output + _out_id, _data0);
 138 #if defined(USE_VECTOR_8) || defined(USE_VECTOR_4) || defined(USE_VECTOR_2)
 139     #undef VECTOR_FLOAT
 140     #undef BLOCK_READ
 141     #undef BLOCK_WRITE
 142 #endif
 143 }