inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_multiple_x_fp32.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/include_all.cl"
  16 #include "include/sub_group.cl"
  17
  18 __attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1)))
  19 KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
  20     const __global float* input,
  21     __global float* output,
  22     const __global float* filter,
  23 #if BIAS_TERM
  24     const __global float* bias,
  25 #endif
  26     uint split_idx)
  27 {
  28 #if USE_VECTOR == 8
  29     #define VECTOR_FLOAT float8
  30     #define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
  31     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
  32 #endif
  33 #if USE_VECTOR == 4
  34     #define VECTOR_FLOAT float4
  35     #define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
  36     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
  37 #endif
  38 #if USE_VECTOR == 2
  39     #define VECTOR_FLOAT float2
  40     #define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
  41     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
  42 #endif
  43 #if USE_VECTOR == 1
  44     #define VECTOR_FLOAT float
  45     #define BLOCK_READ(IN) as_float(intel_sub_group_block_read((const __global uint*)IN))
  46     #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write((__global uint*)OUT, as_uint(DATA));
  47 #endif
  48
  49     const uint batch_num = INPUT0_BATCH_NUM;
  50     const uint linear_id_xy = (uint)get_group_id(1) * X_PER_WORK_ITEM + OUTPUT_SIZE_X * (uint)get_group_id(2);
  51     uint global_id = (((uint)get_group_id(0) * LOCAL_WORK_GROUP_SIZE) / batch_num) * batch_num + ( linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num;
  52
  53     const uint out_batch_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
  54     const uint out_x = (uint)get_group_id(1) * X_PER_WORK_ITEM;
  55     const uint out_y = get_group_id(2);
  56
  57     uint out_id[X_PER_WORK_ITEM];
  58     for(uint i = 0; i < X_PER_WORK_ITEM; i++)
  59     {
  60         out_id[i] = OUTPUT_OFFSET + ( (global_id + i * FILTER_ARRAY_NUM * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * INPUT0_BATCH_NUM) / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id;
  61     }
  62
  63     const uint ofm_offset = (global_id * (OFM_PER_WORK_ITEM / batch_num)) % FILTER_OFM_NUM;
  64
  65     const uint sub_group_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
  66
  67     VECTOR_FLOAT _data[X_PER_WORK_ITEM];
  68     for(uint i = 0; i < X_PER_WORK_ITEM; i++)
  69     {
  70         _data[i] = 0.0f;
  71     }
  72
  73     const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
  74     const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  75
  76     for (uint i = 0; i < FILTER_SIZE_Y; i++)
  77     {
  78         const int input_offset_y = y + i * DILATION_SIZE_Y;
  79         const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  80
  81         if(!zero_y)
  82         {
  83             for (uint j = 0; j < FILTER_SIZE_X; j++)
  84             {
  85                 const int input_offset_x = x + j * DILATION_SIZE_X;
  86
  87                 bool zero_x[X_PER_WORK_ITEM];
  88                 for(int z = 0; z < X_PER_WORK_ITEM; z++)
  89                 {
  90                     zero_x[z] = (input_offset_x + z * STRIDE_SIZE_X) >= INPUT0_SIZE_X || (input_offset_x + z * STRIDE_SIZE_X) < 0;
  91                 }
  92
  93                 VECTOR_FLOAT _tmp[X_PER_WORK_ITEM];
  94                 for(uint t = 0; t < X_PER_WORK_ITEM; t++)
  95                 {
  96                     _tmp[t] = 0.f;
  97                 }
  98
  99                 uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
 100                 input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
 101                 input_idx += out_batch_id;
 102
 103                 uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;
 104
 105 #if FILTER_IFM_NUM >= 8
 106                 for(uint h = 0; h < FILTER_IFM_NUM / 8; h++)
 107                 {
 108                     float _in[X_PER_WORK_ITEM];
 109                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 110                     {
 111                         _in[a] = as_float(intel_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
 112                     }
 113                     float8 _input[X_PER_WORK_ITEM];
 114                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 115                     {
 116                         _input[a] = TRANSPOSE_BLOCK_8(_in[a]);
 117                     }
 118
 119                     VECTOR_FLOAT _filter;
 120                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 121                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 122                     {
 123                         _tmp[a] = mad(_input[a].s0, _filter, _tmp[a]);
 124                     }
 125
 126                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 127                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 128                     {
 129                         _tmp[a] = mad(_input[a].s1, _filter, _tmp[a]);
 130                     }
 131
 132                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 133                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 134                     {
 135                         _tmp[a] = mad(_input[a].s2, _filter, _tmp[a]);
 136                     }
 137
 138                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 139                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 140                     {
 141                         _tmp[a] = mad(_input[a].s3, _filter, _tmp[a]);
 142                     }
 143
 144
 145                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 146                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 147                     {
 148                         _tmp[a] = mad(_input[a].s4, _filter, _tmp[a]);
 149                     }
 150
 151                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 152                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 153                     {
 154                         _tmp[a] = mad(_input[a].s5, _filter, _tmp[a]);
 155                     }
 156
 157                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 158                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 159                     {
 160                         _tmp[a] = mad(_input[a].s6, _filter, _tmp[a]);
 161                     }
 162
 163                     _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
 164                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 165                     {
 166                         _tmp[a] = mad(_input[a].s7, _filter, _tmp[a]);
 167                     }
 168
 169                     input_idx += 8 * INPUT0_FEATURE_PITCH;
 170                 }
 171                 for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++)
 172 #else
 173                 for (uint h = 0; h < FILTER_IFM_NUM; h++)
 174 #endif
 175                 {
 176                     VECTOR_FLOAT _filter = BLOCK_READ(filter + filter_idx);
 177                     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 178                     {
 179                         _tmp[a] = mad(input[input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X], _filter, _tmp[a]);
 180                     }
 181                     filter_idx += FILTER_IFM_PITCH;
 182                     input_idx += INPUT0_FEATURE_PITCH;
 183                 }
 184                 for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 185                 {
 186                     if(!zero_x[a])
 187                         _data[a] += _tmp[a];
 188                 }
 189             }
 190         }
 191     }
 192
 193 #if BIAS_TERM
 194     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 195     {
 196         _data[a] += BLOCK_READ(bias + ofm_offset);
 197     }
 198 #endif
 199     for(uint a = 0; a < X_PER_WORK_ITEM; a++)
 200     {
 201         _data[a] = ACTIVATION(_data[a], NL_M, NL_N);
 202     }
 203
 204     BLOCK_WRITE(output + out_id[0], _data[0]);
 205     for(uint a = 1; a < X_PER_WORK_ITEM; a++)
 206     {
 207         if(out_x + a < OUTPUT_SIZE_X)
 208         {
 209             BLOCK_WRITE(output + out_id[a], _data[a]);
 210         }
 211     }
 212
 213 #if defined(USE_VECTOR)
 214     #undef VECTOR_FLOAT
 215     #undef BLOCK_READ
 216     #undef BLOCK_WRITE
 217 #endif
 218 }