platform/upstream/dldt.git: inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_yxfb_yxio_b16_fp16.cl
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/include_all.cl"
#include "include/sub_group.cl"

__attribute__((intel_reqd_sub_group_size(16)))
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(fused_conv_eltwise_gpu_yxfb_yxio_b16)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* filter,
#if BIAS_TERM
    const __global UNIT_TYPE* bias,
#endif
    uint split_idx,
    const __global UNIT_TYPE* input2)
{
    // get_global_size(0) -> Number of work items needed to compute all features and all batches for a single output spatial position
    //                       (a single (x, y) point in the output).
    // get_global_size(1) -> Output size in the X dimension.
    // get_global_size(2) -> Output size in the Y dimension.
    // get_global_id(0)   -> Id of the work item computing a single spatial point of the output, indicated by get_global_id(1), get_global_id(2).
    // get_group_id(1)    -> Current x-position in the output.
    // get_group_id(2)    -> Current y-position in the output.
    //
    // WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS -> Number of work items needed to compute one entire batch for at least one feature and one spatial point.
    //                                           (In the current implementation this group of work items also computes OFM_PER_WORK_ITEM output features at the same time.)
    // FILTER_ARRAY_NUM                       -> Number of filter groups (split size).

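    // Illustrative example (assumed sizes, not fixed by this kernel): with a batch of 32, fp16 data,
    // BATCHES_PER_WORK_ITEM = 2 and OFM_PER_WORK_ITEM = 16, one sub-group of 16 work items covers all
    // 32 batch elements for 16 output features, so WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS = 16 and
    // get_global_size(0) = 16 * (padded output feature count / 16) for each output (x, y) position.
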
    const uint out_x = get_group_id(1);
    const uint out_y = get_group_id(2);

    const uint output_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
    const uint output_x_size = OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X;
    const uint linear_id_xy = OUTPUT_PAD_BEFORE_SIZE_X + out_x + output_x_size * (out_y + OUTPUT_PAD_BEFORE_SIZE_Y);
    uint global_id = (((uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (output_f_size / OFM_PER_WORK_ITEM)) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS;
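    // global_id points at the first work item of the WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS-sized group
    // that handles this (out_x, out_y) position, split group and block of OFM_PER_WORK_ITEM output features.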

    const uint sub_group_id = get_local_id(0);

#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1)
    const uint chunk_size = sizeof(uint)/sizeof(UNIT_TYPE);
#else
    const uint chunk_size = 1;
#endif

    const uint out_batch_id = chunk_size * sub_group_id + LOCAL_WORK_GROUP_SIZE * BATCHES_PER_WORK_ITEM * ((uint)get_group_id(0) % LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS);

    const uint out_id = (global_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_FEATURE_NUM * OUTPUT_FEATURE_PITCH + OUTPUT_PAD_BEFORE_BATCH_NUM + out_batch_id;

    const uint ofm_offset = ((global_id * OFM_PER_WORK_ITEM) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) % output_f_size;
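    // out_batch_id -> first batch element handled by this work item.
    // out_id       -> linear output offset of that batch element at this work item's first output feature.
    // ofm_offset   -> first output feature (within the padded feature range) computed by this sub-group.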

#if IN_OUT_OPT != 1 // calculating eltwise offset
    const uint eltw_x = out_x * ELTW_STRIDE_X;
    const uint eltw_y = out_y * ELTW_STRIDE_Y;

    const uint eltw_f_size = INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM;
    const uint eltw_x_size = INPUT1_PAD_BEFORE_SIZE_X + INPUT1_SIZE_X + INPUT1_PAD_AFTER_SIZE_X;

    const uint eltw_linear_id_xy = INPUT1_PAD_BEFORE_SIZE_X + eltw_x + eltw_x_size * (eltw_y + INPUT1_PAD_BEFORE_SIZE_Y);

    uint eltw_global_id = (((uint)get_global_id(0) / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) + (eltw_linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (eltw_f_size / OFM_PER_WORK_ITEM)) * WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS;
    const uint eltw_id = (eltw_global_id / WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS) * OFM_PER_WORK_ITEM * INPUT1_FEATURE_PITCH + INPUT1_PAD_BEFORE_FEATURE_NUM * INPUT1_FEATURE_PITCH + INPUT1_PAD_BEFORE_BATCH_NUM + out_batch_id;
#endif
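    // eltw_id mirrors out_id, but expressed in the layout of the second eltwise input (input2): the same
    // batch element and block of OFM_PER_WORK_ITEM features at the (ELTW_STRIDE_X/Y-scaled) spatial point.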

    // Each component of a _data vector accumulates a separate output feature.
    half16 _data[BATCHES_PER_WORK_ITEM];
    for(uint i = 0; i < BATCHES_PER_WORK_ITEM; i++)
    {
        _data[i] = UNIT_VAL_ZERO;
    }

    const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
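    // (x, y) is the top-left corner of this output point's receptive field in the input; it may be negative
    // or extend past the input because of padding, and such positions are skipped in the loops below.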

    for (uint i = 0; i < FILTER_SIZE_Y; i++)
    {
        const int input_offset_y = y + i * DILATION_SIZE_Y;
        const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;

        if(!zero_y)
        {
            for (uint j = 0; j < FILTER_SIZE_X; j++)
            {
                const int input_offset_x = x + j * DILATION_SIZE_X;
                const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;

                if(!zero)
                {
                    uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
                    input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
                    input_idx += out_batch_id;

                    // sub_group_id is used as an offset so that each work item loads a different filter value, which is then shuffled across the sub-group.
                    // 2 * sub_group_id is used because two halves are packed into one uint element.
                    uint filter_idx = ofm_offset + 2*sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;

                    for (uint h = 0; h < FILTER_IFM_NUM; h++)
                    {
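                        // The block-read variants fetch the current input feature for a block of batch elements
                        // (two fp16 values packed per uint, per lane), while filter_val_pair holds two adjacent
                        // output-feature weights; TRANSPOSE_BLOCK_16_FP16 (from the shared sub_group include)
                        // gathers those pairs across the sub-group into a half16 of 16 consecutive output-feature
                        // weights used by the fma below.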
#if defined(USE_BLOCK_READ_2)
                        half4 _input = as_half4(intel_sub_group_block_read2((const __global uint*)(input + input_idx)));
                        uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
                        half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
                        _data[0] = fma(_input.s0, filter_transp, _data[0]);
                        _data[1] = fma(_input.s1, filter_transp, _data[1]);
                        _data[2] = fma(_input.s2, filter_transp, _data[2]);
                        _data[3] = fma(_input.s3, filter_transp, _data[3]);
                        input_idx += INPUT0_FEATURE_PITCH;
#elif defined(USE_BLOCK_READ_1)
                        half2 _input = as_half2(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
                        uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
                        half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
                        _data[0] = fma(_input.s0, filter_transp, _data[0]);
                        _data[1] = fma(_input.s1, filter_transp, _data[1]);
                        input_idx += INPUT0_FEATURE_PITCH;
#else
                        uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
                        half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
                        for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
                        {
                            _data[s] = fma(input[input_idx], filter_transp, _data[s]);
                            input_idx += LOCAL_WORK_GROUP_SIZE;
                        }
                        input_idx += INPUT0_FEATURE_PITCH - BATCHES_PER_WORK_ITEM * LOCAL_WORK_GROUP_SIZE;
#endif
                        filter_idx += FILTER_IFM_PITCH;
                    }
                }
            }
        }
    }

#if BIAS_TERM
    uint bias_val_pair = *(const __global uint*)(bias + (ofm_offset + 2 * sub_group_id));
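    // Each lane loads a pair of adjacent fp16 bias values as one uint; ADD_BIAS_16_FP16 (from the shared
    // includes) distributes the biases over the 16 accumulated output features of each _data[s].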
    for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
    {
        ADD_BIAS_16_FP16(_data[s], bias_val_pair);
    }
#endif
    for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
    {
        _data[s] = ACTIVATION(_data[s], NL_M, NL_N);
    }

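    // Fused eltwise + store: add the second operand (the current output contents when IN_OUT_OPT == 1,
    // otherwise input2 at eltw_id), apply the eltwise activation, then write the result. Three paths
    // follow: paired fp16 stores with 4 batches per work item, paired stores over batch chunks, and a
    // scalar path when block reads are not used.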
#if defined(USE_BLOCK_READ_2) || defined(USE_BLOCK_READ_1)
    #if BATCHES_PER_WORK_ITEM == 4
        uint _out_id = OUTPUT_VIEW_OFFSET + out_id;
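        // With 4 batches per work item, each lane handles two adjacent batch elements at _out_id and two
        // more 32 batch slots further on (the hard-coded "+ 32" below), once per output feature i.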
        for(uint i = 0; i < 16; i++)
        {
#if IN_OUT_OPT == 1
            half2 eltw_second_input_data0 = as_half2(*(__global uint*)(output + _out_id ));
            half2 eltw_second_input_data1 = as_half2(*(__global uint*)(output + _out_id + 32));
#else
            uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id;
            half2 eltw_second_input_data0 = as_half2(*(__global uint*)(input2 + _eltw_id + i * INPUT1_FEATURE_PITCH));
            half2 eltw_second_input_data1 = as_half2(*(__global uint*)(input2 + _eltw_id + i * INPUT1_FEATURE_PITCH + 32));
#endif
            _data[0][i] += eltw_second_input_data0.s0;
            _data[1][i] += eltw_second_input_data0.s1;
            _data[2][i] += eltw_second_input_data1.s0;
            _data[3][i] += eltw_second_input_data1.s1;

            _data[0][i] = ACTIVATION_ELTW(_data[0][i], NL_M_ELTW, NL_N_ELTW);
            _data[1][i] = ACTIVATION_ELTW(_data[1][i], NL_M_ELTW, NL_N_ELTW);
            _data[2][i] = ACTIVATION_ELTW(_data[2][i], NL_M_ELTW, NL_N_ELTW);
            _data[3][i] = ACTIVATION_ELTW(_data[3][i], NL_M_ELTW, NL_N_ELTW);

            *(__global uint*)(output + _out_id) = as_uint((half2)(_data[0][i], _data[1][i]));
            *(__global uint*)(output + _out_id + 32) = as_uint((half2)(_data[2][i], _data[3][i]));
            _out_id += OUTPUT_FEATURE_PITCH;
        }
    #else
    for(uint s = 0; s < BATCHES_PER_WORK_ITEM / 2; s++)
    {
        uint _out_id = OUTPUT_VIEW_OFFSET + out_id + chunk_size * s * LOCAL_WORK_GROUP_SIZE;

        for(uint i = 0; i < 16; i++)
        {
#if IN_OUT_OPT == 1
            half2 eltw_second_input_data = as_half2(*(__global uint*)(output + _out_id + i * OUTPUT_FEATURE_PITCH));
#else
            uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id + chunk_size * s * LOCAL_WORK_GROUP_SIZE;
            half2 eltw_second_input_data = as_half2(*(__global uint*)(input2 + _eltw_id + i * INPUT1_FEATURE_PITCH));
#endif
            _data[chunk_size * s][i] += eltw_second_input_data.s0;
            _data[chunk_size * s + 1][i] += eltw_second_input_data.s1;
            _data[chunk_size * s][i] = ACTIVATION_ELTW(_data[chunk_size * s][i], NL_M_ELTW, NL_N_ELTW);
            _data[chunk_size * s + 1][i] = ACTIVATION_ELTW(_data[chunk_size * s + 1][i], NL_M_ELTW, NL_N_ELTW);
        }

        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s0, _data[chunk_size * s + 1].s0)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s1, _data[chunk_size * s + 1].s1)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s2, _data[chunk_size * s + 1].s2)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s3, _data[chunk_size * s + 1].s3)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s4, _data[chunk_size * s + 1].s4)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s5, _data[chunk_size * s + 1].s5)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s6, _data[chunk_size * s + 1].s6)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s7, _data[chunk_size * s + 1].s7)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s8, _data[chunk_size * s + 1].s8)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].s9, _data[chunk_size * s + 1].s9)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sa, _data[chunk_size * s + 1].sa)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sb, _data[chunk_size * s + 1].sb)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sc, _data[chunk_size * s + 1].sc)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sd, _data[chunk_size * s + 1].sd)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].se, _data[chunk_size * s + 1].se)); _out_id += OUTPUT_FEATURE_PITCH;
        *(__global uint*)(output + _out_id) = as_uint((half2)(_data[chunk_size * s].sf, _data[chunk_size * s + 1].sf)); _out_id += OUTPUT_FEATURE_PITCH;
    }
    #endif
#else
    for(uint s = 0; s < BATCHES_PER_WORK_ITEM; s++)
    {
        uint _out_id = OUTPUT_VIEW_OFFSET + out_id + s * LOCAL_WORK_GROUP_SIZE;

        for(uint i = 0; i < 16; i++)
        {
#if IN_OUT_OPT == 1
            half eltw_second_input_data = output[_out_id + i * OUTPUT_FEATURE_PITCH];
#else
            uint _eltw_id = INPUT1_VIEW_OFFSET + eltw_id + s * LOCAL_WORK_GROUP_SIZE;
            half eltw_second_input_data = input2[_eltw_id + i * INPUT1_FEATURE_PITCH];
#endif
            _data[s][i] += eltw_second_input_data;
            _data[s][i] = ACTIVATION_ELTW(_data[s][i], NL_M_ELTW, NL_N_ELTW);
        }

        output[_out_id] = _data[s].s0; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s1; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s2; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s3; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s4; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s5; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s6; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s7; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s8; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].s9; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].sa; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].sb; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].sc; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].sd; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].se; _out_id += OUTPUT_FEATURE_PITCH;
        output[_out_id] = _data[s].sf; _out_id += OUTPUT_FEATURE_PITCH;
    }
#endif
}