inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/common.cl"
  16 #include "include/data_types.cl"
  17
  18
  19 // ---------------------------------------------------------------------------------------------------------------------
  20 // Just-in-time macro definitions:
  21 // ---------------------------------------------------------------------------------------------------------------------
  22
  23 // Required JIT constants:
  24 //  - INPUT                - [tensor] Input dimensions (batch, spatial and feature).
  25 //  - OUTPUT               - [tensor] Output dimensions (batch, spatial and feature).
  26 //  - STRIDE               - [tensor] Stride (only spatial). Factors that describe step size in X or Y dimension of
  27 //                           input position of application of convolution filter when next ouput value
  28 //                           (step 1 in in X or Y dimension of output) is computed.
  29 //  - INPUT0_OFFSET        - [tensor] Offset for the first element
  30 //                           initial offset input position of application of convolution filter and output position.
  31 //  - FP16_SUPPORTED       - [0/1] Value indicating whether device supports FP16 OpenCL extension (cl_khr_fp16).
  32 //  - FP16_UNIT_USED       - [0/1] Value indicating that current kernel should use FP16.
  33 //  - UNIT_TYPE            - Type of unit of input/output/weight/bias.
  34 //  - UNIT_VAL_ZERO        - Literal of current UNIT_TYPE that represents 0.
  35 //  - RELU                 - [0/1] Indicates that ReLU activation function should be used on output.
  36 //  - NEGATIVE_SLOPE       - [float] Factor for negative output values (required when ReLU is specified).
  37 //
  38 //  - SUB_GROUP_SIZE       - [int] Size of used subgroup (SIMD).
  39 //  - LEFTOVERS            - [int] Optional parameter, required only when number of ofm is not dividable by SUB_GROUP_SIZE
  40 //                           see comment for FEATURES_THREADS_PER_BATCH for more informations
  41
  42 /*
  43 gpu::make_jit_constant("OUTPUT_LIMIT",              output_size),
  44 gpu::make_jit_constant("FILTER",                    filter_mem.argument().size),
  45 gpu::make_jit_constant("FILTER_ARRAY_NUM",          split),
  46 gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH",        _kernel_data.block_width));
  47 gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT",       _kernel_data.block_height));
  48 gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE",       _kernel_data.input_block_array_size));
  49 gpu::make_jit_constant("IN_BLOCK_WIDTH",            _kernel_data.input_block_width));
  50 gpu::make_jit_constant("PREFETCH",                  _kernel_data.prefetch));
  51 if (_kernel_data.leftovers)
  52     gpu::make_jit_constant("LEFTOVERS",             _kernel_data.leftovers));
  53 */
  54
  55 // FEATURES_THREADS_PER_BATCH defines how many threads in z-dimension are processing single batch.
  56 // ideally, z-dimension of value n should indicate processing of n-th output feature. however, since
  57 // threads are stack in groups of SUB_GROUP_SIZE, when number of ofm is not dividable by SUB_GROUP_SIZE
  58 // there are dummy threads added in z-dimension in count of LEFTOVERS. We need to take them into consideration
  59 // while calculating batch's id (see lines 86-87). Values calculated by dummy threads are discarded at line 210.
  60 #ifdef LEFTOVERS
  61 #define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS)
  62 #else
  63 #define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
  64 #endif
  65
  66 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  67 __attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
  68 KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
  69     const __global UNIT_TYPE* input,
  70     __global UNIT_TYPE* output,
  71     const __global UNIT_TYPE* weights,
  72 #if BIAS_TERM
  73     const __global UNIT_TYPE* bias,
  74 #endif
  75     uint split_idx) // TODO: removing this parameter cause a performance degradation... :)
  76 {
  77     const uint oc  = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;  // oc = Output Column
  78     const uint or  = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row
  79     const uint fm  = get_global_id(2);                    // fm = Feature Map = od = Output Depth
  80     const uint lid = get_sub_group_local_id();
  81
  82     uint batch_idx = fm / FEATURES_THREADS_PER_BATCH;
  83     uint feature_idx = fm % FEATURES_THREADS_PER_BATCH;
  84     uint fmg = feature_idx / SUB_GROUP_SIZE;
  85
  86     UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
  87     UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
  88     UNIT_TYPE w[PREFETCH];
  89     uint in_addr;
  90     uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
  91
  92     for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) {
  93         out[i] = UNIT_VAL_ZERO;
  94     }
  95
  96     uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
  97     in_addr = batch_idx * INPUT0_BATCH_PITCH;
  98     in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + (or * STRIDE_SIZE_Y * INPUT0_Y_PITCH) + (oc * STRIDE_SIZE_X + lid) * INPUT0_X_PITCH;
  99
 100     for(int kd = 0; kd < FILTER_IFM_NUM; kd++)  // _ID = 3, RGB
 101     {
 102         uint tmp_in_addr = in_addr;
 103
 104 #if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0
 105         __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
 106         for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
 107             // Horizontal position in input block after read.
 108             const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;
 109
 110             in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];
 111
 112             // If we have row break, move to the next row.
 113             if (in_block_next_x_pos == IN_BLOCK_WIDTH)
 114                 tmp_in_addr += INPUT0_Y_PITCH;
 115         }
 116 #elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0
 117         __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
 118         for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
 119             // Horizontal position in input block after read.
 120             const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;
 121
 122             if (in_block_next_x_pos <= IN_BLOCK_WIDTH) { //
 123                 in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];
 124
 125                 // If we have row break, move to the next row.
 126                 if (in_block_next_x_pos == IN_BLOCK_WIDTH)
 127                     tmp_in_addr += INPUT0_Y_PITCH;
 128             }
 129             else {
 130                 // TODO: Generalize this step to relax IN_BLOCK_WIDTH restrictions.
 131                 // Position in sub-group on which new row need to be read.
 132                 const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH;
 133
 134                 if (lid < sg_br_pos)
 135                     in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + (in_block_pos % IN_BLOCK_WIDTH) * INPUT0_X_PITCH];
 136                 // We have row break inside sub-group. Need to move to next line.
 137                 tmp_in_addr += INPUT0_Y_PITCH;
 138                 if (lid >= sg_br_pos)
 139                     in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - (sg_br_pos * INPUT0_X_PITCH)];
 140
 141                 // If we have another row break, move to the next row.
 142                 if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH)
 143                     tmp_in_addr += INPUT0_Y_PITCH;
 144             }
 145         }
 146 #else
 147     #error IN_BLOCK_WIDTH must be multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented.
 148 #endif
 149
 150         //move to next filter
 151         in_addr += INPUT0_FEATURE_PITCH;
 152
 153         for(int pf=0; pf<PREFETCH; pf++) {
 154             w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
 155         }
 156
 157         uint wi = 0;
 158         uint kr = 0; // kr = Kernel Row
 159         LOOP(FILTER_SIZE_Y, kr,  // LOOP is a macro that unrolls the loop.
 160         {
 161             uint kc = 0; // kc = Kernel Column
 162             LOOP(FILTER_SIZE_X, kc,
 163             {
 164                 //w = weights[weight_addr];
 165                 for(uint br=0; br<OUTPUT_BLOCK_HEIGHT; br++) {
 166                     for(uint bc=0; bc<OUTPUT_BLOCK_WIDTH; bc++) {
 167
 168 #if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
 169                         //if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
 170                         UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
 171                                                                     (((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
 172 #else
 173                         UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
 174 #endif
 175
 176                         out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);
 177                     }
 178                 }
 179                 w[wi % PREFETCH] = weights[weight_addr];
 180                 weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
 181                 wi++;
 182             });
 183         });
 184         // addr went beyond due to prefetch so move it back to correct location.
 185         weight_addr -= PREFETCH * SUB_GROUP_SIZE;
 186     }
 187
 188     uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
 189     uint out_addr = OUTPUT_OFFSET;
 190     out_addr += batch_idx * OUTPUT_BATCH_PITCH;
 191     out_addr += out_split_offset + feature_idx * OUTPUT_FEATURE_PITCH; // out_addr indices into start of 16 feature maps.
 192     out_addr += or * OUTPUT_Y_PITCH + oc;  // offset for the 4x3 block that this workitem is working on;
 193
 194 #if BIAS_TERM
 195     for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
 196         for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
 197 #if BIAS_PER_OUTPUT
 198             const unsigned bias_index = feature_idx*OUTPUT_SIZE_X*OUTPUT_SIZE_Y + or*OUTPUT_SIZE_X + oc;
 199 #else
 200             const unsigned bias_index = feature_idx;
 201 #endif
 202             out[r * OUTPUT_BLOCK_WIDTH + c] += bias[bias_index];
 203         }
 204     }
 205 #endif
 206
 207
 208     for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
 209         for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
 210             out[r * OUTPUT_BLOCK_WIDTH + c] = ACTIVATION(out[r * OUTPUT_BLOCK_WIDTH + c], NL_M, NL_N);
 211         }
 212     }
 213
 214
 215 //--------------------------------------------------------------------
 216 // output phase
 217 //--------------------------------------------------------------------
 218
 219 #ifdef LEFTOVERS
 220     if (feature_idx < OUTPUT_FEATURE_NUM)
 221 #endif
 222     for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
 223         if(!(or + r >= OUTPUT_SIZE_Y))
 224         {
 225 #if (OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH) == 0 // in this case we don't need to check if we're outside of X boundaries
 226             uint out_vstore_offset = 0;
 227             #if (OUT_BLOCK_WIDTH % 8) > 3
 228             MAKE_VECTOR_TYPE(UNIT_TYPE, 4) tmp = MAKE_VECTOR_TYPE(UNIT_TYPE, 4)(
 229                 out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH],
 230                 out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH],
 231                 out[out_vstore_offset + 2 + r * OUTPUT_BLOCK_WIDTH],
 232                 out[out_vstore_offset + 3 + r * OUTPUT_BLOCK_WIDTH]
 233             );
 234
 235             vstore4(tmp, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH);
 236             out_vstore_offset += 4;
 237             #endif
 238
 239             #if (OUT_BLOCK_WIDTH % 4) > 1
 240             MAKE_VECTOR_TYPE(UNIT_TYPE, 2) tmp2 = MAKE_VECTOR_TYPE(UNIT_TYPE, 2)(
 241                 out[out_vstore_offset + 0 + r * OUTPUT_BLOCK_WIDTH],
 242                 out[out_vstore_offset + 1 + r * OUTPUT_BLOCK_WIDTH]
 243             );
 244
 245             vstore2(tmp2, 0, output + out_addr + r * OUTPUT_Y_PITCH + out_vstore_offset * OUTPUT_X_PITCH);
 246             out_vstore_offset += 2;
 247             #endif
 248             for(uint c = out_vstore_offset; c < OUTPUT_BLOCK_WIDTH; c++) {
 249                 // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
 250                 output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
 251             }
 252 #else
 253             for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
 254                 // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
 255                 if(!(oc + c >= OUTPUT_SIZE_X))
 256                     output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
 257             }
 258 #endif
 259         }
 260     }
 261 }
 262
 263 #undef FEATURES_THREADS_PER_BATCH