inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_imad.cl

   1 // Copyright (c) 2018-2019 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15
  16 #include "include/common.cl"
  17 #include "include/fetch.cl"
  18 #include "include/data_types.cl"
  19 #include "include/imad.cl"
  20
  21 #ifndef NON_BLOCK_LOAD
  22 // block loads for inputs and weights should be fastest, but compiler seems
  23 // to do better with a mix, regular loads for inputs and block loads for weights.
  24 #define BLOCK_LOAD_WEIGHTS
  25 #endif
  26 // Input reading operation is always blocked.
  27 #define BLOCK_LOAD_INPUTS
  28
  29 // for now kernel stride is square
  30 #define K_WSTRIDE K_STRIDE
  31 #define K_HSTRIDE K_STRIDE
  32
  33 // need KERNEL width for first output + STRIDE more for each additional.
  34 #define IN_BLOCK_WIDTH  (K_WIDTH  + K_WSTRIDE * (OUT_BLOCK_WIDTH  - 1))
  35 #define IN_BLOCK_HEIGHT (K_HEIGHT + K_HSTRIDE * (OUT_BLOCK_HEIGHT - 1))
  36
  37 // for imad we are packing 4 8bit activations per 32 bit SIMD lane
  38 // if we later add 4bit, then PACK would be 8.
  39 #define PACK 4
  40
  41 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
  42 KERNEL (convolution_gpu_imad)(
  43     __global uint        *inputs,
  44     __global OUTPUT_TYPE *outputs,
  45     __global int         *weights
  46 #if BIAS_TERM
  47     ,__global BIAS_TYPE  *biases
  48 #endif
  49 #if QUANTIZATION_TERM
  50     ,__global float      *quantizations
  51 #endif
  52 #if CALIBRATION_TERM
  53     ,__global float      *calibrations
  54 #endif
  55 )
  56 {
  57     const uint oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  58     const uint or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  59     const uint fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
  60     const uint fmg = get_group_id(2);
  61     const uint lid = get_local_id(2);
  62     const uint batch = fm / _OD;
  63
  64     uint in[IN_BLOCK_HEIGHT];
  65     int  out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 };  // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.
  66
  67     #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
  68     int w[NUM_FILTERS];
  69
  70     int in_addr;
  71
  72 #ifdef BLOCK_LOAD_WEIGHTS
  73     int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK);
  74 #else
  75     int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK) + lid;
  76 #endif
  77
  78     uint input_size = (_ID * (_IH + IHPAD) * (_IW + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities.
  79
  80     __attribute__((opencl_unroll_hint(1)))
  81     for(int kd = 0; kd < (_ID / PACK); kd++) // For imad we do 4X less input feature map iterations since we are packing 4 of them in each uchar4.  For now assume _ID is multiple of packing factor.
  82     {
  83
  84 #ifdef BLOCK_LOAD_INPUTS
  85         in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE);
  86 #else
  87         in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE) + lid;
  88 #endif
  89         in_addr += batch * input_size;  // adjust for batching
  90
  91         for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
  92 #ifdef BLOCK_LOAD_INPUTS
  93             in[reg] = intel_sub_group_block_read((const __global uint*) &inputs[in_addr]);
  94 #else
  95             in[reg] = inputs[in_addr];// read SIMD_SIZE elements wide
  96 #endif
  97             in_addr += (_IW + IWPAD);  // move to next row down
  98         }
  99
 100 #ifdef BLOCK_LOAD_WEIGHTS
 101         *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
 102         w[8]= as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
 103         weight_addr += SIMD_SIZE*NUM_FILTERS;
 104 #else
 105         for(int pf=0; pf < NUM_FILTERS; pf++) {
 106             w[pf] = weights[weight_addr];
 107             weight_addr += SIMD_SIZE;
 108         }
 109 #endif
 110
 111         int wi = 0;
 112         int kr = 0; // kr = Kernel Row
 113         LOOP(K_HEIGHT, kr,
 114         {
 115             int kc = 0; // kc = Kernel Column
 116             LOOP(K_WIDTH, kc,
 117             {
 118                 for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
 119                     for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
 120                         uint input = sub_group_broadcast(in[br * K_HSTRIDE + kr], bc * K_WSTRIDE + kc);
 121
 122                         out[br * OUT_BLOCK_WIDTH + bc] =
 123 #ifdef CONVO_UNSIGNED
 124                             IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_uchar4(input), as_char4(w[wi]));
 125 #else
 126                             IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_char4(input), as_char4(w[wi]));
 127 #endif
 128                     }
 129                 }
 130                 wi++;
 131             });
 132         });
 133     } //for kd
 134
 135     // Feature maps are an array of slices, each H,W position within the slice contains
 136     // four 8bit feature maps, packed like RGBA components into a 32 bit pixel.
 137     int row_size_bytes = (_OW + OWPAD) * PACK;
 138
 139     // Slice_pack is a pack of 4 feature map tiles that are [OH][OW][4]
 140     // that are stored within the full [N][C/4][H][W][4] output.
 141     int slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
 142
 143     // Dividing the feature map index by 4 gives us the slice_pack_index in each lane
 144     // (each lane within block of 4 will have same index).
 145     int slice_pack_index = fm / PACK;
 146
 147     // Each group of 4 simd lanes points to start of it's slice pack.
 148     int slice_pack_start_addr_bytes = slice_pack_index * slice_pack_size_bytes;
 149
 150     // Make each lane within the group of 4(PACK) simd lanes point to an individual byte
 151     // witihn the uchar4 at start of slice pack.
 152     int slice_pack_addr_bytes = slice_pack_start_addr_bytes + (lid % PACK);
 153
 154     // Adjust to particular tile that we are working on
 155     slice_pack_addr_bytes += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * row_size_bytes
 156                              + (oc + OUTPUT_PAD_BEFORE_SIZE_X) * PACK;
 157
 158     for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
 159         for (int c = 0; c < OUT_BLOCK_WIDTH; c++) {
 160             uint out_idx = slice_pack_addr_bytes + r * row_size_bytes + (c*PACK);
 161 #if QUANTIZATION_TERM
 162             int dotProd       = out[r * OUT_BLOCK_WIDTH + c];
 163 #else
 164             UNIT_TYPE dotProd = out[r * OUT_BLOCK_WIDTH + c];
 165 #endif
 166
 167 #if BIAS_TERM
 168             const uint f = fm % _OD;
 169     #if   BIAS_PER_OUTPUT
 170             #error convolution_gpu_imad.cl: BIAS_PER_OUTPUT - not supported
 171     #elif BIAS_PER_OFM
 172             const uint bias_index = f;
 173     #endif
 174
 175     #if QUANTIZATION_TERM
 176         #if CALIBRATION_TERM
 177
 178             dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index])
 179                                         * calibrations[f] );
 180         #else
 181             dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index])
 182                                         * O_QF );
 183         #endif // CALIBRATION_TERM
 184     #else
 185             dotProd += (UNIT_TYPE)biases[bias_index];
 186     #endif // QUANTIZATION_TERM
 187 #endif // BIAS_TERM
 188
 189 #if QUANTIZATION_TERM
 190             UNIT_TYPE dotProd_A = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
 191 #else
 192             UNIT_TYPE dotProd_A = ACTIVATION(dotProd, NL_M, NL_N);
 193 #endif
 194
 195 #ifdef CONVO_UNSIGNED
 196             outputs[out_idx] = (uchar)( max((int)dotProd_A , 0) & 0xFF );
 197 #else
 198             outputs[out_idx] = (uchar)dotProd_A & 0xFF;
 199 #endif
 200         } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
 201     } // for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
 202 }