1 // Copyright (c) 2016-2017 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/common.cl"
17 #include "include/data_types.cl"
18 #include "include/fetch.cl"
19 #include "include/mmad.cl"
// Number of 4-deep input-feature slices (FILTER_IFM_NUM rounded up to a multiple of 4);
// matches the k*4 channel step used when indexing input and weights below.
#define FILTER_IFM_SLICES ((FILTER_IFM_NUM + 3) /4)
// Number of 8-wide X slices of the filter (FILTER_SIZE_X rounded up to a multiple of 8);
// matches the i*8 column step of the sub-group block reads below.
#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8)

// Output rows computed per work-item in one pass (the h loops below).
#define OUT_BLOCK_HEIGHT 4
#define WEIGHTS_PER_WORKITEM 4 // currently needs to be set to 4, check output stage and float4 on quantizations etc.
// QUANTIZATION emits one quantized output byte for weight-slot w, output row h,
// accumulator element i (used inside the w-loop of the output stage).
// NOTE(review): the #elif/#else/#endif lines separating these three variants were
// dropped in this excerpt (the embedded original line numbers 29..42 show the gaps);
// structure restored below — confirm against the upstream file.
#ifdef LIGHTWEIGHT_QUANTIZATION

// Lightweight path: single compile-time SCALE plus per-feature bias, saturating cast.
#define QUANTIZATION \
    out[w] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * SCALE + bias_f[w]);

#elif NO_QUANTIZATION

// No quantization: saturating cast of the raw int accumulator.
#define QUANTIZATION \
    out[w] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT + h][i]);

#else

// Full calibrated path: per-feature quantization factor, input quant factor I_QF,
// bias, per-feature calibration, then activation.
#define QUANTIZATION \
    out[w] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), NL_M, NL_N));

#endif
// Int8 convolution over byx8_f4 input producing fs_bs_yx_bsv4_fsv32 output.
// Each sub-group (size 8) computes an 8-wide x OUT_BLOCK_HEIGHT-tall output tile
// for WEIGHTS_PER_WORKITEM * 8 output features, accumulating with MMAD_8x8.
// NOTE(review): structural lines (braces, #if CALIBRATION_TERM, uint split_idx),
// the uchar4 out / QUANTIZATION statements and the WEIGHTS_PER_WORKITEM #else
// branch) were dropped in this excerpt; restored here guided by the gaps in the
// embedded original line numbers — confirm against the upstream file.
__attribute__((intel_reqd_sub_group_size(8)))
KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
    __global BIAS_TYPE* biases,
    __global float* quantizations,
#if CALIBRATION_TERM
    __global float* calibrations,
#endif
    uint split_idx)
{
    // Output tile origin: 8 columns per group in x, OUT_BLOCK_HEIGHT rows in y.
    const uint x = get_group_id(1) * 8;
    const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT;

    // Group dim 0 enumerates (batch, feature-block) pairs.
    const uint f = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM ) % OUTPUT_FEATURE_NUM;
    const uint b = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM) / OUTPUT_FEATURE_NUM;

    // Accumulators: one int8 vector per (weight slot, output row).
    int8 dotProd[OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };

    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // NOTE(review): these offsets are not used below (indexing goes through the
    // GET_DATA_*/GET_FILTER_* macros); kept for parity with the original.
    const uint filter_offset = f*FILTER_OFM_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET;

    // Loop over 4-channel input slices and the filter window.
    for (uint k = 0; k < FILTER_IFM_SLICES; ++k)
    {
        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;

            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES)))
            for(uint i = 0; i < FILTER_SIZE_X_SLICES; i++)
            {
                int8 act_reg[OUT_BLOCK_HEIGHT]; // activations for MMAD

                // preload spatial data
                __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                {
                    uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8);
                    int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx)));
                    int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8)));

                    // Build the 8 strided activation columns by shuffling across lanes.
                    act_reg[h][0] = _input_data_01[0];
                    act_reg[h][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1);
                    act_reg[h][2] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2);
                    act_reg[h][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3);
                    act_reg[h][4] = _input_data_01[1];
                    act_reg[h][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1);
                    act_reg[h][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2);
                    act_reg[h][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3);
                }

                __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
                for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights
                {
                    uint filter_idx = GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(FILTER, f + w * 8, k * 4, j, i * 8);
                    int8 _w = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx)));

                    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                    for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                    {
                        // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI
                        dotProd[w*OUT_BLOCK_HEIGHT + h] = MMAD_8x8(act_reg[h], _w, dotProd[w*OUT_BLOCK_HEIGHT + h]);
                    }
                }
            }
        }
    }

    // Output stage: per-feature quantization/bias (and calibration) factors.
    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + f) ));
    float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + f) ));
#if CALIBRATION_TERM
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + f) ));
#endif

    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
    for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
    {
        const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f + get_sub_group_local_id(), y + h, x);

        __attribute__((opencl_unroll_hint(8)))
        for(uint i = 0; i < 8; i++)
        {
#if WEIGHTS_PER_WORKITEM == 4
            // Fast path: pack 4 quantized bytes and store with one sub-group block write.
            uchar4 out;
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
                QUANTIZATION;
            }
            intel_sub_group_block_write_uc4((__global uchar*)(output + dst_index + 32 * 4 * i), out);
#else
            // Generic path: quantize and store each weight slot individually.
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
#if CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]);
#else // CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF);
#endif // CALIBRATION_TERM
                output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), NL_M, NL_N);
            }
#endif
        }
    }
}
// Undefine file-local helper macros so they don't leak into other kernels
// compiled into the same OpenCL program.
#undef OUT_BLOCK_HEIGHT
#undef WEIGHTS_PER_WORKITEM

#undef FILTER_SIZE_X_SLICES
#undef FILTER_IFM_SLICES