inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/fetch.cl"
  16 #include "include/mmad.cl"
  17
  18 #define SCALE 0.11f
  19
  20 #ifdef LIGHTWEIGHT_QUANTIZATION
  21
  22 #define QUANTIZATION \
  23     uchar4 out;\
  24     out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\
  25     out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\
  26     out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\
  27     out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s3);
  28
  29 #elif NO_QUANTIZATION
  30
  31 #define QUANTIZATION \
  32     uchar4 out;\
  33     out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\
  34     out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\
  35     out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\
  36     out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]);
  37
  38 #else
  39
  40 #define QUANTIZATION \
  41     char4 out;\
  42     out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]  * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), NL_M, NL_N);\
  43     out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]  * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), NL_M, NL_N);\
  44     out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]  * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), NL_M, NL_N);\
  45     out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]  * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), NL_M, NL_N);
  46
  47 #endif
  48
  49 #define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
  50 #define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
  51 #define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
  52 #define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
  53 // input data is in blocks 4batch x 32 features
  54
  55 #define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1)
  56 #define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1)
  57
  58 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
  59 KERNEL(convolution_mmad_batched_block_1x1)(
  60     __global INPUT0_TYPE* input,
  61     __global OUTPUT_TYPE* output,
  62     __global FILTER_TYPE* weights,
  63     __global BIAS_TYPE* biases,
  64     const __global float* quantizations,
  65 #if CALIBRATION_TERM
  66     const __global float* calibrations,
  67 #endif
  68     uint split_idx)
  69 {
  70     const uint x = get_global_id(0) * OUT_BLOCK_WIDTH;
  71     const uint y = get_global_id(1) * OUT_BLOCK_HEIGHT;
  72
  73 #if WEIGHTS_PER_WORKITEM == 4
  74     const uint f = (get_group_id(2) * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED;
  75 #else
  76     const uint f = ((get_group_id(2) * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED;
  77 #endif
  78     const uint b_block = (get_group_id(2) * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED;
  79
  80     int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };
  81
  82     const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
  83     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  84
  85     const uint filter_offset = ((get_group_id(2) * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
  86     const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block;
  87
  88     uint filter_idx = filter_offset;
  89     for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
  90     {
  91         ////// preloading input data //////
  92         int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y];
  93         for(int h = 0; h < NEEDED_INPUT_Y; h++)
  94         {
  95             for(int p = 0; p < NEEDED_INPUT_X; p++)
  96             {
  97                 const int input_offset_y = input_y + h;
  98                 const int input_offset_x = input_x + p;
  99
 100                 uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH;
 101                 preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
 102             }
 103         }
 104
 105         __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
 106         for (uint j = 0; j < FILTER_SIZE_Y; ++j)
 107         {
 108             __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
 109             for (uint i = 0; i < FILTER_SIZE_X; ++i)
 110             {
 111                 ////// preloading weights data //////
 112                 int8 preloaded_weights[WEIGHTS_PER_WORKITEM];
 113                 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
 114                 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
 115                 {
 116                     preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) ));
 117                 }
 118
 119                 ////// computing //////
 120                 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
 121                 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
 122                 {
 123                     __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
 124                     for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++)
 125                     {
 126                         __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
 127                         for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++)
 128                         {
 129                             const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT);
 130                             const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j);
 131                             dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]);
 132                         }
 133                     }
 134                 }
 135                 filter_idx += FILTER_X_PITCH;
 136             }
 137         }
 138     }
 139
 140
 141 #if WEIGHTS_PER_WORKITEM == 4
 142
 143 float4 quant_f = vload4(0, quantizations + f);
 144 float4 bias_f = vload4(0, biases + f);
 145 float4 calib_f = vload4(0, calibrations + f);
 146 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
 147 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
 148 {
 149     __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
 150     for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
 151     {
 152         const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
 153
 154         uint4 to_output;
 155         __attribute__((opencl_unroll_hint(4)))
 156         for(uint b = 0; b < 4; b++)
 157         {
 158             const uint out_idx = o + OUT_BLOCK_WIDTH * h;
 159
 160             QUANTIZATION;
 161             to_output[b] = as_uint(out);
 162         }
 163         intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output);
 164     }
 165 }
 166
 167 #else // WEIGHTS_PER_WORKITEM ==4
 168
 169 ////// QUANTIZE & OUTPUT //////
 170 __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
 171 for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
 172 {
 173     float quant_f = quantizations[f + w * 8];
 174     float bias_f = biases[f + w * 8];
 175 #if CALIBRATION_TERM
 176     float calib_f = calibrations[f + w * 8];
 177 #endif
 178     __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
 179     for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
 180     {
 181         __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
 182         for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
 183         {
 184             const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
 185             for(uint b = 0; b < 4; b++)
 186             {
 187             #if CALIBRATION_TERM
 188                 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f);
 189             #else  // CALIBRATION_TERM
 190                 dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF);
 191             #endif // CALIBRATION_TERM
 192             }
 193         }
 194     }
 195 }
 196
 197 ////// OUTPUT STAGE //////
 198 __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
 199 for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
 200 {
 201     __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
 202     for(uint o = 0; o < OUT_BLOCK_WIDTH; o++)
 203     {
 204         const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o);
 205
 206         __attribute__((opencl_unroll_hint(4)))
 207         for(uint b = 0; b < 4; b++)
 208         {
 209             #if WEIGHTS_PER_WORKITEM == 2
 210                 char2 out;
 211                 const uint out_idx = o + OUT_BLOCK_WIDTH * h;
 212                 out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
 213                 out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), NL_M, NL_N);
 214
 215                 intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out));
 216             #else
 217             __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
 218             for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
 219             {
 220                 const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT);
 221                 const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o);
 222                 char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), NL_M, NL_N);
 223                 output[dst_index + b * 32] = char_val;
 224             }
 225             #endif
 226         }
 227     }
 228 }
 229
 230 #endif // WEIGHTS_PER_WORKITEM ==4
 231
 232 }
 233
 234 #undef FILTER_IFM_MMAD_NUM
 235 #undef FILTER_OFM_MMAD_NUM
 236 #undef FILTER_IFM_ALIGNED
 237 #undef FILTER_OFM_ALIGNED
 238
 239
 240 #undef SCALE
 241 #undef QUANTIZATION