inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_1x1_opt_fp32.cl

   1 // Copyright (c) 2016-2017 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/include_all.cl"
  16
  17 #define SIMD_SIZE 8
     // Fused convolution + element-wise add kernel (fp32 variant).
     // Exactly one weight is read per (input feature, output feature) pair;
     // there is no loop over filter x/y positions in this kernel.
     //
     // Work split visible in this kernel:
     //   - get_group_id(0) / get_group_id(1) select an
     //     OUT_BLOCK_WIDTH x OUT_BLOCK_HEIGHT tile of output x/y positions;
     //   - get_group_id(2) selects the batch and a run of
     //     SIMD_SIZE * OUT_BLOCK_DEPTH consecutive output features;
     //   - get_sub_group_id() (ifm_part) selects which half of the FILTER_IFM_NUM
     //     input features the sub-group accumulates. Only ifm_part values 0 and 1
     //     are handled below; presumably the host launches exactly two sub-groups
     //     per work-group - TODO confirm against the dispatch code.
     // Both sub-groups accumulate partial sums for all OUT_BLOCK_DEPTH feature
     // slices, exchange half of them through local memory, and then each one
     // finalizes (bias, activation, eltwise, store) its own half of the slices.
     //
     // Arguments:
     //   input     - convolution input.
     //   output    - result buffer. When IN_OUT_OPT == 1 its current contents are
     //               also read and used as the eltwise operand.
     //   weights   - filter values, read with sub-group block reads.
     //   biases    - per-output-feature bias (only present when BIAS_TERM is set).
     //   split_idx - not referenced anywhere in this kernel body.
     //   src3      - eltwise operand, read only when IN_OUT_OPT != 1.
     //
     // NOTE(review): KERNEL, UNIT_TYPE, GET_DATA_INDEX, ACTIVATION*, the *_PITCH
     // constants etc. are not defined in this file; presumably they come from
     // include_all.cl and build-time definitions - confirm there.
  18 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
  19 KERNEL(fused_conv_eltwise_gpu_bfyx_1x1_opt)(
  20     __global INPUT0_TYPE* input,
  21     __global OUTPUT_TYPE* output,
  22     __global FILTER_TYPE* weights,
  23 #if BIAS_TERM
  24     __global BIAS_TYPE* biases,
  25 #endif
  26     uint split_idx,
  27     const __global float* src3)
  28 {
         // Top-left x/y of the output tile handled by this work-group.
  29    const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
  30     const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
         // First output feature (f) and batch index (b) of this work-group, both
         // decoded from the third group id.
  31     const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
  32     const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;;
  33
         // ifm_part: which half of the input features this sub-group reduces over.
         // ifm_offset: index of the first feature slice (each slice is SIMD_SIZE
         // output features) that this sub-group finalizes and writes out.
  34     const uint ifm_part = get_sub_group_id();
  35     uint ifm_offset = ifm_part* OUT_BLOCK_DEPTH/2;
  36
         // in: one input value per tile row, held per lane.
         // dotProd0 / dotProd1: partial sums for feature slices [0, DEPTH/2) and
         // [DEPTH/2, DEPTH) respectively, indexed as bc + WIDTH * (br + HEIGHT * bd).
  37     UNIT_TYPE in[OUT_BLOCK_HEIGHT];
  38     UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
  39     UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
  40
  41     for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
  42     {
  43         dotProd0[i] = 0;
  44         dotProd1[i] = 0;
  45     }
  46
         // Start of this sub-group's weights. For block depths 8/4/2 the constants
         // 64/32/16 equal SIMD_SIZE * OUT_BLOCK_DEPTH, i.e. the number of weights
         // consumed per input feature by the block reads below; the second sub-group
         // skips FILTER_IFM_NUM/2 such groups.
         // NOTE(review): this implies a reordered weights layout for these depths -
         // confirm against the host-side weights reorder.
  47 #if OUT_BLOCK_DEPTH == 8
  48     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
  49 #elif OUT_BLOCK_DEPTH == 4
  50     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
  51 #elif OUT_BLOCK_DEPTH == 2
  52     const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
  53 #else
  54     const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
  55 #endif
         // Start of this sub-group's input: tile origin, shifted by FILTER_IFM_NUM/2
         // features for the second sub-group.
  56     const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;
  57
  58     //--------------------------------------------------------------------
  59     // main computation phase
  60     //--------------------------------------------------------------------
  61
         // Reduce over this sub-group's half of the input features.
  62     for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
  63     {
             // Each lane loads one input element per tile row. The lane id is added
             // to the offset without scaling by INPUT0_X_PITCH, so lane l holds
             // x = group_x + l only if the x pitch is 1 - TODO confirm.
  64         for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
  65         {
  66             const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
  67             in[i] = input[in_offset];
  68         }
  69
             // Load OUT_BLOCK_DEPTH weights per lane for input feature k with one
             // sub-group block read; the raw uints are reinterpreted as floats.
             // NOTE(review): no 'w' is declared when OUT_BLOCK_DEPTH is not 8, 4
             // or 2, although 'w' is used in the loop below.
  70 #if OUT_BLOCK_DEPTH == 8
  71         float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
  72 #elif OUT_BLOCK_DEPTH == 4
  73         float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
  74 #elif OUT_BLOCK_DEPTH == 2
  75         float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
  76 #endif
  77
  78         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
  79         {
  80             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
  81             {
                     // Fetch the input value held by lane bc (tile column bc) so that
                     // every lane multiplies it with its own weights.
  82                 float _in = intel_sub_group_shuffle(in[br], bc);
  83                 for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
  84                 {
  85                     dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
  86                     dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
  87                 }
  88             }
  89         }
  90     }
  91
         // Local-memory exchange buffer: one float per (tile position, feature
         // slice, lane). Slots [0, DEPTH/2) are written by sub-group 1 and slots
         // [DEPTH/2, DEPTH) by sub-group 0.
  92     __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
  93
  94     //--------------------------------------------------------------------
  95     // second sub_group in workgroup task
  96     //--------------------------------------------------------------------
  97
         // Sub-group 1 publishes its partial sums for the first half of the feature
         // slices (dotProd0) and keeps its second-half sums by moving dotProd1 into
         // dotProd0, so the later phases can always operate on dotProd0.
  98     if(ifm_part == 1)
  99     {
 100         for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 101         {
 102             for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 103             {
 104                 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
 105                 {
 106                     slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)) + get_sub_group_local_id()] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
 107                     dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
 108                 }
 109             }
 110         }
 111
 112     }
 113
 114     //--------------------------------------------------------------------
 115     // first sub_group in workgroup task
 116     //--------------------------------------------------------------------
 117
         // Sub-group 0 publishes its partial sums for the second half of the feature
         // slices (dotProd1) and keeps dotProd0 as is.
 118     if(ifm_part == 0)
 119     {
 120         for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 121         {
 122             for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 123             {
 124                 for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
 125                 {
 126                     slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) )) + get_sub_group_local_id()] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
 127                 }
 128             }
 129         }
 130
 131     }
 132
 133     //--------------------------------------------------------------------
 134     // add bias phase
 135     //--------------------------------------------------------------------
 136
         // Each lane adds the bias of the output feature it owns in slice
         // (bd + ifm_offset) to every tile position of that slice.
 137     #if BIAS_TERM
 138     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 139     {
 140         float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
 141         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 142         {
 143             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
 144             {
 145                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
 146             }
 147         }
 148     }
 149     #endif
 150
 151     barrier(CLK_LOCAL_MEM_FENCE); // The barrier is placed after the bias addition so that the latency of the long SLM write phase is hidden behind that work.
 152
 153     //--------------------------------------------------------------------
 154     // sum sub-group results + activation phase
 155     //--------------------------------------------------------------------
 156
         // Add the other sub-group's partial sums (slots bd + ifm_offset were written
         // by the opposite sub-group above), then apply the convolution activation.
 157     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 158     {
 159         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 160         {
 161             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
 162             {
 163                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[SIMD_SIZE * (bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) )) + get_sub_group_local_id()];
 164                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);;
 165             }
 166         }
 167     }
 168
 169     //--------------------------------------------------------------------
 170     // eltwise with eltwise activation phase
 171     //--------------------------------------------------------------------
         // When the eltwise operand is a separate buffer (IN_OUT_OPT != 1), add the
         // matching src3 element (its x/y scaled by ELTW_STRIDE_X / ELTW_STRIDE_Y)
         // and apply the eltwise activation. With IN_OUT_OPT == 1 the equivalent
         // work is done in the output phase using the current contents of 'output'.
 172     #if IN_OUT_OPT != 1
 173     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 174     {
 175         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 176         {
 177             for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
 178             {
 179                 uint src3_offset = GET_DATA_INDEX(INPUT1, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), (group_y + br) * ELTW_STRIDE_Y, (group_x + bc) * ELTW_STRIDE_X);
 180                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += src3[src3_offset];
 181                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
 182             }
 183         }
 184     }
 185     #endif
 186
 187     //--------------------------------------------------------------------
 188     // output phase
 189     //--------------------------------------------------------------------
 190
         // Each lane writes one tile row per (slice, row) for the output feature it
         // owns. The row is emitted as at most one 8-wide, one 4-wide and one 2-wide
         // vector store (selected at compile time from OUT_BLOCK_WIDTH), and the
         // remaining columns are written one by one in the trailing loop.
         // NOTE(review): vloadN/vstoreN touch consecutive elements, while the start
         // address is scaled by OUTPUT_X_PITCH; this looks like it expects an output
         // x pitch of 1 - confirm.
 191     for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
 192     {
 193         for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
 194         {
 195             uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
                 // Number of row elements already written by the vector stores.
 196             uint out_vstore_offset = 0;
 197             #if (OUT_BLOCK_WIDTH >= 8)
 198             {
 199                 float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 200                                       dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 201                                       dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 202                                       dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 203                                       dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 204                                       dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 205                                       dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 206                                       dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
                     // In-place eltwise: add the values currently stored in 'output'
                     // and apply the eltwise activation before overwriting them.
 207 #if IN_OUT_OPT == 1
 208                 float8 tmp2 = vload8(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 209                 tmp += tmp2;
 210                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
 211 #endif
 212                 vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 213                 out_vstore_offset += 8;
 214             }
 215             #endif
 216             #if (OUT_BLOCK_WIDTH % 8) > 3
 217             {
 218                 float4 tmp = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 219                                       dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 220                                       dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 221                                       dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
 222 #if IN_OUT_OPT == 1
 223                 float4 tmp2 = vload4(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 224                 tmp += tmp2;
 225                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
 226 #endif
 227                 vstore4(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 228                 out_vstore_offset += 4;
 229             }
 230             #endif
 231             #if (OUT_BLOCK_WIDTH % 4) > 1
 232             {
 233                 float2 tmp = (float2)(dotProd0[out_vstore_offset + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
 234                                        dotProd0[out_vstore_offset+1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
 235 #if IN_OUT_OPT == 1
 236                 float2 tmp2 = vload2(0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 237                 tmp += tmp2;
 238                 tmp = ACTIVATION_ELTW(tmp, NL_M_ELTW, NL_N_ELTW);
 239 #endif
 240                 vstore2(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
 241                 out_vstore_offset += 2;
 242             }
 243             #endif
                 // Scalar tail: columns not covered by the vector stores above.
 244             for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
 245             {
 246 #if IN_OUT_OPT == 1
 247                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += output[dst_index + bc * OUTPUT_X_PITCH];
 248                 dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION_ELTW(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M_ELTW, NL_N_ELTW);
 249 #endif
 250                 output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
 251             }
 252         }
 253     }
 254 }