inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_1x1_opt.cl
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"

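// convolution_gpu_bfyx_1x1_opt: 1x1 convolution over bfyx-layout tensors.
// Each work-group computes an OUT_BLOCK_WIDTH x OUT_BLOCK_HEIGHT spatial tile for
// SIMD_SIZE * OUT_BLOCK_DEPTH output features. The work-group is assumed to be
// launched with two sub-groups; each sub-group accumulates over half of the input
// feature maps, and the two partial results are combined through local memory (SLM)
// before activation and the final store.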
#define SIMD_SIZE 8
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
KERNEL(convolution)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
    uint split_idx)
{
    const uint group_x = get_group_id(0) * OUT_BLOCK_WIDTH;
    const uint group_y = get_group_id(1) * OUT_BLOCK_HEIGHT;
    const uint f = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) % OUTPUT_FEATURE_NUM;
    const uint b = (get_group_id(2) * SIMD_SIZE * OUT_BLOCK_DEPTH) / OUTPUT_FEATURE_NUM;

    const uint ifm_part = get_sub_group_id();
    uint ifm_offset = ifm_part * OUT_BLOCK_DEPTH/2;

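    // Each sub-group accumulates over its own half of the input feature maps.
    // dotProd0 holds the partial sums for the first OUT_BLOCK_DEPTH/2 output-feature
    // sub-blocks, dotProd1 for the second half; each entry covers one (x, y, depth)
    // position of the output tile.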
    UNIT_TYPE in[OUT_BLOCK_HEIGHT];
    UNIT_TYPE dotProd0[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];
    UNIT_TYPE dotProd1[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2];

    for(uint i = 0; i < OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH/2; i++)
    {
        dotProd0[i] = 0;
        dotProd1[i] = 0;
    }

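    // For OUT_BLOCK_DEPTH of 8/4/2 the specialised offsets assume the host has
    // reordered the weights so that each input channel stores
    // SIMD_SIZE * OUT_BLOCK_DEPTH consecutive output-feature weights, which lets
    // the main loop fetch them with a single sub-group block read.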
#if OUT_BLOCK_DEPTH == 8
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(64 * FILTER_IFM_NUM/2);
#elif OUT_BLOCK_DEPTH == 4
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(32 * FILTER_IFM_NUM/2);
#elif OUT_BLOCK_DEPTH == 2
    const uint filter_offset = f * FILTER_IFM_NUM + ifm_part*(16 * FILTER_IFM_NUM/2);
#else
    const uint filter_offset = f*FILTER_OFM_PITCH + ifm_part*(FILTER_IFM_NUM/2) * FILTER_IFM_PITCH;
#endif
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + group_x * INPUT0_X_PITCH + group_y * INPUT0_Y_PITCH + ifm_part*(FILTER_IFM_NUM/2) * INPUT0_FEATURE_PITCH;

    //--------------------------------------------------------------------
    // main computation phase
    //--------------------------------------------------------------------

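    // For every input channel in this sub-group's half: each work-item loads one
    // input value per output row (work-items cover consecutive x positions), the
    // per-channel weights are fetched with a sub-group block read, and
    // intel_sub_group_shuffle broadcasts each input column so that every work-item
    // accumulates all OUT_BLOCK_WIDTH columns for its own output feature.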
    for (uint k = 0; k < FILTER_IFM_NUM/2; ++k)
    {
        for(uint i = 0; i < OUT_BLOCK_HEIGHT; i++)
        {
            const uint in_offset = input_offset + get_sub_group_local_id() + i * INPUT0_Y_PITCH + k * INPUT0_FEATURE_PITCH;
            in[i] = input[in_offset];
        }

#if OUT_BLOCK_DEPTH == 8
        float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
#elif OUT_BLOCK_DEPTH == 4
        float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
#elif OUT_BLOCK_DEPTH == 2
        float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
#endif

        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
            {
                float _in = intel_sub_group_shuffle(in[br], bc);
                for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
                {
                    dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
                    dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd + OUT_BLOCK_DEPTH/2];
                }
            }
        }
    }

    __local float slm_vals[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD_SIZE];
    __local float* slm_p = &slm_vals[0];
    //--------------------------------------------------------------------
    // task of the second sub-group in the work-group
    //--------------------------------------------------------------------

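    // The second sub-group publishes its partial sums for the first half of the
    // output-depth block to SLM and moves its second-half partials into dotProd0,
    // so after the barrier each sub-group finalises only the half selected by
    // ifm_offset.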
    if(ifm_part == 1)
    {
        for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
        {
            for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
            {
                for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
                {
                    slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * bd))] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                    dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                }
            }
        }
    }

    //--------------------------------------------------------------------
    // task of the first sub-group in the work-group
    //--------------------------------------------------------------------

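    // The first sub-group publishes its partial sums for the second half of the
    // output-depth block to SLM (using vstore4 for the first four columns when the
    // tile is wide enough) and keeps the first half in dotProd0.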
    if(ifm_part == 0)
    {
        for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
        {
            for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
            {
                uint width_offset = 0;
                #if (OUT_BLOCK_WIDTH) >= 4
                const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + OUT_BLOCK_DEPTH/2) ));
                float4 tmp = (float4)(dotProd1[width_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                      dotProd1[width_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
                vstore4(tmp, 0, slm_p + slm_off);
                width_offset += 4;
                #endif
                for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
                {
                    slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd+OUT_BLOCK_DEPTH/2) ))] = dotProd1[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
                }
            }
        }
    }

    //--------------------------------------------------------------------
    // add bias phase
    //--------------------------------------------------------------------

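    // Each sub-group adds the per-output-feature bias only to the output-depth half
    // it will write ((bd + ifm_offset) selects the bias), before waiting on the
    // barrier for the SLM exchange.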
    #if BIAS_TERM
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        float _bias = biases[f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id()];
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
            {
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _bias;
            }
        }
    }
    #endif

    barrier(CLK_LOCAL_MEM_FENCE); // the barrier is placed after the bias addition so that the latency of the long SLM writes is hidden behind it

    //--------------------------------------------------------------------
    // sum sub-group results + activation phase
    //--------------------------------------------------------------------

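    // Each sub-group reads the other sub-group's partial sums for its half of the
    // output depth from SLM (offset bd + ifm_offset), adds them to its own, and
    // applies the ACTIVATION macro.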
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            uint width_offset = 0;
            #if (OUT_BLOCK_WIDTH) >= 4
            const uint slm_off = OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ));
            float4 tmp = vload4(0, slm_p + slm_off);
            dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[0];
            dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[1];
            dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[2];
            dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += tmp[3];

            dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);

            width_offset += 4;
            #endif

            for(uint bc = width_offset; bc < OUT_BLOCK_WIDTH; bc++)
            {
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += slm_vals[bc + OUT_BLOCK_WIDTH * (get_sub_group_local_id() + SIMD_SIZE * (br + OUT_BLOCK_HEIGHT * (bd + ifm_offset) ))];
                dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] = ACTIVATION(dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)], NL_M, NL_N);
            }
        }
    }

    //--------------------------------------------------------------------
    // output phase
    //--------------------------------------------------------------------

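    // Each work-item stores its OUT_BLOCK_WIDTH results per row at output feature
    // f + (bd + ifm_offset) * SIMD_SIZE + local id, using vstore8/vstore4/vstore2
    // for the widest aligned chunks and scalar stores for any remainder.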
    for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
    {
        for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
        {
            uint dst_index = GET_DATA_INDEX(OUTPUT, b, f + (bd + ifm_offset) * SIMD_SIZE + get_sub_group_local_id(), group_y + br, group_x);
            uint out_vstore_offset = 0;
            #if (OUT_BLOCK_WIDTH >= 8)
            float8 tmp = (float8)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 4 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 5 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 6 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                  dotProd0[out_vstore_offset + 7 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore8(tmp, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 8;
            #endif
            #if (OUT_BLOCK_WIDTH % 8) > 3
            // distinct name from the float8 tmp above so both branches can be enabled together
            float4 tmp4 = (float4)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 2 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 3 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore4(tmp4, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 4;
            #endif
            #if (OUT_BLOCK_WIDTH % 4) > 1
            float2 tmp2 = (float2)(dotProd0[out_vstore_offset + 0 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)],
                                   dotProd0[out_vstore_offset + 1 + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)]);
            vstore2(tmp2, 0, output + dst_index + out_vstore_offset * OUTPUT_X_PITCH);
            out_vstore_offset += 2;
            #endif
            //dst_index += 4 * OUTPUT_X_PITCH;
            for(uint bc = out_vstore_offset; bc < OUT_BLOCK_WIDTH; bc++)
            {
                output[dst_index + bc * OUTPUT_X_PITCH] = dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)];
            }
        }
    }
}