inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/include_all.cl"
  16
  17 #define LOCAL_SIZE INPUT0_BATCH_NUM
  18
  19 __attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
  20 KERNEL(convolution)(
  21     __global INPUT0_TYPE* input,
  22     __global OUTPUT_TYPE* output,
  23     __global FILTER_TYPE* weights,
  24 #if BIAS_TERM
  25     __global BIAS_TYPE* biases,
  26 #endif
  27     uint split_idx,
  28     __global INPUT0_TYPE* scale_in
  29 #if SCALE_BIAS_TERM
  30     , __global INPUT0_TYPE* scale_bias
  31 #endif
  32 #if FUSED_TRAINING
  33     , __global INPUT0_TYPE* inv_var,
  34     __global INPUT0_TYPE* conv_output,
  35     __global INPUT0_TYPE* bn_output
  36 #endif
  37     )
  38 {
  39     const uint f = get_global_id(1);
  40     const uint b = get_global_id(0);
  41
  42     UNIT_TYPE conv_out = UNIT_VAL_ZERO;
  43
  44     const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
  45
  46     const uint filter_offset = f*FILTER_OFM_PITCH;
  47     const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
  48
  49     for (uint y = 0; y < OUTPUT_SIZE_Y; ++y)
  50     {
  51         const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
  52         for (uint x = 0; x < OUTPUT_SIZE_X; ++x)
  53         {
  54             const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
  55             for (uint k = 0; k < FILTER_IFM_NUM; ++k)
  56             {
  57                 for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
  58                 {
  59                     const int input_offset_y = input_y + j * DILATION_SIZE_Y;
  60                     const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
  61
  62                     if(!zero_y)
  63                     {
  64                         for (uint i = 0; i < FILTER_SIZE_X ; ++i)
  65                         {
  66                             const int input_offset_x = input_x + i * DILATION_SIZE_X;
  67                             const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
  68
  69                             if(!zero_x)
  70                             {
  71                                 uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*INPUT0_FEATURE_PITCH;
  72                                 uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
  73                                 conv_out += input[input_idx] * weights[filter_idx];
  74                             }
  75                         }
  76                     }
  77                 }
  78             }
  79 #if BIAS_TERM
  80                 conv_out += (UNIT_TYPE)biases[f];
  81 #endif
  82
  83                 const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
  84                 const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset;
  85 #ifdef FUSED_TRAINING
  86                 conv_output[dst_index] = conv_out;
  87 #else
  88                 output[dst_index] = conv_out;
  89 #endif
  90         }
  91     }
  92
  93
  94     // BATCH NORM PART
  95     barrier(CLK_LOCAL_MEM_FENCE);
  96
  97     __local ACCUMULATOR_TYPE sum[LOCAL_SIZE];
  98
  99     const uint local_idx = b;
 100
 101     sum[local_idx] = 0;
 102
 103     uint input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0);
 104     for (uint y = 0; y < OUTPUT_SIZE_Y; y++)
 105     {
 106         for (uint x = 0; x < OUTPUT_SIZE_X; x++)
 107         {
 108 #ifdef FUSED_TRAINING
 109             UNIT_TYPE in = conv_output[input_idx];
 110 #else
 111             UNIT_TYPE in = output[input_idx];
 112 #endif
 113             sum[local_idx] += in;
 114             input_idx += OUTPUT_X_PITCH;
 115         }
 116         input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH;
 117     }
 118
 119     barrier(CLK_LOCAL_MEM_FENCE);
 120
 121     for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
 122     {
 123         if (local_idx < offset)
 124         {
 125             sum[local_idx] += sum[local_idx + offset];
 126         }
 127         barrier(CLK_LOCAL_MEM_FENCE);
 128     }
 129
 130     UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
 131
 132     sum[local_idx] = 0;
 133
 134     input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0);
 135     for (uint y = 0; y < OUTPUT_SIZE_Y; y++)
 136     {
 137         for (uint x = 0; x < OUTPUT_SIZE_X; x++)
 138         {
 139 #ifdef FUSED_TRAINING
 140             UNIT_TYPE in = conv_output[input_idx] - mean;
 141 #else
 142             UNIT_TYPE in = output[input_idx] - mean;
 143 #endif
 144             sum[local_idx] += in * in;
 145             input_idx += OUTPUT_X_PITCH;
 146         }
 147         input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH;
 148     }
 149
 150     barrier(CLK_LOCAL_MEM_FENCE);
 151
 152     for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
 153     {
 154         if (local_idx < offset)
 155         {
 156             sum[local_idx] += sum[local_idx + offset];
 157         }
 158         barrier(CLK_LOCAL_MEM_FENCE);
 159     }
 160
 161     float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y);
 162
 163     float inv_variance = (float)(1.0 / sqrt(variance + EPSILON));
 164
 165 #ifdef FUSED_TRAINING
 166     if (local_idx == 0)
 167         inv_var[f] = inv_variance;
 168 #endif
 169
 170     uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0);
 171     for (uint y = 0; y < OUTPUT_SIZE_Y; y++)
 172     {
 173         for (uint x = 0; x < OUTPUT_SIZE_X; x++)
 174         {
 175 #ifdef FUSED_TRAINING
 176             UNIT_TYPE out_val = inv_variance * (conv_output[out_idx] - mean);
 177             bn_output[out_idx] = out_val;
 178 #ifdef SCALE_BIAS_TERM
 179             output[out_idx] = ACTIVATION(out_val * scale_in[f] + scale_bias[f], NL_M, NL_N);
 180 #else
 181             output[out_idx] = ACTIVATION(out_val * scale_in[f], NL_M, NL_N);
 182 #endif
 183 #else
 184 #ifdef SCALE_BIAS_TERM
 185             output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f] + scale_bias[f], NL_M, NL_N);
 186 #else
 187             output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f], NL_M, NL_N);
 188 #endif
 189 #endif
 190             out_idx += OUTPUT_X_PITCH;
 191         }
 192         out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH;
 193     }
 194
 195 }
 196
 197 #undef LOCAL_SIZE