inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_direct_10_12_16.cl

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "include/include_all.cl"
  18
  19 //////////////////////////////////////////////////////////////////////////////
  20 // Direct Convolution
  21 #if defined(cl_intel_subgroups_short)
  22
  23 #define TILE_M          DY      // Height of tile in input patches (src0)
  24 #define TILE_K          DX      // Width of tile in input patches (src0)
  25 #define TILE_N          16      // Num filter channels per tile (src1)
  26
  27 #define TILE_X          12      // Width of tile loaded in input (src0)
  28 #define TILE_Y          10      // Height of tile loaded in input (src0)
  29
  30 __attribute__((intel_reqd_sub_group_size(16)))
  31 KERNEL(convolution_f16_10x12x16)(
  32     const __global half *src0,
  33     __global half *dst,
  34     const __global half *src1,
  35 #if BIAS_TERM
  36     const __global half *biases,
  37 #endif
  38     uint split_idx)
  39 {
  40 #include "include/vec_typedefs.cl"
  41
  42     const unsigned global_x = get_global_id(0);
  43     const unsigned global_y = get_global_id(1);
  44     const unsigned global_z = get_global_id(2);
  45     const unsigned out_fm   = global_z % ALIGNED_OFM;
  46     const unsigned batch_id = global_z / ALIGNED_OFM;
  47     const unsigned group_x = get_group_id(0);
  48     const unsigned group_z = get_group_id(2);
  49     const unsigned max_group_x = get_num_groups(0);
  50     const unsigned local_z = get_local_id(2);
  51
  52     half blockC[TILE_M * TILE_K] = { 0 };
  53
  54     const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
  55     uint src0_offset_tile = INPUT0_OFFSET_WITH_PADDING      // data offset
  56      + in_split_offset
  57      + batch_id * INPUT0_BATCH_PITCH                        // batch offset
  58      + ( global_y * TILE_M * STRIDE_SIZE_Y ) * INPUT0_Y_PITCH    // y offset
  59      + ( global_x * TILE_K * STRIDE_SIZE_X );                    // x offset
  60     uint src0_offset = src0_offset_tile
  61      + ( local_z / ( TILE_X / 4 ) ) * INPUT0_Y_PITCH        // y tile offset
  62      + ( local_z % ( TILE_X / 4 ) ) * 4;                    // x tile offset
  63
  64     const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;
  65
  66     unsigned patch_depth = 0;
  67     __attribute__((opencl_unroll_hint(3)))
  68     do
  69     {
  70         // Load atile (input) and btile (filters).
  71         // Kernel data is partially interleaved.  Every 2 rows are interleaved at float16 granularity.
  72         // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved.  The non
  73         // interleaved row is padded with zero to ensure same size as interleaved rows. This
  74         // interleaving is done to increase consecutive data to fetch which reduces loads required.
  75         // For example, this is how the kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3.
  76         // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
  77         // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
  78         // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
  79         // ...
  80
  81         #if ((INPUT0_Y_PITCH) % 4) == 0
  82         // aligned - can ignore vload
  83         half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
  84         half4 blockA1 = *(const __global half4 *)( src0 + src0_offset + INPUT0_Y_PITCH * 5 );
  85         #elif ((INPUT0_Y_PITCH) % 2) == 0
  86         // in case the data is not aligned to sizeof(T)*4 we need to use vload or set the data in a loop
  87         // first one is aligned
  88         half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
  89         half4 blockA1 = vload4(0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
  90         #else
  91         half4 blockA0 = vload4(0, src0 + src0_offset );
  92         half4 blockA1 = vload4(0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
  93         #endif
  94         src0_offset += INPUT0_FEATURE_PITCH;
  95
  96         half blockB[FILTER_SIZE_X * FILTER_SIZE_Y];
  97         ushort2* p2BlockB = (ushort2*)blockB;
  98         ushort*  pBlockB =  (ushort* )blockB;
  99
 100         const bool kernel_slice_is_odd = ( FILTER_SIZE_X * FILTER_SIZE_Y ) % 2 == 1;
 101         unsigned interleaved_y = 0;
 102         LOOP(KERNEL_SLICE_DIV2, interleaved_y,
 103         {
 104             p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
 105             src1_read += ALIGNED_OFM * 2;
 106         } )
 107         if ( kernel_slice_is_odd )
 108         {
 109             pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
 110             src1_read += ALIGNED_OFM * 2;
 111         }
 112
 113 #define BLOCK_A(n) ( (n < 60) \
 114     ? sub_group_broadcast( blockA0[(n)%4], (n)/4 ) \
 115     : sub_group_broadcast( blockA1[(n-60)%4], (n-60)/4 ) )
 116
 117         // Perform MADs
 118         // Loop through all patches in tile (patch_x/y)
 119         // For each patch, sum values (x/y)
 120         unsigned patch_y=0;
 121         LOOP(TILE_M, patch_y,
 122         {
 123             unsigned patch_x=0;
 124             LOOP(TILE_K, patch_x,
 125             {
 126                 unsigned tile_idx = patch_y * TILE_X * STRIDE_SIZE_Y + patch_x * STRIDE_SIZE_X;
 127                 unsigned out_idx  = patch_y * TILE_K + patch_x;
 128
 129                 unsigned y=0;
 130                 LOOP(FILTER_SIZE_Y, y,
 131                 {
 132                     unsigned x=0;
 133                     LOOP(FILTER_SIZE_X, x,
 134                     {
 135                         unsigned offset_idx = y * TILE_X + x;
 136                         unsigned out_chan_idx = y * FILTER_SIZE_X + x;
 137
 138                         blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
 139                     } )
 140                 } )
 141             } )
 142         } )
 143     }
 144     while ( ++patch_depth < INPUT0_FEATURE_NUM );
 145
 146     // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
 147     // TILE_K x TILE_M x SIMD.  Partial writes most likely generated if output padding used.
 148     // Group stores into vectors to expedite writeback.  One large write is faster than many
 149     // small saves. Right-most column may be smaller if output width not divisible by tile width.
 150     const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
 151     __global half *out = dst + OUTPUT_OFFSET + out_split_offset +
 152      + batch_id * OUTPUT_BATCH_PITCH            // batch offset
 153      + out_fm * OUTPUT_FEATURE_PITCH              // channel offset
 154      + ( global_y * TILE_M ) * OUTPUT_Y_PITCH // y offset
 155      + ( global_x * TILE_K );                // x offset
 156
 157     if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
 158     {
 159 #if BIAS_TERM == 0
 160         const half bias = 0.h;
 161 #elif BIAS_PER_OFM
 162         const half bias = biases[out_fm];
 163 #endif
 164
 165         if ( OUTPUT_SIZE_X % TILE_K == 0 ||
 166              group_x < max_group_x - 1 )
 167         {
 168             typedef CAT( half, TILE_K ) half_t;
 169             for( unsigned y = 0; y < TILE_M; y++ )
 170             {
 171                 if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
 172                 {
 173                     half_t vBlockC;
 174                     half *pvBlockC = (half*)&vBlockC;
 175                     for (unsigned i = 0; i < TILE_K; i++)
 176                     {
 177                     #if BIAS_TERM && BIAS_PER_OUTPUT
 178                         const unsigned bias_index = out_fm*OUTPUT_SIZE_X*OUTPUT_SIZE_Y + ( global_y * TILE_M + y )*OUTPUT_SIZE_X + ( global_x * TILE_K + i);
 179                         const half bias = biases[bias_index];
 180                     #endif
 181                         pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
 182                         ((__global half*)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
 183                     }
 184                     //*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
 185                 }
 186             }
 187         }
 188         else
 189         {
 190             typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
 191             for( unsigned y = 0; y < TILE_M; y++ )
 192             {
 193                 if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
 194                 {
 195                     half_t vBlockC;
 196                     half *pvBlockC = (half*)&vBlockC;
 197                     for (unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++)
 198                     {
 199                     #if BIAS_TERM && BIAS_PER_OUTPUT
 200                         const unsigned bias_index = out_fm*OUTPUT_SIZE_X*OUTPUT_SIZE_Y + ( global_y * TILE_M + y )*OUTPUT_SIZE_X + ( global_x * TILE_K + i);
 201                         const half bias = biases[bias_index];
 202                     #endif
 203                         pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
 204                         ((__global half*)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
 205                     }
 206                     //*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
 207                 }
 208             }
 209         }
 210     }
 211 }
 212 #endif // cl_intel_subgroups_short