2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "include/include_all.cl"
19 #if defined(cl_intel_subgroups_short)
21 #define TILE_K FILTER_SIZE_X
// NOTE(review): this file is a lossy, line-numbered dump of an OpenCL fp16
// convolution kernel: the leading integer on each line is the original file's
// own line number, and gaps in that numbering (e.g. 64 -> 67, 71 -> 76,
// 122 -> 126) show that source lines are missing -- braces, do/while loop
// headers, else-branches, the remaining kernel parameters (dst, split_idx),
// and the trailing #endif's. The comments below document intent only; recover
// the complete original file before attempting any logic change.
//
// Kernel contract (as far as visible here):
//   * compiled only when cl_intel_subgroups_short is available; required
//     sub-group size is 16 work-items
//   * src0 = input activations (half), src1 = partially interleaved filter
//     weights (half), bias = per-output-channel bias (half); further
//     parameters (dst, split_idx, ...) are referenced below but their
//     declarations are absent from this dump
//   * each work-item accumulates two half16 tiles (blockC00 / blockC10),
//     i.e. 2 x 16 output feature channels, then adds bias, applies
//     ACTIVATION, and scatters results along OUTPUT_FEATURE_PITCH
24 __attribute__((intel_reqd_sub_group_size(16)))
25 KERNEL(convolution_f16)(
26 const __global half *src0,
28 const __global half *src1,
30 const __global half *bias,
34 #include "include/vec_typedefs.cl"
// Work-group / work-item coordinates: x walks output-channel tiles, y walks
// flattened spatial output positions, z walks the batch.
36 const unsigned group_x = get_group_id(0);
37 const unsigned group_y = get_group_id(1);
38 const unsigned global_x = get_global_id(0);
39 const unsigned global_y = get_global_id(1);
40 const unsigned global_z = get_global_id(2);
42 unsigned interleaved_y;
46 // Result ctile (*dst) is M rows x N columns
47 // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.
// Two zero-initialised accumulator tiles of 16 output channels each.
48 half16 blockC00 = 0.f;
49 half16 blockC10 = 0.f;
// Offset (in elements) of this split's slice of the input tensor.
51 const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
52 // Src0 (patch input) is directly used as atile.
53 // Each work item points to the start of a different patch.
54 // atile is M rows x K columns.
55 #if defined(INPUT_BUFFER_WIDTH_PADDED) && defined(INPUT_BUFFER_HEIGHT_PADDED)
// Fully padded input: a single constant base offset suffices; no per-row
// boundary handling is needed later.
56 const uint src0_read_offset_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset
57 + INPUT0_BATCH_PITCH * global_z // batch offset
58 + ( ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset
59 + ( ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
60 #elif !defined(INPUT_BUFFER_WIDTH_PADDED) && !defined(INPUT_BUFFER_HEIGHT_PADDED)
// NOTE(review): this unpadded path is deliberately disabled via "#pragma
// error" -- it is unfinished. Several of its lines (the else-branches around
// the partial_left / partial_right computation) are missing from this dump.
61 #pragma error - fix this path
62 const int y_offset = ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y - PADDING_SIZE_Y;
63 const int x_offset = ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X - PADDING_SIZE_X;
64 uint src0_read_offset = INPUT_OFFSET + in_split_offset + INPUT0_BATCH_PITCH * global_z
65 + y_offset * INPUT0_Y_PITCH;
// partial_left / partial_right count the filter columns that fall outside the
// (unpadded) input row and must be zero-filled after each atile load.
67 int partial_left = 0, partial_right = 0;
70 partial_left = min((int) FILTER_SIZE_X, (int) abs(x_offset));
71 src0_read_offset -= partial_left;
76 src0_read_offset += x_offset;
78 if ((x_offset + FILTER_SIZE_X) >= INPUT_SIZE_X)
79 partial_right = min(FILTER_SIZE_X, INPUT_SIZE_X - x_offset);
81 partial_right = FILTER_SIZE_X;
83 #elif defined(INPUT_BUFFER_WIDTH_PADDED)
// NOTE(review): the width-padded-only path is likewise disabled and
// incomplete (its x offset handling is marked TODO below).
84 #pragma error - fix this path
85 // TODO: Handle offset
86 const int y_offset = ( global_y / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y -PADDING_SIZE_Y;
87 int src0_read_offset = in_split_offset + INPUT0_BATCH_PITCH * global_z // batch offset
88 + y_offset * INPUT0_Y_PITCH // y offset
89 + ( ( global_y % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset
92 // Src1 (filter) is directly used as btile.
93 // It starts at the top of src1 and walks down.
94 // btile is K rows x N columns.
95 uint src0_read_offset = src0_read_offset_const;
96 uint src1_read_offset = ( global_x * TILE_N * 2);
// Scalar-times-row multiply-accumulate: multiplies the scalar _rowA by 16
// filter values held one per sub-group lane in colB (fetched with
// sub_group_broadcast) and accumulates into the 16 components of _result.
// NOTE(review): the macro body's enclosing { ... } lines (orig. 99/116) are
// missing from this dump, so as written the continuation swallows the
// following typedef line.
98 #define DOT_PRODUCT_16( _result, _rowA, colB ) \
100 _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
101 _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
102 _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
103 _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
104 _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
105 _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
106 _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
107 _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
108 _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
109 _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
110 _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
111 _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
112 _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
113 _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
114 _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
115 _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
117 typedef CAT( half, FILTER_SIZE_X ) half_t;
118 // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
119 // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch
120 // and FILTER_SIZE_X/2 rows of interleaved filter.
// Outer loop: input feature slices (patch_depth); inner loop: filter rows
// (patch_row). NOTE(review): the "do {" headers and the patch_row declaration
// are among the lines missing from this dump -- the matching while()'s appear
// near the bottom (orig. 240 / 244).
121 unsigned patch_depth = 0;
122 __attribute__((opencl_unroll_hint(1)))
126 __attribute__((opencl_unroll_hint(1)))
129 // Load atile and btile.
130 // Kernel data is partially interleaved. Every 2 rows are interleaved at half16 granularity.
131 // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non
132 // interleaved row is padded with zero to ensure same size as interleaved rows. This
133 // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
134 // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3.
135 // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
136 // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...
137 // (0, 2) (16, 2) (32, 2) (48, 2) ... ...
139 const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1;
140 #if defined(INPUT_BUFFER_WIDTH_PADDED) && defined(INPUT_BUFFER_HEIGHT_PADDED)
142 // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop
143 half blockA00[FILTER_SIZE_X];
// Element-by-element atile load. The range check below guards reads near the
// end of the batch buffer; when the whole walk provably stays in bounds the
// unguarded load (orig. 156) is used instead.
146 LOOP(FILTER_SIZE_X, i,
149 if(src0_read_offset_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
151 if(src0_read_offset + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH)
152 blockA00[i] = src0[src0_read_offset + i];
156 blockA00[i] = src0[src0_read_offset + i];
160 half* pblockA00 = (half*)(&blockA00);
162 #elif !defined(INPUT_BUFFER_WIDTH_PADDED) && !defined(INPUT_BUFFER_HEIGHT_PADDED)
163 // TODO: Fixed vload issue in this path.
166 half* pblockA00 = (half*)(&blockA00);
// NOTE(review): "INPPUT_PADDING_Y" below looks like a typo (presumably
// INPUT_PADDING_Y or PADDING_SIZE_Y); an undefined macro evaluates to 0 in
// #if, which would silently disable this specialised 3x3/pad-1 branch --
// confirm against the macro definitions before relying on it.
167 #if (PADDING_SIZE_X == 1) && (INPPUT_PADDING_Y == 1) && (FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3)
// Rows fully above/below the input are zeroed (body missing from this dump);
// otherwise load the row and zero the single out-of-range edge column(s).
168 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
174 blockA00 = src0[src0_read_offset - partial_left];
175 if (partial_left) pblockA00[0] = 0;
176 if (partial_right != FILTER_SIZE_X) pblockA00[FILTER_SIZE_X - 1] = 0;
// General case: zero-fill partial_left leading and the trailing columns past
// partial_right after the (shifted) row load.
179 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
185 blockA00 = src0[src0_read_offset - partial_left];
186 for (unsigned i = 0; i < partial_left; ++i) pblockA00[i] = 0;
187 for (unsigned i = partial_right; i < FILTER_SIZE_X; ++i) pblockA00[i] = 0;
191 #elif defined(INPUT_BUFFER_WIDTH_PADDED)
192 // TODO: Fixed vload issue in this path.
// Width is padded, so only vertical out-of-range rows need handling here.
194 if ((y_offset + patch_row < 0) || ((y_offset + patch_row) >= INPUT_SIZE_Y))
200 blockA00 = src0[src0_read_offset];
// Advance atile pointer to the next input row for the next patch_row pass.
203 src0_read_offset += INPUT0_Y_PITCH;
// btile load: filter rows arrive pre-interleaved in pairs (see the layout
// comment above), so block-read 4 ushorts per row-pair and 2 ushorts for a
// trailing odd row; the aliasing pointers view the same staging buffer as
// ushort4 / ushort2 / half.
205 ushort blockB00[FILTER_SIZE_X * 2];
206 ushort4* p4BlockB00 = (ushort4*)blockB00;
207 ushort2* p2BlockB00 = (ushort2*)blockB00;
208 half* pBlockB00 = (half*)blockB00;
211 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
213 p4BlockB00[interleaved_y] = intel_sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
214 src1_read_offset += ALIGNED_OFM * 2;
216 if ( kernel_width_is_odd )
218 p2BlockB00[FILTER_SIZE_X - 1] = intel_sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
219 src1_read_offset += ALIGNED_OFM * 2;
// FMAD phase: each interleaved pair feeds two input pixels into both
// accumulator tiles; kernel_idx walks the staged filter values in order.
// NOTE(review): the kernel_idx declaration/reset is among the missing lines.
225 LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
227 kernel_y = interleaved_y * 2;
228 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
229 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
230 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
231 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
// Odd filter width: one last non-interleaved pixel for each tile.
233 if ( kernel_width_is_odd )
235 kernel_y = interleaved_y * 2;
236 DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
237 DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
240 while( ++patch_row < FILTER_SIZE_Y );
242 src0_read_offset += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch
244 while ( ++patch_depth < INPUT0_FEATURE_NUM );
246 #undef DOT_PRODUCT_16
// ---- Write-back phase: bias add, activation, scatter to dst ----
248 const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
249 // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
250 // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
251 __global half *out = dst + OUTPUT_OFFSET + out_split_offset
252 + global_z * OUTPUT_BATCH_PITCH // batch offset
253 + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset
254 + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset
255 + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset
// Skip work-items whose tile lies wholly past the spatial output extent
// (happens when OUTPUT_SIZE_X*OUTPUT_SIZE_Y is not a multiple of TILE_M).
258 if (global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y )
261 __global half16* biasPtr = (__global half16*) (bias + group_x * TILE_N);
// Four write-back variants, chosen by how OUTPUT_FEATURE_NUM divides the
// 32-channel (2 x 16) tile. NOTE(review): the #else / closing lines between
// the variants are partly missing from this dump.
// Variant 1: channel count divisible by TILE_N -- both tiles written in full.
264 #if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 )
267 blockC00 += *biasPtr;
268 blockC10 += *(biasPtr + 1);
271 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
272 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
274 for (unsigned i = 0; i < 16; i++)
276 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
277 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
// Variant 2: divisible by 16 -- full tiles for interior groups, a single
// 16-channel tile for the final group.
280 #elif ( ( OUTPUT_FEATURE_NUM % 16 ) == 0 )
281 if ( ( global_x + 1 ) < get_global_size(0) )
284 blockC00 += *biasPtr;
285 blockC10 += *(biasPtr + 1);
288 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
289 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
291 for ( unsigned i = 0; i < 16; i++ )
293 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
294 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
300 blockC00 += *biasPtr;
303 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
305 for (unsigned i = 0; i < 16; i++)
307 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
// Variants 3/4: ragged final group -- interior groups write both tiles in
// full; the last group writes only OUTPUT_FEATURE_NUM % 16 channels of the
// partial tile.
311 if ( ( global_x + 1 ) < get_global_size(0) )
314 blockC00 += *biasPtr;
315 blockC10 += *(biasPtr + 1);
318 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
319 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
321 for ( unsigned i = 0; i < 16; i++ )
323 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
324 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
329 #if ( (OUTPUT_FEATURE_NUM % TILE_N) > 16 )
332 blockC00 += *biasPtr;
333 blockC10 += *(biasPtr + 1);
336 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
337 blockC10 = ACTIVATION(blockC10, NL_M, NL_N);
339 for (unsigned i = 0; i < 16 ; i++)
341 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];
343 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 16 ; i++)
345 out[(16+i) * OUTPUT_FEATURE_PITCH] = blockC10[i];
349 blockC00 += *biasPtr;
352 blockC00 = ACTIVATION(blockC00, NL_M, NL_N);
354 for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 16 ; i++)
356 out[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i];