/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "include/include_all.cl"

//////////////////////////////////////////////////////////////////////////////
// Direct Convolution
#if defined(cl_intel_subgroups_short)

#define TILE_M          DY      // Height of tile in input patches (src0)
#define TILE_K          DX      // Width of tile in input patches (src0)
#define TILE_N          16      // Number of filter channels per tile (src1)
#define TILE_X          12      // Width of tile loaded from input (src0)
#define TILE_Y          10      // Height of tile loaded from input (src0)

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_f16_10x12x16)(
    const __global half *src0,
    __global half *dst,
    const __global half *src1,
#if BIAS_TERM
    const __global half *biases,
#endif
    uint split_idx)
{
#include "include/vec_typedefs.cl"

    const unsigned global_x    = get_global_id(0);
    const unsigned global_y    = get_global_id(1);
    const unsigned global_z    = get_global_id(2);
    const unsigned out_fm      = global_z % ALIGNED_OFM;
    const unsigned batch_id    = global_z / ALIGNED_OFM;
    const unsigned group_x     = get_group_id(0);
    const unsigned group_z     = get_group_id(2);
    const unsigned max_group_x = get_num_groups(0);
    const unsigned local_z     = get_local_id(2);

    // Accumulators: one output value per input patch in the tile.
    half blockC[TILE_M * TILE_K] = { 0 };

    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
    uint src0_offset_tile = INPUT0_OFFSET_WITH_PADDING               // data offset
        + in_split_offset
        + batch_id * INPUT0_BATCH_PITCH                              // batch offset
        + ( global_y * TILE_M * STRIDE_SIZE_Y ) * INPUT0_Y_PITCH     // y offset
        + ( global_x * TILE_K * STRIDE_SIZE_X );                     // x offset
    uint src0_offset = src0_offset_tile
        + ( local_z / ( TILE_X / 4 ) ) * INPUT0_Y_PITCH              // y tile offset
        + ( local_z % ( TILE_X / 4 ) ) * 4;                          // x tile offset

    const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;

    unsigned patch_depth = 0;
    __attribute__((opencl_unroll_hint(3)))
    do
    {
        // Load atile (input) and btile (filters).
        // Kernel data is partially interleaved: every 2 rows are interleaved at float16 granularity.
        // The exception is that if FILTER_SIZE_X is odd, the last row is not interleaved; the
        // non-interleaved row is zero-padded to the same size as the interleaved rows. This
        // interleaving increases the amount of consecutive data per fetch, which reduces the
        // number of loads required. For example, this is how the kernel data is arranged
        // before/after interleaving for FILTER_SIZE_X=3:
        // (0, 0) (8, 0) (16, 0) (24, 0) ...      (0, 0) (0, 1) (8, 0) (8, 1) (16, 0) (16, 1) (24, 0) ..
        // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>  (0, 2) (8, 2) (16, 2) (24, 2) ...
        // (0, 2) (8, 2) (16, 2) (24, 2) ...      ...
        // ...
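        // Layout sketch (derived from the defines above, not part of the original comments):
        // each of the 16 subgroup lanes loads one half4 per block, so blockA0/blockA1 below
        // together hold 2 * 16 * 4 = 128 halfs for a TILE_X * TILE_Y = 12 * 10 = 120-element
        // input tile. blockA0 covers rows 0..4 (5 rows * 12 halfs = 60 elements; lane 15
        // fetches a redundant quad from row 5), and blockA1, loaded INPUT0_Y_PITCH * 5 further
        // on, covers rows 5..9. This is why the BLOCK_A(n) macro defined later switches from
        // blockA0 to blockA1 at n == 60.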
#if ((INPUT0_Y_PITCH) % 4) == 0
        // Aligned to sizeof(half)*4 - direct half4 loads are safe.
        half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
        half4 blockA1 = *(const __global half4 *)( src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#elif ((INPUT0_Y_PITCH) % 2) == 0
        // If the data is not aligned to sizeof(half)*4 we must use vload4 (or copy the data
        // element by element). The first load is still aligned here.
        half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
        half4 blockA1 = vload4( 0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#else
        half4 blockA0 = vload4( 0, src0 + src0_offset );
        half4 blockA1 = vload4( 0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#endif
        src0_offset += INPUT0_FEATURE_PITCH;

        half blockB[FILTER_SIZE_X * FILTER_SIZE_Y];
        ushort2* p2BlockB = (ushort2*)blockB;
        ushort*  pBlockB  = (ushort* )blockB;

        const bool kernel_slice_is_odd = ( FILTER_SIZE_X * FILTER_SIZE_Y ) % 2 == 1;
        unsigned interleaved_y = 0;
        LOOP(KERNEL_SLICE_DIV2, interleaved_y,
        {
            p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
            src1_read += ALIGNED_OFM * 2;
        } )
        if ( kernel_slice_is_odd )
        {
            pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
            src1_read += ALIGNED_OFM * 2;
        }

#define BLOCK_A(n) ( (n < 60) \
    ? sub_group_broadcast( blockA0[(n)%4], (n)/4 ) \
    : sub_group_broadcast( blockA1[(n-60)%4], (n-60)/4 ) )

        // Perform MADs:
        // loop through all patches in the tile (patch_x/patch_y) and,
        // for each patch, sum over the filter window (x/y).
        unsigned patch_y = 0;
        LOOP(TILE_M, patch_y,
        {
            unsigned patch_x = 0;
            LOOP(TILE_K, patch_x,
            {
                unsigned tile_idx = patch_y * TILE_X * STRIDE_SIZE_Y + patch_x * STRIDE_SIZE_X;
                unsigned out_idx  = patch_y * TILE_K + patch_x;

                unsigned y = 0;
                LOOP(FILTER_SIZE_Y, y,
                {
                    unsigned x = 0;
                    LOOP(FILTER_SIZE_X, x,
                    {
                        unsigned offset_idx   = y * TILE_X + x;
                        unsigned out_chan_idx = y * FILTER_SIZE_X + x;

                        blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
                    } )
                } )
            } )
        } )
    }
    while ( ++patch_depth < INPUT0_FEATURE_NUM );

    // Dst resembles a cube of width x height x (output channels * batches). Each tile writes
    // TILE_K x TILE_M x SIMD values. Partial writes are most likely generated if output
    // padding is used. Stores are grouped into vectors to expedite writeback: one large write
    // is faster than many small ones. The right-most column may be smaller if the output
    // width is not divisible by the tile width.
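    // Worked example of the tile split below (hypothetical sizes, not from the original
    // source): with TILE_K == 4 and OUTPUT_SIZE_X == 10, groups with
    // group_x < max_group_x - 1 store full 4-wide rows, while the right-most group takes
    // the else-branch and stores RIGHT_PARTIAL_TILE_K columns per row - presumably
    // OUTPUT_SIZE_X % TILE_K, i.e. 10 % 4 == 2 here, supplied by the host-side JIT.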
    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    __global half *out = dst + OUTPUT_OFFSET + out_split_offset
        + batch_id * OUTPUT_BATCH_PITCH             // batch offset
        + out_fm * OUTPUT_FEATURE_PITCH             // channel offset
        + ( global_y * TILE_M ) * OUTPUT_Y_PITCH    // y offset
        + ( global_x * TILE_K );                    // x offset

    if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
    {
#if BIAS_TERM == 0
        const half bias = 0.h;
#elif BIAS_PER_OFM
        const half bias = biases[out_fm];
#endif
        if ( OUTPUT_SIZE_X % TILE_K == 0 ||
             group_x < max_group_x - 1 )
        {
            // Full-width tiles: write TILE_K outputs per row.
            typedef CAT( half, TILE_K ) half_t;
            for ( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half*)&vBlockC;

                    for ( unsigned i = 0; i < TILE_K; i++ )
                    {
#if BIAS_TERM && BIAS_PER_OUTPUT
                        const unsigned bias_index = out_fm * OUTPUT_SIZE_X * OUTPUT_SIZE_Y
                            + ( global_y * TILE_M + y ) * OUTPUT_SIZE_X
                            + ( global_x * TILE_K + i );
                        const half bias = biases[bias_index];
#endif
                        pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
                        ((__global half*)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
                    }
                    //*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
        else
        {
            // Right-most, partial-width tiles: write RIGHT_PARTIAL_TILE_K outputs per row.
            typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
            for ( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half*)&vBlockC;

                    for ( unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++ )
                    {
#if BIAS_TERM && BIAS_PER_OUTPUT
                        const unsigned bias_index = out_fm * OUTPUT_SIZE_X * OUTPUT_SIZE_Y
                            + ( global_y * TILE_M + y ) * OUTPUT_SIZE_X
                            + ( global_x * TILE_K + i );
                        const half bias = biases[bias_index];
#endif
                        pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
                        ((__global half*)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
                    }
                    //*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
    }
}
#endif // cl_intel_subgroups_short
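
// Dispatch-geometry sketch (an assumption inferred from the indexing above, not stated in
// this file): the kernel appears to expect an NDRange of roughly
//   global = { ceil_div(OUTPUT_SIZE_X, TILE_K), ceil_div(OUTPUT_SIZE_Y, TILE_M),
//              ALIGNED_OFM * OUTPUT_BATCH_NUM }
//   local  = { 1, 1, 16 }
// (ceil_div is shorthand here, not a macro from this file), so that global_z decomposes
// into (batch_id, out_fm), each 16-lane subgroup covers 16 consecutive output feature
// maps, and each work-group produces one TILE_K x TILE_M output tile.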