// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"

//////////////////////////////////////////////////////////////////////////////

#if defined(cl_intel_subgroups_short)

#define TILE_M          DY      // Height of tile in input patches (src0)
#define TILE_K          DX      // Width of tile in input patches (src0)
#define TILE_N          16      // Number of filter channels per tile (src1)

#define TILE_X          12      // Width of input tile loaded from src0
#define TILE_Y          10      // Height of input tile loaded from src0
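
// Note: DY, DX, ALIGNED_OFM, STRIDE_SIZE_X/Y, FILTER_SIZE_X/Y, KERNEL_SLICE_DIV2,
// RIGHT_PARTIAL_TILE_K and the INPUT0_*/OUTPUT_* layout macros are assumed to be
// supplied as host-side build options (-D defines), as is usual for these kernels.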

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_f16_10x12x16)(
    const __global half *src0,
    __global half *dst,
    const __global half *src1,
#if BIAS_TERM
    const __global half *biases,
#endif
    uint split_idx)
{
#include "include/vec_typedefs.cl"

    const unsigned global_x = get_global_id(0);
    const unsigned global_y = get_global_id(1);
    const unsigned global_z = get_global_id(2);
    const unsigned out_fm   = global_z % ALIGNED_OFM;
    const unsigned batch_id = global_z / ALIGNED_OFM;
    const unsigned group_x  = get_group_id(0);
    const unsigned group_z  = get_group_id(2);
    const unsigned max_group_x = get_num_groups(0);
    const unsigned local_z  = get_local_id(2);

    half blockC[TILE_M * TILE_K] = { 0 };
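    // blockC holds this work-item's accumulators: a TILE_M x TILE_K patch of
    // outputs for a single output feature map (one lane of the 16-wide subgroup).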

    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM;
    uint src0_offset_tile = INPUT0_OFFSET_WITH_PADDING                             // data offset
                          + in_split_offset                                        // split offset
                          + batch_id * INPUT0_BATCH_PITCH                          // batch offset
                          + ( global_y * TILE_M * STRIDE_SIZE_Y ) * INPUT0_Y_PITCH // y offset
                          + ( global_x * TILE_K * STRIDE_SIZE_X );                 // x offset
    uint src0_offset = src0_offset_tile
                     + ( local_z / ( TILE_X / 4 ) ) * INPUT0_Y_PITCH               // y tile offset
                     + ( local_z % ( TILE_X / 4 ) ) * 4;                           // x tile offset
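    // Lanes split each tile load cooperatively: every lane fetches a half4, so
    // TILE_X/4 = 3 lanes cover one 12-wide tile row and the 16 lanes of the
    // subgroup span tile rows 0..5 per block load.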

    const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;
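    // The factor of 2 matches the filter layout described below: within an
    // interleaved row pair, each output channel owns 2 consecutive halfs.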

    unsigned patch_depth = 0;
    __attribute__((opencl_unroll_hint(3)))
    do
    {
        // Load atile (input) and btile (filters).
        // Kernel data is partially interleaved: every 2 rows are interleaved at float16 granularity.
        // The exception is that if FILTER_SIZE_X is odd, the last row is not interleaved. The non-
        // interleaved row is padded with zeros so that it has the same size as the interleaved rows.
        // This interleaving increases the amount of consecutive data to fetch, which reduces the
        // number of loads required. For example, this is how the kernel data would be arranged
        // before/after interleaving for FILTER_SIZE_X=3.
        // (0, 0) (8, 0) (16, 0) (24, 0) ...      (0, 0) (0, 1) (8, 0) (8, 1) (16, 0) (16, 1) (24, 0) (24, 1) ...
        // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>  (0, 2) (8, 2) (16, 2) (24, 2) ...
        // (0, 2) (8, 2) (16, 2) (24, 2) ...      ...
#if ((INPUT0_Y_PITCH) % 4) == 0
        // Aligned to sizeof(half4): plain vector dereferences are safe.
        half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
        half4 blockA1 = *(const __global half4 *)( src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#elif ((INPUT0_Y_PITCH) % 2) == 0
        // The data is not aligned to sizeof(half)*4, so vload4 (or a per-element
        // copy loop) is required for the second load; the first one is aligned.
        half4 blockA0 = *(const __global half4 *)( src0 + src0_offset );
        half4 blockA1 = vload4(0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#else
        half4 blockA0 = vload4(0, src0 + src0_offset );
        half4 blockA1 = vload4(0, src0 + src0_offset + INPUT0_Y_PITCH * 5 );
#endif
        src0_offset += INPUT0_FEATURE_PITCH;

        half     blockB[FILTER_SIZE_X * FILTER_SIZE_Y];
        ushort2 *p2BlockB = (ushort2 *)blockB;
        ushort  *pBlockB  = (ushort *)blockB;
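        // KERNEL_SLICE_DIV2 is assumed to equal (FILTER_SIZE_X * FILTER_SIZE_Y) / 2:
        // each LOOP iteration below reads one pair of filter values per lane.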

        const bool kernel_slice_is_odd = ( FILTER_SIZE_X * FILTER_SIZE_Y ) % 2 == 1;
        unsigned interleaved_y = 0;
        LOOP(KERNEL_SLICE_DIV2, interleaved_y,
        {
            p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort *)src1_read );
            src1_read += ALIGNED_OFM * 2;
        } )
        if ( kernel_slice_is_odd )
        {
            pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort *)src1_read );
            src1_read += ALIGNED_OFM * 2;
        }

#define BLOCK_A(n) ( (n < 60) \
    ? sub_group_broadcast( blockA0[(n)%4], (n)/4 ) \
    : sub_group_broadcast( blockA1[(n-60)%4], (n-60)/4 ) )
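        // BLOCK_A(n) fetches element n of the flattened TILE_Y x TILE_X input tile:
        // blockA0 supplies the first 60 halfs (rows 0..4 of the 12-wide tile),
        // blockA1 the next 60 (rows 5..9); sub_group_broadcast selects the owning lane.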

        // Loop through all patches in the tile (patch_x/y).
        // For each patch, sum values over the filter window (x/y).
        unsigned patch_y = 0;
        LOOP(TILE_M, patch_y,
        {
            unsigned patch_x = 0;
            LOOP(TILE_K, patch_x,
            {
                unsigned tile_idx = patch_y * TILE_X * STRIDE_SIZE_Y + patch_x * STRIDE_SIZE_X;
                unsigned out_idx  = patch_y * TILE_K + patch_x;

                unsigned y = 0;
                LOOP(FILTER_SIZE_Y, y,
                {
                    unsigned x = 0;
                    LOOP(FILTER_SIZE_X, x,
                    {
                        unsigned offset_idx   = y * TILE_X + x;
                        unsigned out_chan_idx = y * FILTER_SIZE_X + x;

                        blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
                    } )
                } )
            } )
        } )
    }
    while ( ++patch_depth < INPUT0_FEATURE_NUM );

    // Dst resembles a cube of width x height x (output channel * batches). Each tile
    // writes TILE_K x TILE_M x SIMD values. Partial writes are likely when output
    // padding is used. Stores are grouped into vectors to speed up writeback, since
    // one large write is faster than many small ones. The right-most column may be
    // narrower when the output width is not divisible by the tile width.
    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    __global half *out = dst + OUTPUT_OFFSET + out_split_offset
                       + batch_id * OUTPUT_BATCH_PITCH           // batch offset
                       + out_fm * OUTPUT_FEATURE_PITCH           // channel offset
                       + ( global_y * TILE_M ) * OUTPUT_Y_PITCH  // y offset
                       + ( global_x * TILE_K );                  // x offset

    if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
    {
#if BIAS_TERM == 0
        const half bias = 0.h;
#else
        const half bias = biases[out_fm];
#endif
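        // When BIAS_TERM && BIAS_PER_OUTPUT, this value is shadowed below by a
        // per-element bias computed inside the store loops.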

        if ( OUTPUT_SIZE_X % TILE_K == 0 ||
             group_x < max_group_x - 1 )
        {
            typedef CAT( half, TILE_K ) half_t;
            for( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half *)&vBlockC;
                    for ( unsigned i = 0; i < TILE_K; i++ )
                    {
#if BIAS_TERM && BIAS_PER_OUTPUT
                        const unsigned bias_index = out_fm * OUTPUT_SIZE_X * OUTPUT_SIZE_Y
                                                  + ( global_y * TILE_M + y ) * OUTPUT_SIZE_X
                                                  + ( global_x * TILE_K + i );
                        const half bias = biases[bias_index];
#endif
                        pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
                        ((__global half *)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
                    }
                    //*(__global half_t *)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
        else // Partial right-most tile: OUTPUT_SIZE_X is not divisible by TILE_K.
        {
            typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
            for( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half *)&vBlockC;
                    for ( unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++ )
                    {
#if BIAS_TERM && BIAS_PER_OUTPUT
                        const unsigned bias_index = out_fm * OUTPUT_SIZE_X * OUTPUT_SIZE_Y
                                                  + ( global_y * TILE_M + y ) * OUTPUT_SIZE_X
                                                  + ( global_x * TILE_K + i );
                        const half bias = biases[bias_index];
#endif
                        pvBlockC[i] = ACTIVATION(blockC[y * TILE_K + i] + bias, NL_M, NL_N);
                        ((__global half *)(out + y * OUTPUT_Y_PITCH))[i] = pvBlockC[i];
                    }
                    //*(__global half_t *)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
    }
}

#endif // cl_intel_subgroups_short