2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
// Linear element index for the tensor selected by `prefix`.
// The expansion reads <prefix>_OFFSET plus one pitch constant per dimension
// (<prefix>_X_PITCH, _Y_PITCH, _FEATURE_PITCH, _BATCH_PITCH) and multiplies
// each coordinate by its pitch.
// The whole expansion is parenthesized so the macro keeps its value when it is
// used as an operand of a larger expression (e.g. `2 * GET_DATA_INDEX(...)`).
#define GET_DATA_INDEX(prefix, b, f, y, x)      \
    (CAT(prefix, _OFFSET) +                     \
     (x) * CAT(prefix, _X_PITCH) +              \
     (y) * CAT(prefix, _Y_PITCH) +              \
     (f) * CAT(prefix, _FEATURE_PITCH) +        \
     (b) * CAT(prefix, _BATCH_PITCH))
// Same pitch-based index as a plain data index, but every coordinate is first
// reduced modulo the matching extent (<prefix>_SIZE_X, _SIZE_Y, _FEATURE_NUM,
// _BATCH_NUM), so an out-of-range coordinate wraps around instead of indexing
// past that dimension.
// Each argument is parenthesized before the `%` so that an expression argument
// such as `x + 1` is reduced as a whole; the full expansion is parenthesized
// as well.
#define GET_DATA_INDEX_SAFE(prefix, b, f, y, x)                             \
    (CAT(prefix, _OFFSET) +                                                 \
     ((x) % CAT(prefix, _SIZE_X))      * CAT(prefix, _X_PITCH) +            \
     ((y) % CAT(prefix, _SIZE_Y))      * CAT(prefix, _Y_PITCH) +            \
     ((f) % CAT(prefix, _FEATURE_NUM)) * CAT(prefix, _FEATURE_PITCH) +      \
     ((b) % CAT(prefix, _BATCH_NUM))   * CAT(prefix, _BATCH_PITCH))
// Data index macro that splits the batch coordinate by `sub_group_size`:
// the visible text adds <prefix>_OFFSET, (b) % (sub_group_size), the x/y/f
// pitch terms, and ((b) / (sub_group_size)) * <prefix>_BATCH_PITCH.
// NOTE(review): the embedded listing numbers jump 36 -> 38 and stop at 41,
// and the last visible line still ends with a continuation backslash, so part
// of this macro appears to be missing from this copy. Restore the missing
// lines from the upstream file before relying on the expansion.
34 #define GET_DATA_BS_FYX_BSV8_INDEX(prefix, b, f, y, x, sub_group_size) \
35 CAT(prefix, _OFFSET) + \
36 ((b) % (sub_group_size)) + \
38 (x)*CAT(prefix, _X_PITCH) + \
39 (y)*CAT(prefix, _Y_PITCH) + \
40 (f)*CAT(prefix, _FEATURE_PITCH) + \
41 ((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH) \
// (end of the visible macro text; this comment line absorbs the dangling continuation above)
44 inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset)
46 const uint xy_idx = x + y * x_size;
47 const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
48 const uint xy_block_num = (x_size * y_size + 16 - 1) / 16;
49 const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
50 const uint f_block_num = (f_size + 8 - 1) / 8;
51 const uint b_offset = b * f_block_num * xy_block_num * 128;
53 const size_t idx = offset + xy_offset + f_offset + b_offset;
58 inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint offset)
60 const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32;
61 const uint b_offset = b * b_pitch;
62 const uint xy_offset = f_aligned_to_32 * x + y_pitch * y;
63 const uint f_offset = f;
64 const size_t idx = offset + xy_offset + b_offset + f_offset;
// Calls get_byxf_af32_index for the tensor selected by `prefix`, supplying
// <prefix>_Y_PITCH, <prefix>_BATCH_PITCH, <prefix>_FEATURE_NUM and
// <prefix>_OFFSET for the callee's y_pitch, b_pitch, f_size and offset.
// NOTE(review): the damaged listing ended after the _FEATURE_NUM argument with
// the call left open; the final <prefix>_OFFSET argument is restored to match
// the callee's eighth parameter (`offset`).
#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x)    \
    FUNC_CALL(get_byxf_af32_index)(                     \
        (b), (f), (y), (x),                             \
        CAT(prefix, _Y_PITCH),                          \
        CAT(prefix, _BATCH_PITCH),                      \
        CAT(prefix, _FEATURE_NUM),                      \
        CAT(prefix, _OFFSET))
75 inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x,
76 uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset)
78 const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4;
79 const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
80 const uint b_offset = b * b_pitch;
81 const uint xy_offset = x * x_pitch + y * y_pitch;
82 const uint f_offset = f;
83 const size_t idx = offset + xy_offset + b_offset + f_offset;
// Calls get_byx8_f4_index for the tensor selected by `prefix`, supplying the
// <prefix>_X_PITCH, _Y_PITCH, _BATCH_PITCH, _FEATURE_NUM, _SIZE_X and _OFFSET
// constants in the callee's parameter order.
// NOTE(review): the damaged listing ended after the _SIZE_X argument with the
// call left open; the final <prefix>_OFFSET argument is restored to match the
// callee's tenth parameter (`offset`).
#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x)      \
    FUNC_CALL(get_byx8_f4_index)(                       \
        (b), (f), (y), (x),                             \
        CAT(prefix, _X_PITCH),                          \
        CAT(prefix, _Y_PITCH),                          \
        CAT(prefix, _BATCH_PITCH),                      \
        CAT(prefix, _FEATURE_NUM),                      \
        CAT(prefix, _SIZE_X),                           \
        CAT(prefix, _OFFSET))
// Calls get_bf8_xy16_index for the tensor selected by `prefix`, supplying
// <prefix>_SIZE_X, <prefix>_SIZE_Y, <prefix>_FEATURE_NUM and <prefix>_OFFSET
// for the callee's x_size, y_size, f_size and offset.
#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x)     \
    FUNC_CALL(get_bf8_xy16_index)(                      \
        (b), (f), (y), (x),                             \
        CAT(prefix, _SIZE_X),                           \
        CAT(prefix, _SIZE_Y),                           \
        CAT(prefix, _FEATURE_NUM),                      \
        CAT(prefix, _OFFSET))
// Index helper that splits the feature coordinate into f / 32 and f % 32 and
// the batch coordinate into b / 4 and b % 4. Pitches are derived from the
// padded spatial extents (pad_before + size + pad_after) and from size_b
// rounded up to a multiple of 4.
// NOTE(review): no visible line references the x or y parameters, the body has
// no braces and no return statement, and the embedded listing numbers skip
// 107, 122-123 and 126-129. Statements appear to be missing from this copy;
// restore them from the upstream file rather than inferring them.
103 inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x,
104 uint x_pad_before, uint x_size, uint x_pad_after,
105 uint y_pad_before, uint y_size, uint y_pad_after,
106 uint size_f, uint size_b)
// f_32_aligned is computed but not referenced by any visible line.
108 const uint f_32_aligned = ((size_f + 31)/32) * 32;
109 const uint b_4_aligned = ((size_b + 3)/4) * 4;
// Position inside a 32-feature / 4-batch group, and the group numbers.
110 const uint fsv_idx = f % 32;
111 const uint bsv_idx = b % 4;
112 const uint fs_idx = f / 32;
113 const uint bs_idx = b / 4;
// One x step covers a full 32 * 4 group; a row covers the padded width,
// a batch slice covers the padded height, a feature slice covers all
// batch slices (b_4_aligned / 4 of them).
115 const uint x_pitch = 32 * 4;
116 const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after);
117 const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after);
118 const uint fs_pitch = bs_pitch * (b_4_aligned / 4);
// Starting offset that skips the leading padding in x and y.
119 uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before;
121 size_t idx = offset + fsv_idx + bsv_idx * 32;
// NOTE(review): listing lines 122-123 are absent here; presumably the x and y
// contributions were added at this point — confirm against upstream.
124 idx += bs_idx * bs_pitch;
125 idx += fs_idx * fs_pitch;
// Calls get_fs_bs_yx_bsv4_fsv32_index for the tensor selected by `prefix`,
// supplying the x and y padding/size constants followed by
// <prefix>_FEATURE_NUM and <prefix>_BATCH_NUM, in the callee's parameter order.
// NOTE(review): the damaged listing had no line carrying the coordinates
// (listing line 132 absent); `b, f, y, x` is restored as the leading arguments
// to match the callee's first four parameters.
#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x)  \
    FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)(                   \
        (b), (f), (y), (x),                                     \
        CAT(prefix, _PAD_BEFORE_SIZE_X),                        \
        CAT(prefix, _SIZE_X),                                   \
        CAT(prefix, _PAD_AFTER_SIZE_X),                         \
        CAT(prefix, _PAD_BEFORE_SIZE_Y),                        \
        CAT(prefix, _SIZE_Y),                                   \
        CAT(prefix, _PAD_AFTER_SIZE_Y),                         \
        CAT(prefix, _FEATURE_NUM),                              \
        CAT(prefix, _BATCH_NUM))
// Linear element index for the filter selected by `prefix`.
// The expansion reads <prefix>_OFFSET plus one pitch constant per dimension
// (<prefix>_X_PITCH, _Y_PITCH, _IFM_PITCH, _OFM_PITCH) and multiplies each
// coordinate (x, y, i, o) by its pitch.
// The whole expansion is parenthesized so the macro keeps its value when it is
// used as an operand of a larger expression.
#define GET_FILTER_INDEX(prefix, o, i, y, x)    \
    (CAT(prefix, _OFFSET) +                     \
     (x) * CAT(prefix, _X_PITCH) +              \
     (y) * CAT(prefix, _Y_PITCH) +              \
     (i) * CAT(prefix, _IFM_PITCH) +            \
     (o) * CAT(prefix, _OFM_PITCH))
// Same pitch-based index as a plain filter index, but every coordinate is
// first reduced modulo the matching extent (<prefix>_SIZE_X, _SIZE_Y,
// _IFM_NUM, _OFM_NUM), so an out-of-range coordinate wraps around.
// Each argument is parenthesized before the `%` so that an expression argument
// such as `x + 1` is reduced as a whole; the full expansion is parenthesized
// as well.
#define GET_FILTER_INDEX_SAFE(prefix, o, i, y, x)                   \
    (CAT(prefix, _OFFSET) +                                         \
     ((x) % CAT(prefix, _SIZE_X))  * CAT(prefix, _X_PITCH) +        \
     ((y) % CAT(prefix, _SIZE_Y))  * CAT(prefix, _Y_PITCH) +        \
     ((i) % CAT(prefix, _IFM_NUM)) * CAT(prefix, _IFM_PITCH) +      \
     ((o) % CAT(prefix, _OFM_NUM)) * CAT(prefix, _OFM_PITCH))
// Filter index macro that splits the output coordinate by `sub_group_size`:
// the visible text adds <prefix>_OFFSET, (o) % (sub_group_size), the x/y/i
// pitch terms, and ((o) / (sub_group_size)) * <prefix>_OFM_PITCH.
// NOTE(review): the embedded listing numbers jump 158 -> 160 and stop at 163,
// and the last visible line still ends with a continuation backslash, so part
// of this macro appears to be missing from this copy. Restore the missing
// lines from the upstream file before relying on the expansion.
156 #define GET_FILTER_OS_IYX_OSV8_INDEX(prefix, o, i, y, x, sub_group_size) \
157 CAT(prefix, _OFFSET) + \
158 ((o) % (sub_group_size)) + \
160 (x)*CAT(prefix, _X_PITCH) + \
161 (y)*CAT(prefix, _Y_PITCH) + \
162 (i)*CAT(prefix, _IFM_PITCH) + \
163 ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \
// (end of the visible macro text; this comment line absorbs the dangling continuation above)
// Variant of the sub-group-sliced filter index that mirrors the spatial
// coordinates: x is replaced by <prefix>_SIZE_X - x - 1 and y by
// <prefix>_SIZE_Y - y - 1 before the pitch multiplication.
// NOTE(review): the embedded listing numbers jump 168 -> 170 and stop at 173,
// and the last visible line still ends with a continuation backslash, so part
// of this macro appears to be missing from this copy. Restore the missing
// lines from the upstream file before relying on the expansion.
// NOTE(review): `x` and `y` are used unparenthesized in `- x - 1` / `- y - 1`;
// an expression argument such as `a + b` would be mirrored incorrectly.
166 #define GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(prefix, o, i, y, x, sub_group_size) \
167 CAT(prefix, _OFFSET) + \
168 ((o) % (sub_group_size)) + \
170 (CAT(prefix, _SIZE_X ) - x - 1)*CAT(prefix, _X_PITCH) + \
171 (CAT(prefix, _SIZE_Y ) - y - 1)*CAT(prefix, _Y_PITCH) + \
172 (i)*CAT(prefix, _IFM_PITCH) + \
173 ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \
// (end of the visible macro text; this comment line absorbs the dangling continuation above)
176 inline uint FUNC(get_i_yxs_os_yxsv2_osv_index)(uint o, uint i, uint y, uint x, uint x_size, uint i_pitch, uint y_pitch, uint x_pitch, uint offset, uint sub_group_size)
178 const uint aligned_ofm_line = x_pitch;
179 const uint ifm_height_pitch = (i_pitch/aligned_ofm_line);
180 const uint dst_height = i*ifm_height_pitch + y*x_size + x;
181 const uint base_filter_index = y*x_size + x;
183 const uint aligned_height = dst_height & 0xfffffffe;
184 const uint base_filter_odd = (base_filter_index & 0x1);
186 uint slice_id = o / sub_group_size;
187 uint id_in_slice = o % sub_group_size;
188 uint slice_pitch = 2*sub_group_size;
189 uint offset_in_slice = (int)(sub_group_size*base_filter_odd);
191 const uint in_line = (slice_pitch*slice_id + offset_in_slice + id_in_slice);
192 const size_t idx = offset + aligned_height*aligned_ofm_line + in_line;
// Calls get_i_yxs_os_yxsv2_osv_index for the filter selected by `prefix`,
// supplying <prefix>_SIZE_X, _IFM_PITCH, _Y_PITCH, _X_PITCH and _OFFSET in the
// callee's parameter order, followed by `sub_group_size`.
// NOTE(review): the damaged listing ended after the _OFFSET argument with the
// call left open; the final `sub_group_size` argument is restored to match the
// callee's tenth parameter.
#define GET_FILTER_I_YXS_OS_YXSV2_OSV_INDEX(prefix, o, i, y, x, sub_group_size) \
    FUNC_CALL(get_i_yxs_os_yxsv2_osv_index)(                                    \
        (o), (i), (y), (x),                                                     \
        CAT(prefix, _SIZE_X),                                                   \
        CAT(prefix, _IFM_PITCH),                                                \
        CAT(prefix, _Y_PITCH),                                                  \
        CAT(prefix, _X_PITCH),                                                  \
        CAT(prefix, _OFFSET),                                                   \
        (sub_group_size))
// Index helper for a filter layout where two consecutive x positions share one
// output line: the destination row is rounded down to an even row and an odd x
// selects the second half of a 2 * sub_group_size slice. When x is the last
// column (x == x_size - 1) and x is even, the slice parameters are recomputed
// with a fixed slice width of 32 instead of sub_group_size.
// NOTE(review): the body has no braces and no return statement, and the
// embedded listing numbers skip 207, 224, 229-231 and 234-237. The tail of the
// `if` body (229-231) is not visible, so it cannot be confirmed from this copy
// which statements belong to the branch — restore from the upstream file.
206 inline uint FUNC(get_iy_xs_os_xsv2_osv_index)(uint o, uint i, uint y, uint x, uint x_size, uint i_pitch, uint y_pitch, uint x_pitch, uint offset, uint sub_group_size)
208 const uint aligned_ofm_line = x_pitch;
209 const uint ifm_height_pitch = (i_pitch/aligned_ofm_line);
// Number of x_pitch-sized lines per y step.
210 const uint aligned_x_line = y_pitch / x_pitch;
211 const uint dst_height = i*ifm_height_pitch + y*aligned_x_line + x;
212 const uint base_filter_index = x;
// Clear the lowest bit: rows are addressed in pairs.
214 const uint aligned_height = dst_height & 0xfffffffe;
215 const uint base_filter_odd = (base_filter_index & 0x1);
217 uint slice_id = o / sub_group_size;
218 uint id_in_slice = o % sub_group_size;
219 uint slice_pitch = 2*sub_group_size;
220 uint offset_in_slice = (int)(sub_group_size*base_filter_odd);
// Last column with an even x: switch to 32-wide slices.
222 const bool last_line_in_base_filter = (x == (x_size - 1));
223 if (last_line_in_base_filter && base_filter_odd == 0)
225 const uint element_in_slice = 32;
226 slice_id = o / element_in_slice;
227 id_in_slice = o % element_in_slice;
228 slice_pitch = 2*element_in_slice;
// NOTE(review): listing lines 229-231 are absent here (end of the branch).
232 const uint in_line = (slice_pitch*slice_id + offset_in_slice + id_in_slice);
233 const size_t idx = offset + aligned_height*aligned_ofm_line + in_line;
// Calls get_iy_xs_os_xsv2_osv_index for the filter selected by `prefix`,
// supplying <prefix>_SIZE_X, _IFM_PITCH, _Y_PITCH, _X_PITCH and _OFFSET in the
// callee's parameter order, followed by `sub_group_size`.
// NOTE(review): the damaged listing ended after the _OFFSET argument with the
// call left open; the final `sub_group_size` argument is restored to match the
// callee's tenth parameter.
#define GET_FILTER_IY_XS_OS_XSV2_OSV_INDEX(prefix, o, i, y, x, sub_group_size)  \
    FUNC_CALL(get_iy_xs_os_xsv2_osv_index)(                                     \
        (o), (i), (y), (x),                                                     \
        CAT(prefix, _SIZE_X),                                                   \
        CAT(prefix, _IFM_PITCH),                                                \
        CAT(prefix, _Y_PITCH),                                                  \
        CAT(prefix, _X_PITCH),                                                  \
        CAT(prefix, _OFFSET),                                                   \
        (sub_group_size))
247 inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
249 const uint f_32_aligned = ((size_ifm + 31)/32) * 32;
250 const uint isv2_idx = i % 4;
251 const uint osv_idx = o % 8;
252 const uint isv1_idx = (i / 4) % 8;
253 const uint is_idx = i / 32;
254 const uint os_idx = o / 8;
256 size_t idx = offset + isv2_idx + 4 * (osv_idx + 8 * isv1_idx);
257 idx += x * 4 * 8 * 8;
258 idx += y * size_x * 4 * 8 * 8;
259 idx += is_idx * size_y * size_x * 4 * 8 * 8;
260 idx += os_idx * (f_32_aligned/32) * size_y * size_x * 4 * 8 * 8;
// Calls get_os_is_yx_isa8_osv8_isv4_index for the filter selected by `prefix`,
// supplying <prefix>_SIZE_X, _SIZE_Y, _IFM_NUM, _OFM_NUM and _OFFSET in the
// callee's parameter order.
#define GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4(prefix, o, i, y, x)  \
    FUNC_CALL(get_os_is_yx_isa8_osv8_isv4_index)(               \
        (o), (i), (y), (x),                                     \
        CAT(prefix, _SIZE_X),                                   \
        CAT(prefix, _SIZE_Y),                                   \
        CAT(prefix, _IFM_NUM),                                  \
        CAT(prefix, _OFM_NUM),                                  \
        CAT(prefix, _OFFSET))
273 inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
275 const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32;
277 const uint f_32_aligned = ((size_ifm + 31)/32) * 32;
278 const uint isv2_idx = i % 4;
279 const uint osv_idx = o_swizzled % 8;
280 const uint isv1_idx = (i / 4) % 8;
281 const uint is_idx = i / 32;
282 const uint os_idx = o_swizzled / 8;
284 size_t idx = offset + isv2_idx + 4 * (osv_idx + 8 * isv1_idx);
285 idx += x * 4 * 8 * 8;
286 idx += y * size_x * 4 * 8 * 8;
287 idx += is_idx * size_y * size_x * 4 * 8 * 8;
288 idx += os_idx * (f_32_aligned/32) * size_y * size_x * 4 * 8 * 8;
// Calls get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index for the filter
// selected by `prefix`, supplying <prefix>_SIZE_X, _SIZE_Y, _IFM_NUM, _OFM_NUM
// and _OFFSET in the callee's parameter order.
#define GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(prefix, o, i, y, x)  \
    FUNC_CALL(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(                     \
        (o), (i), (y), (x),                                                         \
        CAT(prefix, _SIZE_X),                                                       \
        CAT(prefix, _SIZE_Y),                                                       \
        CAT(prefix, _IFM_NUM),                                                      \
        CAT(prefix, _OFM_NUM),                                                      \
        CAT(prefix, _OFFSET))
302 inline uint FUNC(get_is_o_yx_isv32_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size)
304 const uint i_aligned_to_32 = ((i_size + 31) / 32) * 32;
305 const uint i_val = i % 32;
306 const uint i_slice = i / 32;
307 const size_t idx = i_val + 32* (x + x_size * (y + y_size * (o + o_size * i_slice) ) );
// Calls get_is_o_yx_isv32_index for the filter selected by `prefix`, supplying
// <prefix>_IFM_NUM, _OFM_NUM, _SIZE_X and _SIZE_Y for the callee's i_size,
// o_size, x_size and y_size.
#define GET_FILTER_IS_O_YX_ISV32(prefix, o, i, y, x)    \
    FUNC_CALL(get_is_o_yx_isv32_index)(                 \
        (o), (i), (y), (x),                             \
        CAT(prefix, _IFM_NUM),                          \
        CAT(prefix, _OFM_NUM),                          \
        CAT(prefix, _SIZE_X),                           \
        CAT(prefix, _SIZE_Y))
318 inline uint FUNC(get_is_o32_yx_isv32_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size)
320 const uint o_aligned_to_32 = ((o_size + 31) / 32) * 32;
321 const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32;
322 const uint i_aligned_to_32 = ((i_size + 31) / 32) * 32;
323 const uint i_val = i % 32;
324 const uint i_slice = i / 32;
325 const size_t idx = i_val + 32* (x + x_size * (y + y_size * (o_swizzled + o_aligned_to_32 * i_slice) ) );
// Calls get_is_o32_yx_isv32_swizzled_by_4_index for the filter selected by
// `prefix`, supplying <prefix>_IFM_NUM, _OFM_NUM, _SIZE_X and _SIZE_Y for the
// callee's i_size, o_size, x_size and y_size.
#define GET_FILTER_IS_O32_YX_ISV32_SWIZZLED_BY_4(prefix, o, i, y, x)    \
    FUNC_CALL(get_is_o32_yx_isv32_swizzled_by_4_index)(                 \
        (o), (i), (y), (x),                                             \
        CAT(prefix, _IFM_NUM),                                          \
        CAT(prefix, _OFM_NUM),                                          \
        CAT(prefix, _SIZE_X),                                           \
        CAT(prefix, _SIZE_Y))
336 inline uint FUNC(get_os_is_y_x8_osv8_isv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size)
338 const uint i_aligned_to_4 = ((i_size + 3) / 4) * 4;
339 const uint o_aligned_to_8 = ((o_size + 7) / 8) * 8;
340 const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
341 const uint i_val = i % 4;
342 const uint i_slice = i / 4;
343 const uint o_val = o % 8;
344 const uint o_slice = o / 8;
345 const size_t idx = i_val + 4 * (o_val + 8 * ( x + x_aligned_to_8 * (y + y_size * (i_slice + (i_aligned_to_4/4) * (o_slice)))));
// Calls get_os_is_y_x8_osv8_isv4_index for the filter selected by `prefix`,
// supplying <prefix>_IFM_NUM, _OFM_NUM, _SIZE_X and _SIZE_Y for the callee's
// i_size, o_size, x_size and y_size.
#define GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(prefix, o, i, y, x)     \
    FUNC_CALL(get_os_is_y_x8_osv8_isv4_index)(                  \
        (o), (i), (y), (x),                                     \
        CAT(prefix, _IFM_NUM),                                  \
        CAT(prefix, _OFM_NUM),                                  \
        CAT(prefix, _SIZE_X),                                   \
        CAT(prefix, _SIZE_Y))
// Calls get_b_fs_yx_fsv4 for the tensor selected by `prefix`, supplying
// <prefix>_FEATURE_NUM followed by the y and then x padding/size triples
// (_PAD_BEFORE_SIZE_*, _SIZE_*, _PAD_AFTER_SIZE_*).
// NOTE(review): the damaged listing had no line carrying the coordinates
// (listing line 358 absent); `o, i, y, x` is restored as the leading arguments
// to match the callee's first four parameters.
#define GET_DATA_B_FS_YX_FSV4_INDEX(prefix, o, i, y, x)                                         \
    FUNC_CALL(get_b_fs_yx_fsv4)(                                                                \
        (o), (i), (y), (x),                                                                     \
        CAT(prefix, _FEATURE_NUM),                                                              \
        CAT(prefix, _PAD_BEFORE_SIZE_Y), CAT(prefix, _SIZE_Y), CAT(prefix, _PAD_AFTER_SIZE_Y),  \
        CAT(prefix, _PAD_BEFORE_SIZE_X), CAT(prefix, _SIZE_X), CAT(prefix, _PAD_AFTER_SIZE_X))
// Index helper that splits the `i` coordinate into a tile number (i / tile)
// and a position within the tile, rounds the feature count up to a multiple
// of 4, and sums terms scaled by the padded spatial extents
// (pad_before + size + pad_after).
// NOTE(review): `tile` and `feature_num` are used below but are not declared
// in any visible line, the body has no braces and no return statement, and the
// index expression is not terminated. The embedded listing numbers skip 364,
// 367-368 and 382-387, so a parameter, a constant and the tail of the
// expression appear to be missing from this copy — restore them from the
// upstream file rather than inferring them.
363 inline uint FUNC(get_b_fs_yx_fsv4)(uint o, uint i, uint y, uint x,
365 uint pad_before_size_y, uint size_y, uint pad_after_size_y,
366 uint pad_before_size_x, uint size_x, uint pad_after_size_x)
// id is the remainder of i within its tile (i - id_tile * tile).
369 uint id_tile = i / tile;
370 uint id = i - id_tile * tile;
372 const uint feature_num_aligned4 = ((feature_num + 3) / 4) * 4;
// Terms, in order: `o` stride over all feature tiles, feature-tile stride over
// the padded plane, leading y padding, y row, leading x padding.
374 uint idx = o * (feature_num_aligned4 / tile) *
375 (pad_before_size_y + size_y + pad_after_size_y) *
376 (pad_before_size_x + size_x + pad_after_size_x) * tile
377 + id_tile * (pad_before_size_y + size_y + pad_after_size_y) *
378 (pad_before_size_x + size_x + pad_after_size_x) * tile
379 + pad_before_size_y * (pad_before_size_x + size_x + pad_after_size_x) * tile
380 + y * (pad_before_size_x + size_x + pad_after_size_x) * tile
381 + pad_before_size_x * tile
// NOTE(review): the expression above is cut off here; `x` and `id` are not
// used by any visible term.
// Calls get_os_is_yx_osv16_isv4 for the filter selected by `prefix`, supplying
// <prefix>_IFM_PITCH, <prefix>_OFM_PITCH and <prefix>_SIZE_X after the
// coordinates.
// NOTE(review): the damaged listing had no line carrying the coordinates
// (listing line 390 absent); `o, i, y, x` is restored as the leading arguments
// to match the callee's first four parameters.
#define GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(prefix, o, i, y, x)    \
    FUNC_CALL(get_os_is_yx_osv16_isv4)(                             \
        (o), (i), (y), (x),                                         \
        CAT(prefix, _IFM_PITCH),                                    \
        CAT(prefix, _OFM_PITCH),                                    \
        CAT(prefix, _SIZE_X))
// Index helper that splits `o` into a tile number (o / otd) and a remainder,
// and `i` into a tile number (i / tile) and a remainder, then sums terms
// scaled by otd * tile.
// NOTE(review): `otd`, `tile`, `i_size`, `o_size` and `x_size` are used below
// but are not declared in any visible line; the parameter list is left open,
// the body has no braces and no return statement, and the index expression is
// not terminated. The embedded listing numbers skip 396-400, 403-404 and
// 411-417, so the remaining parameters, the constants and the tail of the
// expression appear to be missing from this copy — restore them from the
// upstream file rather than inferring them.
395 inline uint FUNC(get_os_is_yx_osv16_isv4)(uint o, uint i, uint y, uint x,
// od is the remainder of o within its tile (o - out_depth_tile * otd).
401 uint out_depth_tile = o / otd;
402 uint od = o - out_depth_tile * otd;
// id is the remainder of i within its tile (i - id_tile * tile).
405 uint id_tile = i / tile;
406 uint id = i - id_tile * tile;
408 uint idx = out_depth_tile * (o_size / tile) * otd * tile
409 + id_tile * i_size * otd * tile
410 + y * x_size * otd * tile
// NOTE(review): the expression above is cut off here; `x`, `od` and `id` are
// not used by any visible term.
// Declares the constant sampler `imageSampler` with the flags
// CLK_NORMALIZED_COORDS_FALSE, CLK_ADDRESS_CLAMP and CLK_FILTER_NEAREST.
// The expansion has no trailing semicolon; the user of the macro supplies it.
#define DECLARE_SAMPLER                                                     \
    const sampler_t imageSampler =                                          \
        CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST
// Image access wrappers. IMAGE_READ reads through the sampler named
// `imageSampler` (the name DECLARE_SAMPLER declares); IMAGE_WRITE needs no
// sampler. Two alternative pairs are visible: one built on
// read_imageh / write_imageh and one on read_imagef / write_imagef.
// NOTE(review): as listed, the second pair redefines the first with a
// different expansion. The embedded listing numbers skip 420, 423 and 426, so
// the preprocessor conditional that selects between the two pairs appears to
// be missing from this copy — restore it from the upstream file.
421 #define IMAGE_READ(image, coord) read_imageh((image), imageSampler, (coord))
422 #define IMAGE_WRITE(image, coord, val) write_imageh((image), (coord), (val))
424 #define IMAGE_READ(image, coord) read_imagef((image), imageSampler, (coord))
425 #define IMAGE_WRITE(image, coord, val) write_imagef((image), (coord), (val))