26 #if defined(FIXED_POINT_POSITION) 28 #endif // FIXED_POINT_POSITION 30 #if defined(DATA_TYPE) && defined(ELEMENT_SIZE) 31 #if !defined(FIXED_POINT_POSITION) 34 #define COND_DATA_TYPE char 35 #elif ELEMENT_SIZE == 2 36 #define COND_DATA_TYPE short 37 #elif ELEMENT_SIZE == 4 38 #define COND_DATA_TYPE int 40 #error "Element size not support" 41 #endif // ELEMENT_SIZE 43 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) 70 __kernel
void im2col1x1_stridex1_dchw(
76 const uint xc = get_global_id(0) * 4;
77 const uint yc = get_global_id(1);
78 const uint ch = get_global_id(2) % KERNEL_DEPTH;
79 const uint batch = get_global_id(2) / KERNEL_DEPTH;
83 uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
88 xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
92 const uint yi = yc * STRIDE_Y;
96 const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH;
99 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
101 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
104 data = vload4(0, (__global DATA_TYPE *)input_ptr);
107 data = select((
VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
109 *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
110 *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
111 *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
112 *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
115 if(ch == (KERNEL_DEPTH - 1))
117 *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
118 *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
119 *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
120 *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
124 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) 126 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) 155 __kernel
void im2col3x3_dchw(
161 const int xc = get_global_id(0);
162 const int yc = get_global_id(1);
163 const int ch = get_global_id(2) % KERNEL_DEPTH;
164 const int batch = get_global_id(2) / KERNEL_DEPTH;
167 const int xi = xc * STRIDE_X - PAD_LEFT;
168 const int yi = yc * STRIDE_Y - PAD_TOP;
171 const int xo = ch * 9;
172 const int yo = xc + yc * CONVOLVED_WIDTH;
175 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (
int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
177 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
180 row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
182 row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
184 row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
186 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 188 int3 x = (int3)xi + (int3)(0, 1, 2);
189 int3 y = (int3)yi + (int3)(0, 1, 2);
192 cond0 =
CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)),
VEC_DATA_TYPE(COND_DATA_TYPE, 3));
194 cond1 =
CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)),
VEC_DATA_TYPE(COND_DATA_TYPE, 3));
196 cond2 =
CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)),
VEC_DATA_TYPE(COND_DATA_TYPE, 3));
198 row0 = select((
VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
199 row1 = select((
VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
200 row2 = select((
VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
201 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 203 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
204 *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
207 if(ch == (KERNEL_DEPTH - 1))
209 *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
242 __kernel
void im2col5x5_dchw(
248 const int xc = get_global_id(0);
249 const int yc = get_global_id(1);
250 const int ch = get_global_id(2) % KERNEL_DEPTH;
251 const int batch = get_global_id(2) / KERNEL_DEPTH;
254 const int xi = xc * STRIDE_X - PAD_LEFT;
255 const int yi = yc * STRIDE_Y - PAD_TOP;
258 const int xo = ch * 25;
259 const int yo = xc + yc * CONVOLVED_WIDTH;
261 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 263 int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
264 int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
270 x0_condition =
CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH),
VEC_DATA_TYPE(COND_DATA_TYPE, 4));
272 y0_condition =
CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT),
VEC_DATA_TYPE(COND_DATA_TYPE, 4));
273 COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
274 COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
275 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 278 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (
int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
280 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
284 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
286 row01 = *((__global DATA_TYPE *)input_ptr + 4);
288 input_ptr += src_stride_y;
291 row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
293 row11 = *((__global DATA_TYPE *)input_ptr + 4);
295 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 297 cond00 = x0_condition && (
VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
299 cond10 = x0_condition && (
VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
300 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
301 COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
304 row00 = select((
VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
305 row10 = select((
VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
306 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
307 row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
308 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 312 0, (__global DATA_TYPE *)output_ptr);
313 vstore2((
VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
315 input_ptr += src_stride_y;
316 output_ptr += 10 * dst_stride_x;
321 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
323 row01 = *((__global DATA_TYPE *)input_ptr + 4);
325 input_ptr += src_stride_y;
328 row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
330 row11 = *((__global DATA_TYPE *)input_ptr + 4);
332 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 334 cond00 = x0_condition && (
VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
336 cond10 = x0_condition && (
VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
337 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
338 COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
341 row00 = select((
VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
342 row10 = select((
VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
343 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
344 row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
345 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 349 0, (__global DATA_TYPE *)output_ptr);
350 vstore2((
VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
352 input_ptr += src_stride_y;
353 output_ptr += 10 * dst_stride_x;
358 row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
360 row01 = *((__global DATA_TYPE *)input_ptr + 4);
362 input_ptr += src_stride_y;
364 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 366 cond00 = x0_condition && (
VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
367 COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
370 row00 = select((
VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
371 row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
372 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 374 vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
375 *((__global DATA_TYPE *)output_ptr + 4) = row01;
377 output_ptr += 5 * dst_stride_x;
381 if(ch == (KERNEL_DEPTH - 1))
383 *((__global DATA_TYPE *)output_ptr) = 1.0f;
387 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) 389 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) 415 __kernel
void im2col11x11_padx0_pady0_dchw(
421 const int xc = get_global_id(0);
422 const int yc = get_global_id(1);
423 const int ch = get_global_id(2) % KERNEL_DEPTH;
424 const int batch = get_global_id(2) / KERNEL_DEPTH;
427 const int xi = xc * STRIDE_X;
428 const int yi = yc * STRIDE_Y;
431 const int xo = ch * 121;
432 const int yo = xc + yc * CONVOLVED_WIDTH;
435 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
437 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
440 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
442 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
444 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
445 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
447 input_ptr += src_stride_y;
448 output_ptr += 11 * src_stride_x;
453 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
455 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
457 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
458 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
460 input_ptr += src_stride_y;
461 output_ptr += 11 * src_stride_x;
466 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
468 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
470 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
471 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
473 input_ptr += src_stride_y;
474 output_ptr += 11 * src_stride_x;
479 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
481 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
483 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
484 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
486 input_ptr += src_stride_y;
487 output_ptr += 11 * src_stride_x;
492 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
494 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
496 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
497 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
499 input_ptr += src_stride_y;
500 output_ptr += 11 * src_stride_x;
505 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
507 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
509 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
510 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
512 input_ptr += src_stride_y;
513 output_ptr += 11 * src_stride_x;
518 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
520 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
522 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
523 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
525 input_ptr += src_stride_y;
526 output_ptr += 11 * src_stride_x;
531 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
533 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
535 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
536 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
538 input_ptr += src_stride_y;
539 output_ptr += 11 * src_stride_x;
544 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
546 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
548 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
549 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
551 input_ptr += src_stride_y;
552 output_ptr += 11 * src_stride_x;
557 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
559 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
561 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
562 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
564 input_ptr += src_stride_y;
565 output_ptr += 11 * src_stride_x;
570 row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
572 row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
574 vstore8((
VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
575 vstore3((
VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
577 output_ptr += 11 * src_stride_x;
581 if(ch == (KERNEL_DEPTH - 1))
583 *((__global DATA_TYPE *)output_ptr) = 1.0f;
587 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) 588 #endif // !defined(FIXED_POINT_POSITION) 590 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 616 __kernel
void im2col_generic_padx0_pady0_dchw(
622 const int xc = get_global_id(0);
623 const int yc = get_global_id(1);
624 const int ch = get_global_id(2) % KERNEL_DEPTH;
625 const int batch = get_global_id(2) / KERNEL_DEPTH;
628 const int xi = xc * STRIDE_X;
629 const int yi = yc * STRIDE_Y;
631 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
632 const int yo = xc + yc * CONVOLVED_WIDTH;
633 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
634 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
636 for(
int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
642 row =
VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
644 (row, 0, output_ptr);
649 #if WIDTH_MOD_VECTOR_SIZE == 1 650 *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
651 #elif WIDTH_MOD_VECTOR_SIZE > 1 653 row =
VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
654 VSTORE(WIDTH_MOD_VECTOR_SIZE)
655 (row, 0, output_ptr);
657 output_ptr += WIDTH_MOD_VECTOR_SIZE;
661 if(ch == (KERNEL_DEPTH - 1))
663 #ifdef FIXED_POINT_POSITION 664 *output_ptr = (
DATA_TYPE)(1 << FIXED_POINT_POSITION);
665 #else // FIXED_POINT_POSITION 667 #endif // FIXED_POINT_POSITION 671 #endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) 673 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) 703 __kernel
void im2col_generic_dchw(
709 const int xc = get_global_id(0);
710 const int yc = get_global_id(1);
711 const int ch = get_global_id(2) % KERNEL_DEPTH;
712 const int batch = get_global_id(2) / KERNEL_DEPTH;
715 const int xi = xc * STRIDE_X - PAD_LEFT;
716 const int yi = yc * STRIDE_Y - PAD_TOP;
719 const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
720 const int yo = xc + yc * CONVOLVED_WIDTH;
722 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
723 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
726 for(
int yk = 0; yk < KERNEL_HEIGHT; ++yk)
728 int y = yi + yk * DILATION_Y;
729 for(
int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
731 int x = xi + xk * DILATION_X;
732 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 733 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
734 #else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 735 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
737 *output_ptr = PAD_VALUE;
741 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
743 #endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 748 if(ch == (KERNEL_DEPTH - 1))
750 #ifdef FIXED_POINT_POSITION 751 *output_ptr = (
DATA_TYPE)(1 << FIXED_POINT_POSITION);
752 #else // FIXED_POINT_POSITION 754 #endif // FIXED_POINT_POSITION 758 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) 781 __kernel
void im2col_reduced_dchw(
784 uint width, uint height)
788 const uint image_size = width * height;
790 __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
792 *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)
src.ptr);
796 if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
798 tmp_out_ptr += dst_stride_x;
799 #ifdef FIXED_POINT_POSITION 800 *((__global DATA_TYPE *)tmp_out_ptr) = (
DATA_TYPE)(1 << FIXED_POINT_POSITION);
801 #else // FIXED_POINT_POSITION 802 *((__global DATA_TYPE *)tmp_out_ptr) = (
DATA_TYPE)1.0f;
803 #endif // FIXED_POINT_POSITION 807 #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
#define CONVERT_TO_TENSOR3D_STRUCT(name)
#define IMAGE_DECLARATION(name)
Structure to hold 3D tensor information.
#define VECTOR_DECLARATION(name)
#define TENSOR3D_DECLARATION(name)
#define VEC_DATA_TYPE(type, size)
convolution configure & src