/* Body of gemm_transpose1x4_f32 (declared at the end of this page): each work-item copies one float4. */
uint x = get_global_id(0);
uint y = get_global_id(1);

/* Compute the destination address: x and y are swapped, producing the transposition. */
uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));

float4 b0 = vload4(0, (__global float *)src.ptr);

vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
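For orientation, here is a minimal sketch of the scaffolding that plausibly surrounds this body. The signature comes from the declaration list at the end of this page; the Image setup line is an assumption inferred from the body's use of src.ptr. The f16 and u8 variants that follow differ only in element type and vector width.

/* Sketch only: CONVERT_TO_IMAGE_STRUCT is assumed to build an Image whose
 * ptr field points at this work-item's block, per the declarations below. */
__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
                                    IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    /* ... body as shown above ... */
}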
uint x = get_global_id(0);
uint y = get_global_id(1);

uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));

half8 b0 = vload8(0, (__global half *)src.ptr);

vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
uint x = get_global_id(0);
uint y = get_global_id(1);

uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));

uchar16 b0 = vload16(0, (__global uchar *)src.ptr);

vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
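All three transposition kernels (gemm_transpose1x4_f32, gemm_transpose1x8_f16 and gemm_transpose1x16_u8, whose bodies appear above) move one 16-byte block per work-item: 4 floats, 8 halves or 16 uchars. That is why the destination address scales y by 16 in every variant. A worked instance of the addressing, with made-up values:

/* Illustration only: for work-item (x = 2, y = 3), dst_stride_y = 64 and
 * dst_offset_first_element_in_bytes = 0:
 *   dst_addr_in_bytes = 3 * 16 + 2 * 64 = 176
 * so the block lands in destination row 2 (2 * 64 bytes) at byte offset 48
 * (3 * 16): the x and y roles swap, which is the transposition. */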
/* Body of gemm_interleave4x4_32bit: transpose a 4x4 float block and interleave the values. */
float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));

float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
vstore4(val0, 0, ((__global float *)dst.ptr) + 0);

val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
vstore4(val0, 0, ((__global float *)dst.ptr) + 4);

val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
vstore4(val0, 0, ((__global float *)dst.ptr) + 8);

val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
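The effect of the four loads and stores is easiest to see on a labeled 4x4 block. The annotation below is my own illustration, with A(r,c) written as Arc for the source element at row r, column c. The 16-bit and 8-bit variants that follow apply the same gather to wider 4x8 and 4x16 blocks.

/* Source block (one float4 per row):    Interleaved destination row:
 *   a0 = A00 A01 A02 A03
 *   a1 = A10 A11 A12 A13     ->     A00 A10 A20 A30  A01 A11 A21 A31
 *   a2 = A20 A21 A22 A23            A02 A12 A22 A32  A03 A13 A23 A33
 *   a3 = A30 A31 A32 A33
 * Each vstore4 writes one transposed column of the block, 4 floats apart. */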
half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));

half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
vstore8(val0, 0, ((__global half *)dst.ptr) + 0);

val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
vstore8(val0, 0, ((__global half *)dst.ptr) + 8);

val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
vstore8(val0, 0, ((__global half *)dst.ptr) + 16);

val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
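In the 16-bit variant just above (the body of gemm_interleave4x4_16bit), each half8 store packs two transposed columns of the 4x8 block, so four stores cover all eight columns; the 8-bit body below extends the same pattern to four uchar4 columns per store. Spelling out the first store (my own annotation):

/* val0 = (A00 A10 A20 A30, A01 A11 A21 A31): transposed column 0 of the
 * 4x8 block followed by transposed column 1, written as a single half8. */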
/* Body of gemm_interleave4x4_8bit: transpose a 4x16 uchar block and interleave the values. */
uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));

uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
                         (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);

val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
                 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);

val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
                 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);

val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
                 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
/* Body of gemm_accumulate_biases_f32: add the bias vector to one row, in place. */
float4 accum_value  = vload4(0, (__global float *)accum.ptr);
float4 biases_value = vload4(0, (__global float *)biases.ptr);
accum_value = biases_value + accum_value;

/* Store the result back into the accumulate buffer */
vstore4(accum_value, 0, (__global float *)accum.ptr);
half8 accum_value  = vload8(0, (__global half *)accum.ptr);
half8 biases_value = vload8(0, (__global half *)biases.ptr);
accum_value = biases_value + accum_value;

vstore8(accum_value, 0, (__global half *)accum.ptr);
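The two fragments above are the bodies of gemm_accumulate_biases_f32 and gemm_accumulate_biases_f16. As elsewhere on this page, only the bodies are listed; below is a minimal sketch of the surrounding boilerplate for the f32 variant, assuming accum maps to an Image and biases to a Vector via the helpers in the declaration list at the end of the page:

/* Sketch only: the struct setup lines are assumptions inferred from the
 * use of accum.ptr and biases.ptr in the bodies above. */
__kernel void gemm_accumulate_biases_f32(IMAGE_DECLARATION(accum),
                                         VECTOR_DECLARATION(biases))
{
    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
    /* ... body as shown above ... */
}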
#if(defined WIDTH_MATRIX_B)

/* u8 matrix-multiply body (guarded by WIDTH_MATRIX_B); a_offset, b_offset,
 * c_offset, c_mult_int and shift are quantization parameters, presumably
 * kernel arguments. src_addr.s0 walks matrix A, src_addr.s1 walks matrix B. */
int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), (src1_stride_y));

/* Add the offset of the first element of each matrix */
src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

/* Accumulators (assumed zero-initialized; declarations not in the extracted lines) */
int16 c00 = 0, c10 = 0, c20 = 0, c30 = 0;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
{
    /* Load 8 values of A (with a_offset applied) and 32 values of B */
    int8  a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
    int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

    c00 += (int16)a0.s0 * b0;
    c10 += (int16)a0.s1 * b0;
    c20 += (int16)a0.s2 * b0;
    c30 += (int16)a0.s3 * b0;

    int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

    c00 += (int16)a0.s4 * b1;
    c10 += (int16)a0.s5 * b1;
    c20 += (int16)a0.s6 * b1;
    c30 += (int16)a0.s7 * b1;
}

/* Leftover loop for widths that are not a multiple of 8 */
for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
{
    int4  a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
    int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

    c00 += (int16)a0.s0 * b0;
    c10 += (int16)a0.s1 * b0;
    c20 += (int16)a0.s2 * b0;
    c30 += (int16)a0.s3 * b0;
}

/* Requantize: offset, multiply and shift, then saturate to uchar on store */
c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;

vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
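The four lines before the stores requantize the widened int accumulators back to uchar: add c_offset, multiply by c_mult_int and shift right by shift, with convert_uchar16_sat saturating on store. A worked instance with made-up parameter values:

/* Example values (not from the source): c_offset = 2, c_mult_int = 3, shift = 4.
 * An accumulator lane c = 50 becomes ((2 + 50) * 3) >> 4 = 156 >> 4 = 9,
 * which convert_uchar16_sat then clamps into [0, 255] on store. */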
#if(defined WIDTH_MATRIX_B && defined ALPHA)

/* f32 matrix-multiply body. */
int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), (src1_stride_y));

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

/* Convert the byte offsets to float element indices (sizeof(float) == 4) */
src_addr = src_addr >> 2;

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

/* Accumulators (assumed zero-initialized; declarations not in the extracted lines) */
float4 c00 = 0.0f, c10 = 0.0f, c20 = 0.0f, c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
{
    float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
    float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

    c00 += (float4)a0.s0 * b0;
    c10 += (float4)a0.s1 * b0;
    c20 += (float4)a0.s2 * b0;
    c30 += (float4)a0.s3 * b0;

    a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
    b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);

    c00 += (float4)a0.s0 * b0;
    c10 += (float4)a0.s1 * b0;
    c20 += (float4)a0.s2 * b0;
    c30 += (float4)a0.s3 * b0;
}

/* Leftover loop for widths that are not a multiple of 8 */
for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
{
    float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
    float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

    c00 += (float4)a0.s0 * b0;
    c10 += (float4)a0.s1 * b0;
    c20 += (float4)a0.s2 * b0;
    c30 += (float4)a0.s3 * b0;
}

/* Scale by ALPHA */
c00 = c00 * (float4)ALPHA;
c10 = c10 * (float4)ALPHA;
c20 = c20 * (float4)ALPHA;
c30 = c30 * (float4)ALPHA;

vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
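The src_addr = src_addr >> 2 step above turns both byte offsets into float element indices with a single vector shift, since sizeof(float) == 4; the f16 body below does the same with >> 1. A worked instance:

/* A byte offset of 48 into matrix A becomes element index 48 >> 2 = 12,
 * so ((__global float *)src0_ptr) + 12 addresses the same location. */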
/* f16 matrix-multiply body. */
int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), (src1_stride_y));

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

/* Convert the byte offsets to half element indices (sizeof(half) == 2) */
src_addr = src_addr >> 1;

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

/* Accumulators (assumed zero-initialized; declarations not in the extracted lines) */
half8 c00 = (half8)0.0f, c10 = (half8)0.0f, c20 = (half8)0.0f, c30 = (half8)0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
{
    half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
    half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

    c00 += (half8)a0.s0 * b0;
    c10 += (half8)a0.s1 * b0;
    c20 += (half8)a0.s2 * b0;
    c30 += (half8)a0.s3 * b0;

    a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
    b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);

    c00 += (half8)a0.s0 * b0;
    c10 += (half8)a0.s1 * b0;
    c20 += (half8)a0.s2 * b0;
    c30 += (half8)a0.s3 * b0;
}

/* Leftover loop for widths that are not a multiple of 8 */
for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
{
    half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
    half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

    c00 += (half8)a0.s0 * b0;
    c10 += (half8)a0.s1 * b0;
    c20 += (half8)a0.s2 * b0;
    c30 += (half8)a0.s3 * b0;
}

/* Scale by ALPHA */
c00 = c00 * (half8)ALPHA;
c10 = c10 * (half8)ALPHA;
c20 = c20 * (half8)ALPHA;
c30 = c30 * (half8)ALPHA;

vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
#if(defined WIDTH_VECTOR_A)

/* f32 vector-matrix multiply body (guarded by WIDTH_VECTOR_A): four output
 * columns per work-item. src_addr.s0 walks vector A, src_addr.s1 walks
 * matrix B, both in bytes. */
int idx = get_global_id(0) * 4;

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
src_addr.s1 += idx * sizeof(float);

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

/* Accumulator (assumed zero-initialized; declaration not in the extracted lines) */
float4 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
{
    float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
    float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
    float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

    acc += b0 * (float4)a0.s0;
    acc += b1 * (float4)a0.s1;
}

/* Leftover loop for odd vector widths */
for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
{
    float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
    float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

    acc += b0 * (float4)a0;
}

/* Scale by ALPHA */
acc = acc * (float4)ALPHA;

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
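Written out as a formula (my own annotation), each work-item of this vector-matrix body owns four consecutive output columns starting at idx and accumulates a dot product down the rows of matrix B; the main loop is simply this sum unrolled by two rows per iteration:

/* acc.sj = ALPHA * sum over k of A[k] * B[k][idx + j],  for j = 0..3,
 * where k runs over the WIDTH_VECTOR_A elements of the input vector. */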
/* f16 variant of the vector-matrix multiply: eight output columns per work-item. */
int idx = get_global_id(0) * 8;

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
src_addr.s1 += idx * sizeof(half);

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));

/* Accumulator (assumed zero-initialized; declaration not in the extracted lines) */
half8 acc = (half8)0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
{
    half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
    half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
    half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
    half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
    half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

    acc += b0 * (half8)a0.s0;
    acc += b1 * (half8)a0.s1;
    acc += b2 * (half8)a0.s2;
    acc += b3 * (half8)a0.s3;
}

/* Leftover loop for widths that are not a multiple of 4 */
for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
{
    half  a0 = *((__global half *)(src0_ptr + src_addr.s0));
    half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

    acc += b0 * (half8)a0;
}

/* Scale by ALPHA */
acc = acc * (half8)ALPHA;

vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
/* f32 matrix-addition body: dst already holds the alpha-scaled product; BETA * C is added in place. */
float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

float4 c = vload4(0, (__global float *)src.ptr);

float4 out = alpha_ab + (float4)BETA * c;

vstore4(out, 0, (__global float *)dst.ptr);

/* f16 variant of the same matrix-addition step. */
half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

half8 c = vload8(0, (__global half *)src.ptr);

half8 out = alpha_ab + (half8)BETA * c;

vstore8(out, 0, (__global half *)dst.ptr);
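These two matrix-addition bodies complete the GEMM: the alpha_ab value loaded from dst is the ALPHA-scaled product computed by the matrix-multiply bodies above, and the BETA-scaled C matrix is added to it in place. Per element:

/* dst = ALPHA * (A * B) + BETA * C
 * (the ALPHA factor was applied earlier; only the BETA * C term is added here). */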
Referenced data structures, helper macros and kernels:

Structure to hold Vector information.
Structure to hold Image information.
__global uchar * ptr
Pointer to the starting position of the buffer.
#define VECTOR_DECLARATION(name)
#define IMAGE_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT(name)
#define CONVERT_TO_IMAGE_STRUCT(name)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
__kernel void gemm_transpose1x4_f32(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel computes the "vector" 1x4 transposition of the input matrix.
__kernel void gemm_transpose1x8_f16(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel computes the "vector" 1x8 transposition of the input matrix.
__kernel void gemm_transpose1x16_u8(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel computes the "vector" 1x16 transposition of the input matrix.
__kernel void gemm_interleave4x4_32bit(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel reshapes the input matrix, transposing each 4x4 block and interleaving the values...
__kernel void gemm_interleave4x4_16bit(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel reshapes the input matrix, transposing each 4x4 block and interleaving the values...
__kernel void gemm_interleave4x4_8bit(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This OpenCL kernel reshapes the input matrix, transposing each 4x4 block and interleaving the values...
__kernel void gemm_accumulate_biases_f32(__global uchar *accum_ptr, uint accum_stride_x, uint accum_step_x, uint accum_stride_y, uint accum_step_y, uint accum_offset_first_element_in_bytes, __global uchar *biases_ptr, uint biases_stride_x, uint biases_step_x, uint biases_offset_first_element_in_bytes)
This kernel accumulates each row with the biases vector.
__kernel void gemm_accumulate_biases_f16(__global uchar *accum_ptr, uint accum_stride_x, uint accum_step_x, uint accum_stride_y, uint accum_step_y, uint accum_offset_first_element_in_bytes, __global uchar *biases_ptr, uint biases_stride_x, uint biases_step_x, uint biases_offset_first_element_in_bytes)
This kernel accumulates each row with the biases vector.