28 #if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 33 #define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) 35 #define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) 37 #error "STRIDE_X larger than 2 is not supported" 40 #define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ 42 int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ 43 int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ 44 int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ 45 int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ 46 acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \ 47 acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \ 48 acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \ 49 acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \ 50 acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset); \ 53 #define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ 55 int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ 56 int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ 57 int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ 58 int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ 59 acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \ 60 acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \ 61 acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \ 62 acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \ 63 acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset); \ 66 #elif KERNEL_SIZE == 3 69 #define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) 71 #define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) 73 #error "STRIDE_X larger than 2 is not supported" 76 #define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ 78 int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ 79 int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ 80 int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ 81 acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset); \ 82 acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \ 83 acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \ 86 #define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ 88 int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ 89 int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ 90 int src1 = convert_int(*(src_row_ptr + 16)); \ 91 acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset); \ 92 acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \ 93 acc += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \ 96 #elif KERNEL_SIZE == 1 99 #define INPUT_PIXEL extract_input_stride3 101 #define INPUT_PIXEL extract_input_stride2 103 #define INPUT_PIXEL extract_input_stride1 106 #error "Only support strides 1, 2 and 3" 115 inline uchar8 extract_input_stride1(__global
const uchar *input_pixel)
117 return vload8(0, input_pixel);
126 inline uchar8 extract_input_stride2(__global
const uchar *input_pixel)
128 uchar16 temp = vload16(0, input_pixel);
129 return temp.s02468ace;
138 inline uchar8 extract_input_stride3(__global
const uchar *input_pixel)
140 uchar16 temp1 = vload16(0, input_pixel);
141 uchar16 temp2 = vload16(0, input_pixel + 12);
142 return (uchar8)(temp1.s0369, temp2.s0369);
146 #error "Only kernel sizes 1, 3 and 5 are supported" 190 __kernel
void direct_convolution_1x1_3x3_5x5_quantized(
197 unsigned int weights_stride_w,
201 int output_multiplier,
210 __global uchar *weights_addr = (__global uchar *)
tensor3D_offset(&weights, 0, 0, 0);
211 __global uchar *src_addr = (__global uchar *)
offset(&src, 0, 0);
213 const int kernel_index = get_global_id(2);
214 weights_addr += kernel_index * weights_stride_w;
216 for(
volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
219 CONVOLUTION1x5(pixels0, (__global uchar *)src_addr, (__global uchar *)weights_addr);
220 CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
221 CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
222 CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 3 * src_stride_y), (__global uchar *)(weights_addr + 3 * weights_stride_y));
223 CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 4 * src_stride_y), (__global uchar *)(weights_addr + 4 * weights_stride_y));
224 #elif KERNEL_SIZE == 3 225 CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 0 * src_stride_y), (__global uchar *)(weights_addr + 0 * weights_stride_y));
226 CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
227 CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
228 #elif KERNEL_SIZE == 1 229 int weight = convert_int(*(__global uchar *)weights_addr);
230 int8 input_pixel = convert_int8(INPUT_PIXEL((__global uchar *)src_addr));
231 pixels0 += (input_pixel + input_offset) * ((int8)weight + weight_offset);
234 src_addr += src_stride_z;
235 weights_addr += weights_stride_z;
240 __global
int *bias_addr = ((__global
int *)(
vector_offset(&biases, kernel_index)));
241 pixels0 += (int8)(*bias_addr);
245 pixels0 = pixels0 + output_offset;
247 vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.
ptr);
249 #endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 281 #
if defined(HAS_BIAS)
285 int output_multiplier,
290 #if defined(HAS_BIAS) 292 #endif //defined(HAS_BIAS) 295 int16 vals = vload16(0, (__global
int *)(src.
ptr));
297 #if defined(HAS_BIAS) 299 int bias_value = *((__global
int *)(
vector_offset(&bias, get_global_id(2))));
300 vals += (int16)(bias_value);
301 #endif //defined(HAS_BIAS) 304 vals = vals + output_offset;
307 vstore16(convert_uchar16_sat(vals), 0, (__global uchar *)dst.
ptr);
Structure to hold Vector information.
#define CONVERT_TO_TENSOR3D_STRUCT(name)
__kernel void output_stage_quantized(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *bias_ptr, uint bias_stride_x, uint bias_step_x, uint bias_offset_first_element_in_bytes, int output_offset, int output_multiplier, int output_shift)
This function computes the output stage of a depthwise convolution.
Structure to hold 3D tensor information.
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
#define CONVERT_TO_IMAGE_STRUCT(name)
__global const uchar * tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
Get the pointer position of a Tensor3D.
#define VECTOR_DECLARATION(name)
Structure to hold Image information.
#define TENSOR3D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
__global uchar * ptr
Pointer to the starting postion of the buffer.
convolution configure & src