// NOTE(review): extraction-garbled OpenCL C. The bare integers fused into these
// lines (26, 27, 30, ...) are original source line numbers, not code, and the
// numbering jumps show that several original lines (and the #if VECTOR_SIZE == 2
// opening this dispatch that the "2"-suffixed macros belong to) are missing from
// this view. Code left byte-identical; comments only.
//
// MAX_OP / ADD_OP: elementwise max / add. The `type` and `size` parameters are
// unused in these definitions — presumably kept for signature parity with other
// variants of these macros elsewhere; confirm against the full file.
26 #define MAX_OP(x, y, type, size) max((x), (y)) 27 #define ADD_OP(x, y, type, size) ((x) + (y)) 30 #if !defined(GRID_SIZE) 35 __constant uint2
// idx__: per-lane indices 0..VECTOR_SIZE-1, used below to build boundary masks
// (see the `widx` computations in the kernels).
idx__ = (uint2)(0, 1);
// VECTOR_SIZE == 2 variant: asymm_* wrappers bind the fixed-point helper macros
// (ASYMM_MULT / ASYMM_EXP_ON_NEGATIVE_VALUES / ASYMM_RESCALE) to 2-wide vectors.
36 #define asymm_mult(a, b) ASYMM_MULT(a, b, 2) 37 #define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2) 38 #define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2) 40 #elif VECTOR_SIZE == 4 41 __constant uint4
idx__ = (uint4)(0, 1, 2, 3);
// VECTOR_SIZE == 4 variant of the same wrappers.
42 #define asymm_mult(a, b) ASYMM_MULT(a, b, 4) 43 #define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4) 44 #define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4) 46 #elif VECTOR_SIZE == 8 47 __constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
// VECTOR_SIZE == 8 variant; the final arm (original lines 53-55) forces
// VECTOR_SIZE to 16 — presumably the #else / default branch, but the #else
// and #endif directives themselves are not visible here.
48 #define asymm_mult(a, b) ASYMM_MULT(a, b, 8) 49 #define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8) 50 #define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8) 53 #define VECTOR_SIZE 16 54 #define LOG_VECTOR_SIZE 4 55 __constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
// VECTOR_SIZE == 16 variant, plus VEC_* convenience aliases built on
// VEC_DATA_TYPE. The trailing #if/if fragment (original lines 70-71) is the
// start of mult_by_quantized_multiplier_serial — its signature (original
// line ~66-69) is missing from this view; see the parallel variant below for
// the apparent shape of this helper.
56 #define asymm_mult(a, b) ASYMM_MULT(a, b, 16) 57 #define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16) 58 #define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16) 62 #define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE) 63 #define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE) 64 #define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) 70 #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) 71 if(INPUT_BETA_MULTIPLIER > 1)
// Applies the quantized beta scale: left-shift then fixed-point multiply.
// The else/identity path and closing #endif are not visible here.
73 return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
// mult_by_quantized_multiplier_parallel: int4 counterpart of the serial helper
// above. When the beta multiplier/shift build options are defined and the
// multiplier is > 1, scales `data` by shifting left INPUT_BETA_LEFT_SHIFT bits
// and applying the 4-lane fixed-point multiply ASYMM_MULT.
// NOTE(review): garbled view — the function's braces, the fall-through path
// that presumably returns `data` unchanged, and the closing #endif are missing
// (original lines 80, 83, 85-90 absent). Do not edit without the full source.
79 int4 mult_by_quantized_multiplier_parallel(int4 data)
81 #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) 82 if(INPUT_BETA_MULTIPLIER > 1)
84 return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
// Serial (single-work-item-per-row) quantized softmax pass 1: computes the
// per-row max, then the shifted/rescaled values and their row sum.
// NOTE(review): extraction-garbled — the kernel's parameter list (original
// lines 132-146) and many body lines (declarations of `data`, `widx`, `sum1D`,
// `width4`, the exp computation around original lines 191/204, and all braces)
// are missing. Code kept byte-identical; comments only.
131 __kernel
void softmax_layer_max_shift_exp_sum_quantized_serial(
// --- Phase 1: vector max over the row, VECTOR_SIZE lanes at a time. ---
147 for(uint i = 0; i < width4; i++)
150 max_val_vec =
MAX_OP(data, max_val_vec, uchar, 16);
// Leftover (non-multiple-of-VECTOR_SIZE) tail: `widx` masks out-of-range lanes
// so they contribute uchar_min instead of garbage.
153 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 158 max_val_vec =
MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, 16);
// Horizontal max reduction: fold 16 -> 8 -> 4 -> 2 -> 1 lanes.
162 #if VECTOR_SIZE == 16 163 max_val_vec.s01234567 =
MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
166 max_val_vec.s0123 =
MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
169 max_val_vec.s01 =
MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
171 max_val_vec.s0 =
MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
// Store the row maximum to the `maxo` output image.
174 *((__global uchar *)maxo.
ptr) = max_val_vec.s0;
// --- Phase 2: shift by max, rescale, and accumulate the row sum. ---
179 int max_val = convert_int(*((__global uchar *)
offset(&maxo, 0, 0)));
185 for(uint i = 0; i < width4; i++)
// data_diff = value - row_max (always <= 0); data_diff_mult feeds the
// fixed-point exp (that call, original line ~191, is missing from this view).
189 VEC_INT data_diff = data_fp - max_val;
190 VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
192 data_fp =
asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
// Garbled vstore of data_diff to `dst` (the vstore name itself, original
// line 194 start, is missing).
194 (data_diff, 0, (__global
int *)
offset(&dst, i << LOG_VECTOR_SIZE, 0));
// Only lanes with diff >= DIFF_MIN contribute to the sum; others add 0.
195 sum1D = sum1D + select(0, data_fp, data_diff >= (
VEC_INT)(DIFF_MIN));
// Tail handling for a row width that is not a multiple of VECTOR_SIZE:
// same computation, with `widx_` masking the overrun lanes out of the sum.
198 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 202 VEC_INT data_diff = data_fp - max_val;
203 VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
205 data_fp =
asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
208 (data_diff, 0, (__global
int *)
offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
209 data_fp = select(0, data_fp, data_diff >= (
VEC_INT)(DIFF_MIN));
210 sum1D = sum1D + select(0, data_fp, widx_);
// Horizontal add reduction of sum1D: fold 16 -> 8 -> 4 -> 2 -> 1 lanes.
// NOTE(review): the `uchar` type argument passed to ADD_OP looks suspect for
// an int-vector sum, but ADD_OP ignores its type/size arguments, so it has no
// effect as defined above — verify against the complete file.
214 #if VECTOR_SIZE == 16 215 sum1D.s01234567 =
ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, uchar, 8);
218 sum1D.s0123 =
ADD_OP(sum1D.s0123, sum1D.s4567, uchar, 4);
221 sum1D.s01 =
ADD_OP(sum1D.s01, sum1D.s23, uchar, 2);
223 sum1D.s0 =
ADD_OP(sum1D.s0, sum1D.s1, uchar, 1);
// Store the row sum to the `sum` output image.
226 *((__global
int *)sum.
ptr) = sum1D.s0;
// Parallel quantized softmax pass 1: GRID_SIZE work-items cooperate on one row.
// Each work-item strides through the row in uchar4 chunks, then partial
// max / partial sum results are combined through `tmp_local` with a
// barrier-separated tree reduction.
// NOTE(review): extraction-garbled — the parameter list (original lines
// 271-281), the `tmp_local`, `sum1D`, `width`, `row`, and loop-index `i`
// declarations, all braces, the per-step `#if GRID_SIZE >= N` guards around
// the reduction ladder, and the exp computations (original lines ~408/421/439)
// are missing. Code kept byte-identical; comments only.
270 __kernel
void softmax_layer_max_shift_exp_sum_quantized_parallel(
282 const uint4
idx4 = (uint4)(0, 1, 2, 3);
283 const uint lid = get_local_id(0);
// Work-group-shared slot for the final row maximum.
287 __local uchar max_local;
289 uchar4 uchar_min = (uchar4)0;
290 uchar4 max_val_vec = uchar_min;
// Each work-item covers width_ = row/4 uchar4 chunks, strided by GRID_SIZE*4.
295 const uint width_ = row >> 2;
// --- Phase 1: strided per-work-item max. ---
298 for(; i < width_; i++)
300 uchar4 data_max = vload4(0, (__global uchar *)
offset(&src, i *
GRID_SIZE * 4, 0));
301 max_val_vec =
MAX_OP(data_max, max_val_vec, uchar, 4);
// Boundary work-items pick up the chunks left over when width is not a
// multiple of GRID_SIZE*4.
303 #ifdef NON_MULTIPLE_OF_GRID_SIZE 305 int boundary_workitems = (width % (
GRID_SIZE * 4)) / 4;
306 if(lid < boundary_workitems)
308 uchar4 data_max = vload4(0, (__global uchar *)
offset(&src, i *
GRID_SIZE * 4, 0));
309 max_val_vec =
MAX_OP(data_max, max_val_vec, uchar, 4);
// Sub-uchar4 tail: the last boundary work-item loads one extra chunk and
// masks out-of-range lanes via `widx` before folding it into the max.
311 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 312 if(boundary_workitems == 0)
317 if(lid == (boundary_workitems - 1))
320 uchar4 data_max = vload4(0, (__global uchar *)
offset(&src, (
GRID_SIZE * i * 4) + 4, 0));
321 uchar4 widx = convert_uchar4(((uint4)(
GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
322 max_val_vec =
MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
// --- Tree reduction of the per-work-item maxima through local memory. ---
// The strides 128/64/32/16/8/4/2 halve the active range each step; the
// original `#if GRID_SIZE >= N` / `if(lid < N)` guards around each step are
// missing from this view.
326 tmp_local[lid] = convert_int4(max_val_vec);
328 barrier(CLK_LOCAL_MEM_FENCE);
334 tmp_local[lid] =
MAX_OP(tmp_local[lid + 128], tmp_local[lid],
int, 4);
336 barrier(CLK_LOCAL_MEM_FENCE);
342 tmp_local[lid] =
MAX_OP(tmp_local[lid + 64], tmp_local[lid],
int, 4);
344 barrier(CLK_LOCAL_MEM_FENCE);
350 tmp_local[lid] =
MAX_OP(tmp_local[lid + 32], tmp_local[lid],
int, 4);
352 barrier(CLK_LOCAL_MEM_FENCE);
358 tmp_local[lid] =
MAX_OP(tmp_local[lid + 16], tmp_local[lid],
int, 4);
360 barrier(CLK_LOCAL_MEM_FENCE);
366 tmp_local[lid] =
MAX_OP(tmp_local[lid + 8], tmp_local[lid],
int, 4);
368 barrier(CLK_LOCAL_MEM_FENCE);
374 tmp_local[lid] =
MAX_OP(tmp_local[lid + 4], tmp_local[lid],
int, 4);
376 barrier(CLK_LOCAL_MEM_FENCE);
382 tmp_local[lid] =
MAX_OP(tmp_local[lid + 2], tmp_local[lid],
int, 4);
384 barrier(CLK_LOCAL_MEM_FENCE);
// Final fold (presumably executed by lid == 0 only; the guard is not visible):
// combine the last two slots, then reduce the 4 lanes to a scalar and publish
// it to the work-group via max_local.
388 max_val_vec =
MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
389 max_val_vec.s01 =
MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
390 max_val_vec.s0 =
MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
391 max_local = max_val_vec.s0;
393 barrier(CLK_LOCAL_MEM_FENCE);
// --- Phase 2: shift by max, rescale, store diffs, accumulate partial sums. ---
399 int max_val = convert_int(max_local);
402 for(i = 0; i < width_; i++)
404 uchar4 data = vload4(0, (__global uchar *)
offset(&src, i *
GRID_SIZE * 4, 0));
405 int4 data_fp = convert_int4(data);
406 int4 data_diff = data_fp - max_val;
407 int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
// The fixed-point exp on data_diff_mult (original line ~408) is missing here;
// data_fp is then rescaled into EXP_ACCUMULATION_INT_BITS format.
409 data_fp =
ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
410 vstore4(data_diff, 0, (__global
int *)
offset(&dst, i * GRID_SIZE * 4, 0));
// Lanes below DIFF_MIN are treated as exp() == 0.
411 sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
// Grid-boundary tail: same computation for the leftover chunks.
413 #ifdef NON_MULTIPLE_OF_GRID_SIZE 414 boundary_workitems = (width % (
GRID_SIZE * 4)) / 4;
415 if(lid < boundary_workitems)
417 uchar4 data = vload4(0, (__global uchar *)
offset(&src, i *
GRID_SIZE * 4, 0));
418 int4 data_fp = convert_int4(data);
419 int4 data_diff = data_fp - max_val;
420 int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
422 data_fp =
ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
423 vstore4(data_diff, 0, (__global
int *)
offset(&dst, i * GRID_SIZE * 4, 0));
424 sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
// Sub-uchar4 tail, mirroring the max-phase boundary handling: `widx` zeroes
// the lanes past `width` before they enter the sum.
426 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE 427 if(boundary_workitems == 0)
432 if(lid == (boundary_workitems - 1))
435 uchar4 data = vload4(0, (__global uchar *)
offset(&src, i *
GRID_SIZE * 4 + 4, 0));
436 int4 data_fp = convert_int4(data);
437 int4 data_diff = data_fp - max_val;
438 int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
440 data_fp =
ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
441 int4 widx = convert_int4(((uint4)(
GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
442 data_fp = select(0, data_fp, widx);
443 vstore4(data_diff, 0, (__global
int *)
offset(&dst, i *
GRID_SIZE * 4 + 4, 0));
444 sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
// --- Tree reduction of partial sums, same ladder shape as the max phase. ---
448 tmp_local[lid] = sum1D;
450 barrier(CLK_LOCAL_MEM_FENCE);
456 tmp_local[lid] =
ADD_OP(tmp_local[lid + 128], tmp_local[lid],
int, 4);
458 barrier(CLK_LOCAL_MEM_FENCE);
464 tmp_local[lid] =
ADD_OP(tmp_local[lid + 64], tmp_local[lid],
int, 4);
466 barrier(CLK_LOCAL_MEM_FENCE);
472 tmp_local[lid] =
ADD_OP(tmp_local[lid + 32], tmp_local[lid],
int, 4);
474 barrier(CLK_LOCAL_MEM_FENCE);
480 tmp_local[lid] =
ADD_OP(tmp_local[lid + 16], tmp_local[lid],
int, 4);
482 barrier(CLK_LOCAL_MEM_FENCE);
488 tmp_local[lid] =
ADD_OP(tmp_local[lid + 8], tmp_local[lid],
int, 4);
490 barrier(CLK_LOCAL_MEM_FENCE);
496 tmp_local[lid] =
ADD_OP(tmp_local[lid + 4], tmp_local[lid],
int, 4);
498 barrier(CLK_LOCAL_MEM_FENCE);
504 tmp_local[lid] =
ADD_OP(tmp_local[lid + 2], tmp_local[lid],
int, 4);
506 barrier(CLK_LOCAL_MEM_FENCE);
// Final fold (presumably lid == 0 only): combine the last two slots, reduce
// the 4 lanes to a scalar, and store the row sum to the `sum` output image.
510 sum1D =
ADD_OP(tmp_local[lid + 1], tmp_local[lid],
int, 4);
512 sum1D.s01 =
ADD_OP(sum1D.s01, sum1D.s23,
int, 2);
513 sum1D.s0 =
ADD_OP(sum1D.s0, sum1D.s1,
int, 1);
514 *((__global
int *)sum.
ptr) = sum1D.s0;
// Quantized softmax pass 2: normalize the stored diffs by the row sum.
// Builds a fixed-point representation of the sum for the reciprocal
// computation, re-applies the beta scale, and writes saturated uchar16 output.
// NOTE(review): extraction-garbled — the parameter list (original lines
// 550-558), the reciprocal/exp computation that produces `data` (original
// lines ~567-581), and all braces/#endif directives are missing. Code kept
// byte-identical; comments only.
549 __kernel
void softmax_layer_norm_quantized(
// Per-row sum produced by pass 1, indexed by global id 1 (the row).
559 int sum_val = *((__global
int *)
offset(&sum, 0, get_global_id(1)));
// Normalize the sum into [0.5, 1) fixed-point form: clz gives the shift that
// puts the top set bit at bit 31; subtracting 2^31 yields (sum_norm - 1).
562 uint sum_val_u = convert_uint(sum_val);
563 int headroom_plus_one =
clz(sum_val_u);
564 int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
565 int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
566 int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
// Reload the diffs stored by pass 1 and re-apply the beta scale (same
// shift-then-ASYMM_MULT pattern as the mult_by_quantized_multiplier helpers).
570 int16 data_diff = vload16(0, (__global
int *)
offset(&src, 0, 0));
571 int16 data_diff_mult = data_diff;
572 #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) 573 if(INPUT_BETA_MULTIPLIER > 1)
575 data_diff_mult =
ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
// Zero out lanes whose diff fell below DIFF_MIN (their exp is treated as 0),
// then saturate-convert to uchar16 and store the final softmax output.
582 data = select(0, data, data_diff >= (int16)(DIFF_MIN));
583 vstore16(convert_uchar16_sat(data), 0, (__global uchar *)
offset(&dst, 0, 0));
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)
#define ASYMM_MULT(a, b, size)
#define MAX_OP(x, y, type, size)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size)
#define ADD_OP(x, y, type, size)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits)
Structure to hold Image information.
#define TENSOR3D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting position of the buffer.
#define asymm_exp_on_negative_values(a, k_integer_bits)
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
constexpr int clz(T value)
Count the number of leading zero bits in the given value.
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size)
convolution configure & src