27 #define MAX_OP(x, y, type, size) max((x), (y)) 28 #define ADD_OP(x, y, type, size) ((x) + (y)) 31 __constant uint16
idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
67 const uint width4 = width >> 4;
68 for(uint i = 0; i < width4; i++)
70 uchar16 data = vload16(0, (__global uchar *)
offset(&src, i << 4, 0));
71 max_val =
MAX_OP(data, max_val, uchar, 16);
74 #ifdef NON_MULTIPLE_OF_16 76 uchar16 data = vload16(0, (__global uchar *)
offset(&src, width4 << 4, 0));
77 uchar16 widx = convert_uchar16(((uint16)(width4 << 4) +
idx16) < width);
78 max_val =
MAX_OP(max_val, select(
type_min, data, widx), uchar, 16);
82 max_val.s01234567 =
MAX_OP(max_val.s01234567, max_val.s89ABCDEF, uchar, 8);
83 max_val.s0123 =
MAX_OP(max_val.s0123, max_val.s4567, uchar, 4);
84 max_val.s01 =
MAX_OP(max_val.s01, max_val.s23, uchar, 2);
85 max_val.s0 =
MAX_OP(max_val.s0, max_val.s1, uchar, 1);
88 *((__global uchar *)dst.
ptr) = max_val.s0;
93 int16 mult_by_quantized_multiplier(int16 data)
95 #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) 96 if(INPUT_BETA_MULTIPLIER > 1)
98 return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
145 __kernel
void softmax_layer_shift_exp_sum_quantized(
158 int max_val = convert_int(*((__global uchar *)
offset(&max, 0, 0)));
164 const uint width4 = width >> 4;
165 for(uint i = 0; i < width4; i++)
167 uchar16 data = vload16(0, (__global uchar *)
offset(&src, i << 4, 0));
168 int16 data_fp = convert_int16(data);
169 int16 data_diff = data_fp - max_val;
170 int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
172 data_fp =
asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
173 vstore16(data_diff, 0, (__global
int *)
offset(&dst, i << 4, 0));
174 sum1D = sum1D + select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
177 #ifdef NON_MULTIPLE_OF_16 179 uchar16 data = vload16(0, (__global uchar *)
offset(&src, width4 << 4, 0));
180 int16 data_fp = convert_int16(data);
181 int16 data_diff = data_fp - max_val;
182 int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
184 data_fp =
asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
185 int16 widx = convert_int16(((uint16)(width4 << 4) +
idx16) < width);
186 vstore16(data_diff, 0, (__global
int *)
offset(&dst, width4 << 4, 0));
187 data_fp = select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
188 sum1D = sum1D + select(0, data_fp, widx);
192 sum1D.s01234567 =
ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF,
qs16, 8);
193 sum1D.s0123 =
ADD_OP(sum1D.s0123, sum1D.s4567,
qs16, 4);
194 sum1D.s01 =
ADD_OP(sum1D.s01, sum1D.s23,
qs16, 2);
195 sum1D.s0 =
ADD_OP(sum1D.s0, sum1D.s1,
qs16, 1);
198 *((__global
int *)sum.
ptr) = sum1D.s0;
232 __kernel
void softmax_layer_norm_quantized(
242 int sum_val = *((__global
int *)
offset(&sum, 0, get_global_id(1)));
245 uint sum_val_u = convert_uint(sum_val);
246 int headroom_plus_one =
clz(sum_val_u);
247 int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
248 int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
249 int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
253 int16 data_diff = vload16(0, (__global
int *)
offset(&src, 0, 0));
254 int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
259 data = select(0, data, data_diff >= (int16)(DIFF_MIN));
260 vstore16(convert_uchar16_sat(data), 0, (__global uchar *)
offset(&dst, 0, 0));
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)
int16 asymm_exp_on_negative_values(int16 a, int k_integer_bits)
Calculates for x < 0.
__constant uchar16 type_min
int16 asymm_one_over_one_plus_x_for_x_in_0_1(int16 a)
Calculates for x in (0, 1).
#define MAX_OP(x, y, type, size)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define ADD_OP(x, y, type, size)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
__kernel void softmax_layer_max_quantized(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, uint width)
Identifies the maximum value across the 1st dimension.
int16 asymm_rescale(int16 value, int src_integer_bits, int dst_integer_bits)
Considering the integer value as fixed-point, change the number of integer bits and update value acco...
int16 asymm_mult(int16 a, int16 b)
Fixed-point multiplication.
int16 asymm_rounding_divide_by_pow2(int16 x, int exponent)
Correctly rounded to nearest division by a power of two.
Structure to hold Image information.
#define TENSOR3D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
constexpr int clz(T value)
Count the number of leading zero bits in the given value.
fixed_point< T > max(fixed_point< T > x, fixed_point< T > y)
convolution configure & src