#endif //defined(HAS_BIAS)
)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
+ __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
+ Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
// Extract channel and linearized batch indices
const int channel = get_global_id(2) % DST_CHANNELS;
#endif //defined(HAS_BIAS)
// Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+ src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
// Row0
int8 left, middle, right;
- GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
+ GET_VALUES(src_addr + 0 * src_stride_y, left, middle, right);
values0 += left * (int8)(w0.s0);
values0 += middle * (int8)(w0.s1);
values0 += right * (int8)(w0.s2);
#endif /* WEIGHTS_OFFSET != 0 */
// Row1
- GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
+ GET_VALUES(src_addr + DILATION_Y * src_stride_y, left, middle, right);
values0 += left * (int8)(w1.s0);
values0 += middle * (int8)(w1.s1);
values0 += right * (int8)(w1.s2);
+
#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
values1 += left * (int8)(w0.s0);
values1 += middle * (int8)(w0.s1);
#endif /* WEIGHTS_OFFSET != 0 */
// Row2
- GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
+ GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left, middle, right);
values0 += left * (int8)(w2.s0);
values0 += middle * (int8)(w2.s1);
values0 += right * (int8)(w2.s2);
#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
// Row3
- GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
+ GET_VALUES(src_addr + 3 * src_stride_y, left, middle, right);
values1 += left * (int8)(w2.s0);
values1 += middle * (int8)(w2.s1);
values1 += right * (int8)(w2.s2);
#endif //defined(HAS_BIAS)
)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
- Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
+ __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
+ Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+ Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
+ Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
// Extract channel and linearized batch indices
const int channel = get_global_id(2) % DST_CHANNELS;
#endif //defined(HAS_BIAS)
// Load relevant input and weights data (accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
- src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+ src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
VEC_TYPE(3)
int8 values0 = 0;
int8 sum0 = 0;
- GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
- GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
- GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
+ GET_VALUES(src_addr + 0 * src_stride_y, left0, middle0, right0);
+ GET_VALUES(src_addr + DILATION_Y * src_stride_y, left1, middle1, right1);
+ GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
#if WEIGHTS_OFFSET != 0
sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
int8 values1 = 0;
int8 sum1 = 0;
- GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
+ GET_VALUES(src_addr + 3 * src_stride_y, left3, middle3, right3);
#if WEIGHTS_OFFSET != 0
sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
// z_coord is clamped to SRC_DIM_2 using an unsigned comparison, so a negative value (which wraps to a large uint) is clamped as well
// The resulting offset is then clamped to max_offset before it is used for the load
z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
+ z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
+ offset = min(offset, (int4)max_offset);
VEC_TYPE(VEC_SIZE)
values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
VEC_TYPE(VEC_SIZE)
// z == 2
// Offset can be out-of-bound so we need to check if it is greater than max_offset
z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
+ z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
offset = y_offset + (int4)(z_coord * src_stride_z);
offset = min(offset, (int4)max_offset);
VEC_TYPE(VEC_SIZE)
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
PaddingSize{ 4, 36, 4, 4 }})),
framework::dataset::make("Strides", {
Strides{},
- Strides{ 1U, 50U },
- Strides{ 1U, 50U },
- Strides{ 1U, 50U, 900U },
- Strides{ 1U, 50U, 900U, 9000U },
- Strides{ 1U, 50U, 900U, 9000U, 90000U },
+ Strides{ 1U, 50U, 50U, 50U, 50U, 50U },
+ Strides{ 1U, 50U, 900U, 900U, 900U, 900U },
+ Strides{ 1U, 50U, 900U, 900U, 900U, 900U },
+ Strides{ 1U, 50U, 900U, 9000U, 9000U, 9000U },
+ Strides{ 1U, 50U, 900U, 9000U, 90000U, 90000U },
Strides{ 1U, 50U, 900U, 9000U, 90000U, 900000U }})),
framework::dataset::make("Offset", { 0U, 4U, 204U, 204U, 204U, 204U, 204U })),
shape, auto_padding, strides, offset)