int w[NUM_FILTERS];
int in_addr;
+#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) // more than one filter group and FILTER_IFM_NUM not divisible by PACK
+ int in_start_addr = INPUT0_GET_INDEX(batch, 0, input_y, input_x + sglid); // index of feature 0 at (batch, input_y, input_x + sglid); per-feature and per-row offsets are added where the input is read
+#endif // (FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)
+
#ifdef BLOCK_LOAD_WEIGHTS
int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4);
#else
for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
{
#if INPUT0_LAYOUT_B_FS_YX_FSV16
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) // more than one filter group and FILTER_IFM_NUM not divisible by PACK
+ int feature_location = kd * PACK + g * FILTER_IFM_NUM; // first feature index of pack kd, shifted by g * FILTER_IFM_NUM for group g
+ #else
in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid); // address the pack directly: feature index is (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK
+ #endif // (FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)
#else
#ifdef BLOCK_LOAD_INPUTS
in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
#endif
in_addr += batch * input_size; // adjust for batching
#endif
+
for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
#if INPUT0_LAYOUT_B_FS_YX_FSV16
+ #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)) // gather path: fill in[reg] one element at a time instead of with a single packed load
+ INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg]; // element-wise view of in[reg]
+ in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV; // offset from in_start_addr for block row reg
+ for (uint v = 0; v < PACK; v++) { // one iteration per element of the pack
+ int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV; // offset of feature (feature_location + v): its FSV-sized block index (plus INPUT0_PAD_BEFORE_FEATURE_NUM / FSV leading blocks) times INPUT0_FEATURE_PITCH * FSV, plus its position inside the block
+ input_int8_arr[v] = conv_input[in_addr + f_addr];
+ }
+ #else
in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr); // single load of one PACKED_TYPE value at in_addr
in_addr += (INPUT0_SIZE_X + IWPAD) * 16; // advance in_addr for the next reg; NOTE(review): presumably one padded input row of 16-feature slices - confirm
+ #endif // (FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0)
#else
#ifdef BLOCK_LOAD_INPUTS
in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
// Input data format, Implementation name
+ // Format: b_fs_yx_fsv16
+ TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+ TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
-
+
// Format: b_fs_zyx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),