const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
#endif
const uint input_offset = INPUT0_OFFSET + batch_offset*INPUT0_BATCH_PITCH + in_split_offset;
+#if GROUPED && !DEPTHWISE_SEPARABLE_OPT
+ const uint filter_offset = split_idx * FILTER_LENGTH;
+#else
+ const uint filter_offset = 0;
+#endif
for (uint i = start_y; i < FILTER_SIZE_Y; i+=STRIDE_SIZE_Y)
{
uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH;
#if GRADIENT
- uint filter_idx = ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
+ uint filter_idx = filter_offset + ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
for (uint h = 0; h < FILTER_OFM_NUM; h++)
{
result = fma(input[input_idx], filter[filter_idx], result);
input_idx += INPUT0_FEATURE_PITCH;
}
#else
- uint filter_idx = ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
+ uint filter_idx = filter_offset + ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
for (uint h = 0; h < FILTER_IFM_NUM; h++)
{
result = fma(input[input_idx], filter[filter_idx], result);
}
}
#if BIAS_TERM
- result += bias[ofm_offset];
+#if GROUPED && !DEPTHWISE_SEPARABLE_OPT
+ const uint bias_offset = split_idx * BIAS_LENGTH;
+#else
+ const uint bias_offset = 0;
+#endif
+ result += bias[ofm_offset + bias_offset];
#endif
const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
const uint dst_index = OUTPUT_OFFSET + out_split_offset + batch_offset*OUTPUT_BATCH_PITCH + ofm_offset*OUTPUT_FEATURE_PITCH + id_y*OUTPUT_Y_PITCH + id_x*OUTPUT_X_PITCH;