in_addr += INPUT_PITCH;
- Dtype weight_buf[WEIGHT_PREF];
- int w_idx=0;
-
- for (int i = 0; i < WEIGHT_PREF; i++)
- {
- weight_buf[i] = weights[weight_addr];
- weight_addr += SIMD_SIZE;
- }
-
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
int kr = 0; // kr = Kernel Row
int kc = 0; // kc = Kernel Column
LOOP(KERNEL_WIDTH, kc,
{
+ Dtype weight_value = weights[weight_addr];
+ weight_addr += SIMD_SIZE;
for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
{
for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
{
Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
- out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
+ out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);
}
}
- weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
- weight_addr += SIMD_SIZE;
- ++w_idx;
});
});
- weight_addr -= WEIGHT_PREF * SIMD_SIZE;
}
fm = fm % ALIGNED_NUM_FILTERS;