+#else
+ INPUT_PACKED_TYPE input_data[UNROLL_FACTOR]; // one packed input value per unrolled step, filled by the load loop below
+ FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR]; // one 8-wide packed weights vector per unrolled step
+
+ __attribute__((opencl_unroll_hint)) // unroll hint for the load loop that follows
+ for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { // pass 1: issue every input and weights read before any MMAD is computed
+ input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input +
+ input_idx + kb * MMAD_INPUT_FBLOCK_PITCH))); // one uint per work-item; step kb is offset by kb * MMAD_INPUT_FBLOCK_PITCH
+
+ uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); // eight uints per work-item; step kb is offset by kb * MMAD_FILTER_FBLOCK_PITCH
+ weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0); // NOTE(review): AS_TYPE is presumably a bit reinterpretation, not a value conversion - confirm against its definition
+ }
+
+ __attribute__((opencl_unroll_hint)) // unroll hint for the compute loop that follows
+ for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { // pass 2: consume the values loaded above, in the same kb order
+ INPUT_PACKED_TYPE_8 in; // gathers the input_data[kb] values held by sub-group lanes 0..7
+
+ in.s0 = sub_group_broadcast(input_data[kb], 0); // component sN takes lane N's value, so every work-item builds the same vector
+ in.s1 = sub_group_broadcast(input_data[kb], 1);
+ in.s2 = sub_group_broadcast(input_data[kb], 2);
+ in.s3 = sub_group_broadcast(input_data[kb], 3);
+ in.s4 = sub_group_broadcast(input_data[kb], 4);
+ in.s5 = sub_group_broadcast(input_data[kb], 5);
+ in.s6 = sub_group_broadcast(input_data[kb], 6);
+ in.s7 = sub_group_broadcast(input_data[kb], 7); // NOTE(review): lanes 0..7 must exist, i.e. sub-group size >= 8 - confirm kernel attributes
+
+ dotProd = MMAD_8(in, weights_data[kb], dotProd); // accumulate: dotProd is both the third operand and the destination
+ }
+#endif // UNROLL_FACTOR < 2