// limitations under the License.
*/
+// Block-writes four uints per work-item through a __local pointer.
+// Component k of `data` is stored at p[lane + k * stride], where
+//   lane   = get_sub_group_local_id()
+//   stride = get_max_sub_group_size()
+// The stride used to be hard-coded to 8, which only matched the layout read
+// back by the block-read helpers in this file when the sub-group size is 8;
+// querying the size keeps writer and readers in agreement.
+//
+// p    - base of the destination region; must hold at least 4 * stride uints.
+// data - the four values this work-item contributes, one per row.
+void FUNC(intel_sub_group_block_write_4)( __local uint* p, uint4 data )
+{
+    const uint lane   = get_sub_group_local_id();
+    const uint stride = get_max_sub_group_size();
+
+    p[lane]              = data.s0;
+    p[lane + stride]     = data.s1;
+    p[lane + 2 * stride] = data.s2;
+    p[lane + 3 * stride] = data.s3;
+}
+
+// Block-reads four uints per work-item through a __local pointer.
+// Component k of the result is p[lane + k * stride], where
+//   lane   = get_sub_group_local_id()
+//   stride = get_max_sub_group_size()
+//
+// p - base of the source region; at least 4 * stride uints are read from it
+//     across the sub-group.
+uint4 FUNC(intel_sub_group_block_read_uint4)(const __local uint* p)
+{
+    const uint lane   = get_sub_group_local_id();
+    const uint stride = get_max_sub_group_size();
+
+    return (uint4)(p[lane],
+                   p[lane + stride],
+                   p[lane + 2 * stride],
+                   p[lane + 3 * stride]);
+}
+
+// Block-reads eight uints per work-item through a __local pointer.
+// Component k of the result is p[lane + k * stride], where
+//   lane   = get_sub_group_local_id()
+//   stride = get_max_sub_group_size()
+//
+// p - base of the source region; at least 8 * stride uints are read from it
+//     across the sub-group.
+uint8 FUNC(intel_sub_group_block_read_uint8)(const __local uint* p)
+{
+    const uint lane   = get_sub_group_local_id();
+    const uint stride = get_max_sub_group_size();
+
+    return (uint8)(p[lane],
+                   p[lane + stride],
+                   p[lane + 2 * stride],
+                   p[lane + 3 * stride],
+                   p[lane + 4 * stride],
+                   p[lane + 5 * stride],
+                   p[lane + 6 * stride],
+                   p[lane + 7 * stride]);
+}
+
// Adds the product of the first components of `input` and `weight` to `acc`.
// NOTE(review): `ret` is not declared anywhere in the visible body, so the
// return statement does not match the accumulation above it. This looks like
// lines elided between the two statements — confirm against the full file
// before relying on this definition.
inline int FUNC(mmad_4)(char4 input, char4 weight, int acc)
{
acc += (input[0] * weight[0]);
return ret;
}
+// TODO: remove this fallback once the cl_intel_subgroups_char extension works.
+// Block-writes eight uchars per work-item through a __global pointer.
+// When cl_intel_subgroups_char is defined the work is delegated to
+// intel_sub_group_block_write_uc8; otherwise component k of `v` is stored at
+// outPtr[lane + k * stride], with lane = get_sub_group_local_id() and
+// stride = get_max_sub_group_size().
+//
+// outPtr - base of the destination region.
+// v      - the eight values this work-item contributes, one per row.
+inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc8(outPtr, v);
+#else
+    const uint lane   = get_sub_group_local_id();
+    const uint stride = get_max_sub_group_size();
+
+    outPtr[lane]              = v.s0;
+    outPtr[lane + stride]     = v.s1;
+    outPtr[lane + 2 * stride] = v.s2;
+    outPtr[lane + 3 * stride] = v.s3;
+    outPtr[lane + 4 * stride] = v.s4;
+    outPtr[lane + 5 * stride] = v.s5;
+    outPtr[lane + 6 * stride] = v.s6;
+    outPtr[lane + 7 * stride] = v.s7;
+#endif
+}
+
+// Block-reads eight uchars per work-item through a __global pointer.
+// When cl_intel_subgroups_char is defined the work is delegated to
+// intel_sub_group_block_read_uc8; otherwise component k of the result is
+// ptr[lane + k * stride], with lane = get_sub_group_local_id() and
+// stride = get_max_sub_group_size().
+//
+// ptr - base of the source region.
+inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc8(ptr);
+#else
+    const uint lane   = get_sub_group_local_id();
+    const uint stride = get_max_sub_group_size();
+
+    return (uchar8)(ptr[lane],
+                    ptr[lane + stride],
+                    ptr[lane + 2 * stride],
+                    ptr[lane + 3 * stride],
+                    ptr[lane + 4 * stride],
+                    ptr[lane + 5 * stride],
+                    ptr[lane + 6 * stride],
+                    ptr[lane + 7 * stride]);
+#endif
+}
+
+//
+
// Convenience wrappers: each macro forwards its arguments unchanged to the
// helper that FUNC_CALL resolves for the given name. The SLM_BLOCK_* macros
// target the block read/write helpers that take a __local uint pointer.
#define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C)
#define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C)
#define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C)
+#define SLM_BLOCK_WRITE_4(A, B) (FUNC_CALL(intel_sub_group_block_write_4)(A, B))
+#define SLM_BLOCK_READ_4(A) (FUNC_CALL(intel_sub_group_block_read_uint4)(A))
+#define SLM_BLOCK_READ_8(A) (FUNC_CALL(intel_sub_group_block_read_uint8)(A))