0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
};
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);
for (; i < m4; i += 4, mi += 4) {
for (j = 0; j < n4; j += 4) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
for (int ii = 0; ii < 8; ii++) {
index_n[ii] = ii * ldc;
}
- __m512i vindex_n = _mm512_loadu_epi64(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
for (; i < m4; i += 4) {
for (j = 0; j < n32; j += 32) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
};
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);
for (i = 0; i < m4; i += 4) {
for (j = 0; j < n4; j += 4) {
0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8,
2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8,
};
- __m512i idx_lo = _mm512_loadu_epi64(permute_table);
- __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 8);
for (i = 0; i < m8; i += 8) {
for (j = 0; j < n16; j += 16) {
0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
};
- idx_lo = _mm512_loadu_epi64(permute_table2);
- idx_hi = _mm512_loadu_epi64(permute_table2 + 8);
+ idx_lo = _mm512_loadu_si512(permute_table2);
+ idx_hi = _mm512_loadu_si512(permute_table2 + 8);
for (j = 0; j < n32; j += 32) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
for (int ii = 0; ii < 8; ii++) {
index_n[ii] = ii * ldc;
}
- __m512i vindex_n = _mm512_loadu_epi64(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
#if !defined(B0)
__m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta));
#endif
for (int ii = 0; ii < 16; ii++) {
index_n[ii] = ii * ldc;
}
- __m512i vindex_n = _mm512_loadu_epi32(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
for (; i < m4; i += 4) {
for (j = 0; j < n64; j += 64) {
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b,
0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f,
};
- __m512i idx_lo = _mm512_loadu_epi32(permute_table);
- __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16);
+ __m512i idx_lo = _mm512_loadu_si512(permute_table);
+ __m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
__mmask16 kc = 0xcccc;
__mmask16 k3 = 0x3333;
__mmask8 mask8 = 0xff; // force use AVX128 instead of SSE
for (int ii = 0; ii < 16; ii++) {
index_n[ii] = ii * ldc;
}
- __m512i vindex_n = _mm512_loadu_epi32(index_n);
+ __m512i vindex_n = _mm512_loadu_si512(index_n);
#if !defined(B0)
__m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta));
#endif