.. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
+.. option:: -mavxvnni, -mno-avxvnni
+
.. option:: -mbmi, -mno-bmi
.. option:: -mbmi2, -mno-bmi2
- Support for ``UINTR`` instructions has been added.
+- Support for ``AVXVNNI`` instructions has been added.
+
Internal API Changes
--------------------
TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
+def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
+def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;
HasAMXINT8 = true;
} else if (Feature == "+amx-tile") {
HasAMXTILE = true;
+ } else if (Feature == "+avxvnni") {
+ HasAVXVNNI = true;
} else if (Feature == "+serialize") {
HasSERIALIZE = true;
} else if (Feature == "+tsxldtrk") {
Builder.defineMacro("__AMXINT8__");
if (HasAMXBF16)
Builder.defineMacro("__AMXBF16__");
+ if (HasAVXVNNI)
+ Builder.defineMacro("__AVXVNNI__");
if (HasSERIALIZE)
Builder.defineMacro("__SERIALIZE__");
if (HasTSXLDTRK)
.Case("avx512vbmi2", true)
.Case("avx512ifma", true)
.Case("avx512vp2intersect", true)
+ .Case("avxvnni", true)
.Case("bmi", true)
.Case("bmi2", true)
.Case("cldemote", true)
.Case("amx-bf16", HasAMXBF16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
+ .Case("avxvnni", HasAVXVNNI)
.Case("avx", SSELevel >= AVX)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
bool HasKL = false; // For key locker
bool HasWIDEKL = false; // For wide key locker
bool HasHRESET = false;
+ bool HasAVXVNNI = false;
bool HasAMXTILE = false;
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
avx512vnniintrin.h
avx512vlvnniintrin.h
avxintrin.h
+ avxvnniintrin.h
bmi2intrin.h
bmiintrin.h
__clang_cuda_builtin_vars.h
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusd_epi32(S, A, B) \
+ (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
- (__v8si)__B);
-}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusds_epi32(S, A, B) \
+ (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssd_epi32(S, A, B) \
+ (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssds_epi32(S, A, B) \
+ (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusd_epi32(S, A, B) \
+ (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusds_epi32(S, A, B) \
+ (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssd_epi32(S, A, B) \
+ (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssds_epi32(S, A, B) \
+ (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
- (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
- (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
- (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
- (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
- (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
- (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
- (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
--- /dev/null
+/*===--------------- avxvnniintrin.h - AVXVNNI intrinsics -----------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINTRIN_H
+#define __AVXVNNIINTRIN_H
+
+/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
+/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+
+/* Intrinsics with _avx_ prefix are for compatibility with MSVC. */
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINTRIN_H
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXVNNI__)
+#include <avxvnniintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512DQ__)
#include <avx512dqintrin.h>
#endif
--- /dev/null
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpbusd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.256
+ return _mm256_dpbusd_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpbusds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.256
+ return _mm256_dpbusds_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpwssd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.256
+ return _mm256_dpwssd_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpwssds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.256
+ return _mm256_dpwssds_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.128
+ return _mm_dpbusd_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.128
+ return _mm_dpbusds_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssd_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.128
+ return _mm_dpwssd_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssds_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.128
+ return _mm_dpwssds_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpbusd_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.256
+ return _mm256_dpbusd_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpbusds_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.256
+ return _mm256_dpbusds_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpwssd_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.256
+ return _mm256_dpwssd_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+ // CHECK-LABEL: @test_mm256_dpwssds_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.256
+ return _mm256_dpwssds_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusd_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusd.128
+ return _mm_dpbusd_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpbusds_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpbusds.128
+ return _mm_dpbusds_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssd_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssd.128
+ return _mm_dpwssd_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+ // CHECK-LABEL: @test_mm_dpwssds_avx_epi32
+ // CHECK: @llvm.x86.avx512.vpdpwssds.128
+ return _mm_dpwssds_avx_epi32(__S, __A, __B);
+}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-UINTR %s
// UINTR: "-target-feature" "+uintr"
// NO-UINTR: "-target-feature" "-uintr"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AVX-VNNI %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s
+// AVX-VNNI: "-target-feature" "+avxvnni"
+// NO-AVX-VNNI: "-target-feature" "-avxvnni"
// CHECK_SPR_M32: #define __AVX512VL__ 1
// CHECK_SPR_M32: #define __AVX512VNNI__ 1
// CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1
+// CHECK_SPR_M32: #define __AVXVNNI__ 1
// CHECK_SPR_M32: #define __AVX__ 1
// CHECK_SPR_M32: #define __BMI2__ 1
// CHECK_SPR_M32: #define __BMI__ 1
// CHECK_SPR_M64: #define __AVX512VL__ 1
// CHECK_SPR_M64: #define __AVX512VNNI__ 1
// CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1
+// CHECK_SPR_M64: #define __AVXVNNI__ 1
// CHECK_SPR_M64: #define __AVX__ 1
// CHECK_SPR_M64: #define __BMI2__ 1
// CHECK_SPR_M64: #define __BMI__ 1
// CHECK_ADL_M32: #define __AES__ 1
// CHECK_ADL_M32: #define __AVX2__ 1
// CHECK_ADL_M32-NOT: AVX512
+// CHECK_ADL_M32: #define __AVXVNNI__ 1
// CHECK_ADL_M32: #define __AVX__ 1
// CHECK_ADL_M32: #define __BMI2__ 1
// CHECK_ADL_M32: #define __BMI__ 1
// CHECK_ADL_M64: #define __AES__ 1
// CHECK_ADL_M64: #define __AVX2__ 1
// CHECK_ADL_M64-NOT: AVX512
+// CHECK_ADL_M64: #define __AVXVNNI__ 1
// CHECK_ADL_M64: #define __AVX__ 1
// CHECK_ADL_M64: #define __BMI2__ 1
// CHECK_ADL_M64: #define __BMI__ 1
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
// NOUINTR-NOT: #define __UINTR__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
+
+// AVXVNNI: #define __AVX2__ 1
+// AVXVNNI: #define __AVXVNNI__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNI %s
+
+// NOAVXVNNI-NOT: #define __AVXVNNI__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNINOAVX2 %s
+
+// AVXVNNINOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1
the target CPU.
* Support for ``HRESET`` instructions has been added.
* Support for ``UINTR`` instructions has been added.
+* Support for ``AVXVNNI`` instructions has been added.
Changes to the AMDGPU Target
-----------------------------
X86_FEATURE (XSAVEOPT, "xsaveopt")
X86_FEATURE (XSAVES, "xsaves")
X86_FEATURE (HRESET, "hreset")
+X86_FEATURE (AVXVNNI, "avxvnni")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
bool HasLeaf7Subleaf1 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
+ Features["avxvnni"] = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);
FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE |
FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
- FeatureWAITPKG;
+ FeatureWAITPKG | FeatureAVXVNNI;
constexpr FeatureBitset FeaturesAlderlake =
FeaturesSkylakeClient | FeatureCLDEMOTE | FeatureHRESET | FeaturePTWRITE |
- FeatureSERIALIZE | FeatureWAITPKG;
+ FeatureSERIALIZE | FeatureWAITPKG | FeatureAVXVNNI;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
+// AVXVNNI Features
+constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
+
constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
#define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
#include "llvm/Support/X86TargetParser.def"
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
+  // These instructions are only available with the {vex}, {vex2}, or {vex3}
+  // prefixes.
+ if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
+ (ForcedVEXEncoding != VEXEncoding_VEX &&
+ ForcedVEXEncoding != VEXEncoding_VEX2 &&
+ ForcedVEXEncoding != VEXEncoding_VEX3))
+ return Match_Unsupported;
+
// These instructions match ambiguously with their VEX encoded counterparts
// and appear first in the matching table. Reject them unless we're forcing
// EVEX encoding.
// NOTRACK prefix
NoTrackShift = EVEX_RCShift + 1,
- NOTRACK = 1ULL << NoTrackShift
+ NOTRACK = 1ULL << NoTrackShift,
+
+ // Force VEX encoding
+ ExplicitVEXShift = NoTrackShift + 1,
+ ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
};
/// \returns true if the instruction with given opcode is a prefix.
O << "\trep\t";
// These all require a pseudo prefix
- if (Flags & X86::IP_USE_VEX)
+ if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
O << "\t{vex}";
else if (Flags & X86::IP_USE_VEX2)
O << "\t{vex2}";
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
+def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
+ "Support AVX_VNNI encoding",
+ [FeatureAVX2]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
FeatureCLDEMOTE,
FeatureWAITPKG,
FeaturePTWRITE,
+ FeatureAVXVNNI,
FeatureTSXLDTRK,
FeatureENQCMD,
FeatureSHSTK,
!listconcat(ICXFeatures, SPRAdditionalFeatures);
// Alderlake
- list<SubtargetFeature> ADLAdditionalFeatures = [FeatureCLDEMOTE,
+ list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
+ FeatureCLDEMOTE,
FeatureHRESET,
FeaturePTWRITE,
FeatureSERIALIZE,
private:
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
+
+ const X86Subtarget *ST = nullptr;
};
} // end anonymous namespace
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX512())
+ ST = &MF.getSubtarget<X86Subtarget>();
+ if (!ST->hasAVX512())
return false;
bool Changed = false;
}
// Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
+ const X86Subtarget *ST) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
+ case X86::VPDPBUSDSZ256m:
+ case X86::VPDPBUSDSZ256r:
+ case X86::VPDPBUSDSZ128m:
+ case X86::VPDPBUSDSZ128r:
+ case X86::VPDPBUSDZ256m:
+ case X86::VPDPBUSDZ256r:
+ case X86::VPDPBUSDZ128m:
+ case X86::VPDPBUSDZ128r:
+ case X86::VPDPWSSDSZ256m:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ128m:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDZ256m:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ128m:
+ case X86::VPDPWSSDZ128r:
+ // These can only VEX convert if AVXVNNI is enabled.
+ return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
if (usesExtendedRegister(MI))
return false;
- if (!performCustomAdjustments(MI, NewOpc))
+ if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
MI.setDesc(TII->get(NewOpc));
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
{ X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
+ { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
+ { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
{ X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
+ { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
+ { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
{ X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
+ { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
+ { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
{ X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
{ X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
{ X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
+ { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
// Prevent EVEX->VEX conversion from considering this instruction.
class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
+// Force the instruction to use VEX encoding.
+class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr, Domain d = GenericDomain>
: Instruction {
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
+ bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{51-45} = CD8_Scale;
let TSFlags{52} = hasEVEX_RC;
let TSFlags{53} = hasNoTrackPrefix;
+ let TSFlags{54} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPDPWSSDYrr:
+ case X86::VPDPWSSDrr:
+ case X86::VPDPWSSDSYrr:
+ case X86::VPDPWSSDSrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
+def HasAVXVNNI : Predicate<"Subtarget->hasAVXVNNI()">;
+def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
+// AVX_VNNI
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
+ VR128:$src2, VR128:$src3)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
+ (loadv4i32 addr:$src3))))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ let isCommutable = IsCommutable in
+  def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, VR256:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
+                                       VR256:$src2, VR256:$src3)))]>,
+             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+
+  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
+                                      (loadv8i32 addr:$src3))))]>,
+             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
+defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
+defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
+defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+
+//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
+ /// Processor has AVX Vector Neural Network Instructions
+ bool HasAVXVNNI = false;
+
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
+ bool hasAVXVNNI() const { return HasAVXVNNI; }
bool hasAMXTILE() const { return HasAMXTILE; }
bool hasAMXBF16() const { return HasAMXBF16; }
bool hasAMXINT8() const { return HasAMXINT8; }
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
+; AVXVNNI: # %bb.0:
+; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
+; AVX512VNNI: # %bb.0:
+; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+; Same as stack_fold_vpdpbusds_256 but with the intrinsic's last two operands
+; swapped. The expected output reloads the spilled value into ymm2 and uses
+; the register form rather than folding the load — presumably because the
+; two multiplicand operands of vpdpbusds (u8 x s8) are not commutable, so the
+; folder cannot swap them to place the reload in the memory-operand position;
+; NOTE(review): confirm against the X86 commute tables.
+define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
--- /dev/null
+# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s
+# AVX-VNNI disassembler test, 32-bit, AT&T syntax: each hex encoding below
+# must decode to the {vex}-prefixed VPDPBUSD/VPDPBUSDS/VPDPWSSD/VPDPWSSDS
+# form on its preceding CHECK line.
+
+# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x30
+
+# CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x30
+
+# CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x30
+
+# CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x30
+
+# CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x30
+
+# CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x30
+
+# CHECK: {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x30
+
+# CHECK: {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x30
+
+# CHECK: {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+
--- /dev/null
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+# AVX-VNNI disassembler test, 32-bit, Intel syntax (asm variant 1): each hex
+# encoding below must decode to the {vex}-prefixed VPDP* form on its
+# preceding CHECK line.
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x50,0x30
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x50,0x30
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x51,0x30
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x51,0x30
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x52,0x30
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x52,0x30
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x53,0x30
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x53,0x30
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+
--- /dev/null
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+# AVX-VNNI disassembler test, 64-bit, Intel syntax (asm variant 1): each hex
+# encoding below must decode to the {vex}-prefixed VPDP* form on its
+# preceding CHECK line (includes REX-extended bases and RIP-relative forms).
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+
--- /dev/null
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
+# AVX-VNNI disassembler test, 64-bit, AT&T syntax: each hex encoding below
+# must decode to the {vex}-prefixed VPDP* form on its preceding CHECK line
+# (includes REX-extended bases and RIP-relative forms).
+
+# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+
--- /dev/null
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
+// AVX-VNNI assembler test, 32-bit, AT&T syntax: each {vex}-prefixed VPDP*
+// instruction must assemble to the VEX encoding given on its CHECK lines
+// (not the EVEX AVX512-VNNI encoding).
+
+// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+ {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+ {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
+ {vex} vpdpbusd (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
+ {vex} vpdpbusd (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+ {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+ {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
+ {vex} vpdpbusds (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
+ {vex} vpdpbusds (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
+
+// NOTE: the printed output carries the explicit {vex} prefix (these are the
+// VEX-encoded AVX-VNNI forms — see the vpdpbusd/vpdpbusds checks above), so
+// the CHECK lines pin it; without it FileCheck would only substring-match.
+// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+ {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+ {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
+ {vex} vpdpwssd (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
+ {vex} vpdpwssd (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+ {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+ {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
+ {vex} vpdpwssds (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
+ {vex} vpdpwssds (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
+
--- /dev/null
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+ {vex} vpdpbusd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+ {vex} vpdpbusd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+ {vex} vpdpbusds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+ {vex} vpdpbusds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+ {vex} vpdpwssd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+ {vex} vpdpwssd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+ {vex} vpdpwssds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+ {vex} vpdpwssds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+
--- /dev/null
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+ {vex} vpdpbusd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+ {vex} vpdpbusd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+ {vex} vpdpbusds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+ {vex} vpdpbusds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+ {vex} vpdpwssd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+ {vex} vpdpwssd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+ {vex} vpdpwssds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+ {vex} vpdpwssds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+
--- /dev/null
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
+
+// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+ {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+ {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusd (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusd (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+ {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+ {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusds (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpbusds (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+ {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+ {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssd (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssd (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+ {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+ {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssds (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+ {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+ {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
+ {vex} vpdpwssds (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+ {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+ {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
+