From: Xiang1 Zhang <xiang1.zhang@intel.com>
Date: Fri, 31 Mar 2023 08:56:06 +0000 (+0800)
Subject: [X86] Support AMX Complex instructions
X-Git-Tag: upstream/17.0.6~12797
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=038b7e6b761c2bebb30440cdd39252a0fa74ac3f;p=platform%2Fupstream%2Fllvm.git

[X86] Support AMX Complex instructions

Reviewed By: Wang Pengfei

Differential Revision: https://reviews.llvm.org/D147420
---

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 53a0541..100be1b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -332,6 +332,9 @@ AMDGPU Support
 X86 Support
 ^^^^^^^^^^^
 
+- Add ISA of ``AMX-COMPLEX`` which supports ``tcmmimfp16ps`` and
+  ``tcmmrlfp16ps``.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 4b9e7d2..e5c1fe8 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -117,6 +117,8 @@ TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-
 TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16")
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
+TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -134,6 +136,9 @@ TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16")
 TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
 
+TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+
 TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
 TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
 TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0cc7052..831f8dd 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4656,6 +4656,8 @@ def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
 def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
 def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
 def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
+def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>;
+def mno_amx_complex : Flag<["-"], "mno-amx-complex">, Group<m_x86_Features_Group>;
 def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group<m_x86_Features_Group>;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group<m_x86_Features_Group>;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index ac04bf9..0cffc76 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -335,6 +335,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXINT8 = true;
     } else if (Feature == "+amx-tile") {
       HasAMXTILE = true;
+    } else if (Feature == "+amx-complex") {
+      HasAMXCOMPLEX = true;
     } else if (Feature == "+cmpccxadd") {
       HasCMPCCXADD = true;
     } else if (Feature == "+raoint") {
@@ -799,6 +801,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_BF16__");
   if (HasAMXFP16)
     Builder.defineMacro("__AMX_FP16__");
+  if (HasAMXCOMPLEX)
+    Builder.defineMacro("__AMXCOMPLEX__");
   if (HasCMPCCXADD)
     Builder.defineMacro("__CMPCCXADD__");
   if (HasRAOINT)
@@ -912,6 +916,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("adx", true)
       .Case("aes", true)
       .Case("amx-bf16", true)
+      .Case("amx-complex", true)
       .Case("amx-fp16", true)
       .Case("amx-int8", true)
       .Case("amx-tile", true)
@@ -1013,6 +1018,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("adx", HasADX)
       .Case("aes", HasAES)
       .Case("amx-bf16", HasAMXBF16)
+      .Case("amx-complex", HasAMXCOMPLEX)
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-int8", HasAMXINT8)
       .Case("amx-tile", HasAMXTILE)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 816bf13..9a563db 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -154,6 +154,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXTILE = false;
   bool HasAMXINT8 = false;
   bool HasAMXBF16 = false;
+  bool HasAMXCOMPLEX = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
   bool HasUINTR = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index bb9a11ea..52f0e23 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -117,6 +117,7 @@ set(x86_files
 # Intrinsics
   adxintrin.h
   ammintrin.h
+  amxcomplexintrin.h
   amxfp16intrin.h
   amxintrin.h
   avx2intrin.h
diff --git a/clang/lib/Headers/amxcomplexintrin.h b/clang/lib/Headers/amxcomplexintrin.h
new file mode 100644
index 0000000..84ef972
--- /dev/null
+++ b/clang/lib/Headers/amxcomplexintrin.h
@@ -0,0 +1,169 @@
+/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXINTRIN_H
+#define __AMX_COMPLEXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The imaginary part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the real part
+///    of the \a a element is multiplied with the imaginary part of the
+///    corresponding \a b elements. The two accumulated results are added, and
+///    then accumulated into the corresponding row and column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The real part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the negated
+///    imaginary part of the \a a element is multiplied with the imaginary
+///    part of the corresponding \a b elements. The two accumulated results
+///    are added, and then accumulated into the corresponding row and column
+///    of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXINTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 0d2e8be..8e109af 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -539,6 +539,11 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AMXCOMPLEX__)
+#include <amxcomplexintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index c8b4251..af67552 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5237,6 +5237,8 @@ bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_tdpbuud:
   case X86::BI__builtin_ia32_tdpbf16ps:
   case X86::BI__builtin_ia32_tdpfp16ps:
+  case X86::BI__builtin_ia32_tcmmimfp16ps:
+  case X86::BI__builtin_ia32_tcmmrlfp16ps:
     return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
   }
 }
diff --git a/clang/test/CodeGen/X86/amx_complex_api.c b/clang/test/CodeGen/X86/amx_complex_api.c
new file mode 100644
index 0000000..113c7f6
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_complex_api.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown  -target-feature +avx512f  -target-feature +amx-bf16  \
+// RUN: -target-feature +amx-complex \
+// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+char buf[1024];
+#define STRIDE 32
+
+char buf2[1024];
+
+void test_tile_cmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_cmmimfp16ps
+  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+  //CHECK-DAG: call x86_amx @llvm.x86.tcmmimfp16ps.internal
+  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  __tile_cmmimfp16ps(&c, a, b);
+}
+
+void test_tile_cmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_cmmrlfp16ps
+  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+  //CHECK-DAG: call x86_amx @llvm.x86.tcmmrlfp16ps.internal
+  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  __tile_cmmrlfp16ps(&c, a, b);
+}
diff --git a/clang/test/CodeGen/X86/amxcomplex-builtins.c b/clang/test/CodeGen/X86/amxcomplex-builtins.c
new file mode 100644
index 0000000..a5478b0
--- /dev/null
+++ b/clang/test/CodeGen/X86/amxcomplex-builtins.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-complex \
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+void test_tile_cmmimfp16ps(void) {
+  // CHECK-LABEL: @test_tile_cmmimfp16ps
+  // CHECK: call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3)
+  _tile_cmmimfp16ps(1, 2, 3);
+}
+
+void test_tile_cmmrlfp16ps(void) {
+  // CHECK-LABEL: @test_tile_cmmrlfp16ps
+  // CHECK: call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3)
+  _tile_cmmrlfp16ps(1, 2, 3);
+}
diff --git a/clang/test/CodeGen/X86/amxcomplex-errors.c b/clang/test/CodeGen/X86/amxcomplex-errors.c
new file mode 100644
index 0000000..3dd5ea5
--- /dev/null
+++ b/clang/test/CodeGen/X86/amxcomplex-errors.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-complex -emit-llvm -fsyntax-only -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+void test_tile_cmmimfp16ps() {
+  _tile_cmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+  _tile_cmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+  _tile_cmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
+  _tile_cmmimfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
+}
+
+void test_tile_cmmrlfp16ps() {
+  _tile_cmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+  _tile_cmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+  _tile_cmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
+  _tile_cmmrlfp16ps(1, 1, 3);  // expected-error {{tile arguments must refer to different tiles}}
+}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index d5e40ff..6819c90 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -297,6 +297,13 @@
 // AMX-FP16: "-target-feature" "+amx-fp16"
 // NO-AMX-FP16: "-target-feature" "-amx-fp16"
 
+// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-complex %s \
+// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-COMPLEX %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-complex %s \
+// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-COMPLEX %s
+// AMX-COMPLEX: "-target-feature" "+amx-complex"
+// NO-AMX-COMPLEX: "-target-feature" "-amx-complex"
+
 // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s
 // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s
 // HRESET: "-target-feature" "+hreset"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 3265824..5bf38cc 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -559,6 +559,18 @@
 // NO-AMX-FP16-NOT: #define __AMX_FP16__ 1
 // NO-AMX-FP16-NOT: #define __AMX_TILE__ 1
 
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -x c \
+// RUN: -E -dM -o - %s | FileCheck  -check-prefix=AMX-COMPLEX %s
+
+// AMX-COMPLEX: #define __AMXCOMPLEX__ 1
+
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-complex -x c \
+// RUN: -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-COMPLEX %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -mno-amx-tile \
+// RUN: -x c -E -dM -o - %s | FileCheck  -check-prefix=NO-AMX-COMPLEX %s
+
+// NO-AMX-COMPLEX-NOT: #define __AMXCOMPLEX__ 1
+
 // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
 
 // AVXVNNI: #define __AVX2__ 1
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 239f158..d8d8bc5 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5352,6 +5352,16 @@ let TargetPrefix = "x86" in {
               Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                         [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                          ImmArg<ArgIndex<2>>]>;
+  // AMX-COMPLEX
+  def int_x86_tcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tcmmimfp16ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+                         ImmArg<ArgIndex<2>>]>;
+  def int_x86_tcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+                         ImmArg<ArgIndex<2>>]>;
+
   // AMX - internal intrinsics
   def int_x86_ldtilecfg_internal :
               ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
@@ -5414,6 +5424,19 @@ let TargetPrefix = "x86" in {
       DefaultAttrsIntrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>;
   def int_x86_cast_tile_to_vector:
       DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>;
+
+  def int_x86_tcmmimfp16ps_internal :
+              ClangBuiltin<"__builtin_ia32_tcmmimfp16ps_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+                         llvm_x86amx_ty, llvm_x86amx_ty,
+                         llvm_x86amx_ty], []>;
+  def int_x86_tcmmrlfp16ps_internal :
+              ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+                         llvm_x86amx_ty, llvm_x86amx_ty,
+                         llvm_x86amx_ty], []>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index feec0b8..7ed5941 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -167,6 +167,7 @@ X86_FEATURE       (3DNOWA,          "3dnowa")
 X86_FEATURE       (64BIT,           "64bit")
 X86_FEATURE       (ADX,             "adx")
 X86_FEATURE       (AMX_BF16,        "amx-bf16")
+X86_FEATURE       (AMX_COMPLEX,     "amx-complex")
 X86_FEATURE       (AMX_INT8,        "amx-int8")
 X86_FEATURE       (AMX_TILE,        "amx-tile")
 X86_FEATURE       (CLDEMOTE,        "cldemote")
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 0f0ee6c..7f6399c 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -267,6 +267,9 @@ def FeatureAMXBF16     : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true",
 def FeatureAMXFP16     : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true",
                                       "Support AMX amx-fp16 instructions",
                                       [FeatureAMXTILE]>;
+def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
+                                         "Support AMX-COMPLEX instructions",
+                                         [FeatureAMXTILE]>;
 def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
                                         "Support CMPCCXADD instructions">;
 def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index f50602e..21ad08f 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -562,6 +562,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     MI.setDesc(TII->get(Opc));
     return true;
   }
+  case X86::PTCMMIMFP16PSV:
+  case X86::PTCMMRLFP16PSV:
   case X86::PTDPBSSDV:
   case X86::PTDPBSUDV:
   case X86::PTDPBUSDV:
@@ -573,6 +575,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       MI.removeOperand(i);
     unsigned Opc;
     switch (Opcode) {
+    case X86::PTCMMIMFP16PSV:  Opc = X86::TCMMIMFP16PS; break;
+    case X86::PTCMMRLFP16PSV:  Opc = X86::TCMMRLFP16PS; break;
     case X86::PTDPBSSDV:   Opc = X86::TDPBSSD; break;
     case X86::PTDPBSUDV:   Opc = X86::TDPBSUD; break;
     case X86::PTDPBUSDV:   Opc = X86::TDPBUSD; break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d8327f..c7eac48 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38235,6 +38235,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
+  case X86::PTCMMIMFP16PS:
+  case X86::PTCMMRLFP16PS: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unexpected instruction!");
+    case X86::PTCMMIMFP16PS:     Opc = X86::TCMMIMFP16PS;     break;
+    case X86::PTCMMRLFP16PS:     Opc = X86::TCMMRLFP16PS;     break;
+    }
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index f47a068..6948deb 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -215,3 +215,45 @@ let Predicates = [HasAMXFP16, In64BitMode] in {
     }
   }
 } // HasAMXTILE, HasAMXFP16
+
+let Predicates = [HasAMXCOMPLEX, In64BitMode] in {
+  let SchedRW = [WriteSystem] in {
+    let Constraints = "$src1 = $dst" in {
+      def TCMMIMFP16PS   : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
+                            (ins TILE:$src1, TILE:$src2, TILE:$src3),
+                            "tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+                            []>, T8PD, VEX_4V;
+      def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
+                            (ins TILE:$src1, TILE:$src2, TILE:$src3),
+                            "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+                            []>, VEX_4V, VEX_WIG, T8PS;
+
+    } // Constraints = "$src1 = $dst"
+
+    let Constraints = "$src4 = $dst" in {
+      def PTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                                  GR16:$src2, GR16:$src3, TILE:$src4,
+                                  TILE:$src5, TILE:$src6),
+                                  [(set TILE: $dst,
+                                  (int_x86_tcmmimfp16ps_internal GR16:$src1, GR16:$src2,
+                                   GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+      def PTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                                  GR16:$src2, GR16:$src3, TILE:$src4,
+                                  TILE:$src5, TILE:$src6),
+                                  [(set TILE: $dst,
+                                  (int_x86_tcmmrlfp16ps_internal GR16:$src1, GR16:$src2,
+                                   GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+    }
+
+    let usesCustomInserter = 1 in {
+      def PTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1,
+                                u8imm:$src2, u8imm:$src3),
+                                [(int_x86_tcmmimfp16ps timm:$src1,
+                                  timm:$src2, timm:$src3)]>;
+      def PTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1,
+                                u8imm:$src2, u8imm:$src3),
+                                [(int_x86_tcmmrlfp16ps timm:$src1,
+                                  timm:$src2, timm:$src3)]>;
+    }
+  } // SchedRW = [WriteSystem]
+}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 1c7cb1d..9de9941 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -1010,6 +1010,7 @@ def HasTSXLDTRK  : Predicate<"Subtarget->hasTSXLDTRK()">;
 def HasAMXTILE   : Predicate<"Subtarget->hasAMXTILE()">;
 def HasAMXBF16   : Predicate<"Subtarget->hasAMXBF16()">;
 def HasAMXINT8   : Predicate<"Subtarget->hasAMXINT8()">;
+def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
 def HasUINTR     : Predicate<"Subtarget->hasUINTR()">;
 def HasCRC32     : Predicate<"Subtarget->hasCRC32()">;
 
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 325bc3a..e861420 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -129,6 +129,8 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
   }
   // a * b + c
   // The shape depends on which operand.
+  case Intrinsic::x86_tcmmimfp16ps_internal:
+  case Intrinsic::x86_tcmmrlfp16ps_internal:
   case Intrinsic::x86_tdpbssd_internal:
   case Intrinsic::x86_tdpbsud_internal:
   case Intrinsic::x86_tdpbusd_internal:
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 0edc0a4..9898ce2 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1003,6 +1003,8 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PTILEZEROV:
   case X86::PTDPBF16PSV:
   case X86::PTDPFP16PSV:
+  case X86::PTCMMIMFP16PSV:
+  case X86::PTCMMRLFP16PSV:
     MachineOperand &MO1 = MI->getOperand(1);
     MachineOperand &MO2 = MI->getOperand(2);
     ShapeT Shape(&MO1, &MO2, MRI);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 20770a4..8da45d9 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -606,6 +606,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_TILE = {};
 constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 
 constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll
new file mode 100644
index 0000000..924572a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+; RUN: -mattr=+amx-complex \
+; RUN: -verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
+; CHECK-NEXT:    tilezero %tmm1
+; CHECK-NEXT:    tilezero %tmm2
+; CHECK-NEXT:    tcmmimfp16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tcmmrlfp16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rdi,%rdx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+
+  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+
+  %c1 = call x86_amx @llvm.x86.tcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+  %c2 = call x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
+
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2)
+  ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
+
+declare x86_amx @llvm.x86.tcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll
new file mode 100644
index 0000000..7774092
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-complex --show-mc-encoding | FileCheck %s
+
+define void @test_tcmmimfp16ps() {
+; CHECK-LABEL: test_tcmmimfp16ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    tcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x61,0x6c,0xca]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3)
+  ret void
+}
+declare void @llvm.x86.tcmmimfp16ps(i8 %A, i8 %B, i8 %C)
+
+define void @test_tcmmrlfp16ps() {
+; CHECK-LABEL: test_tcmmrlfp16ps:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    tcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6c,0xca]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3)
+  ret void
+}
+declare void @llvm.x86.tcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt
new file mode 100644
index 0000000..5a0cb87
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding --disassemble < %s  | FileCheck %s
+
+# CHECK:      tcmmimfp16ps %tmm4, %tmm5, %tmm6
+0xc4,0xe2,0x59,0x6c,0xf5
+
+# CHECK:      tcmmimfp16ps %tmm1, %tmm2, %tmm3
+0xc4,0xe2,0x71,0x6c,0xda
+
+# CHECK:      tcmmrlfp16ps %tmm4, %tmm5, %tmm6
+0xc4,0xe2,0x58,0x6c,0xf5
+
+# CHECK:      tcmmrlfp16ps %tmm1, %tmm2, %tmm3
+0xc4,0xe2,0x70,0x6c,0xda
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt
new file mode 100644
index 0000000..556fce7
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+
+# CHECK:      tcmmimfp16ps tmm6, tmm5, tmm4
+0xc4,0xe2,0x59,0x6c,0xf5
+
+# CHECK:      tcmmimfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x71,0x6c,0xda
+
+# CHECK:      tcmmrlfp16ps tmm6, tmm5, tmm4
+0xc4,0xe2,0x58,0x6c,0xf5
+
+# CHECK:      tcmmrlfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x70,0x6c,0xda
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s
new file mode 100644
index 0000000..fd47208
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s  | FileCheck %s
+
+// CHECK:      tcmmimfp16ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5]
+               tcmmimfp16ps %tmm4, %tmm5, %tmm6
+
+// CHECK:      tcmmimfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda]
+               tcmmimfp16ps %tmm1, %tmm2, %tmm3
+
+// CHECK:      tcmmrlfp16ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5]
+               tcmmrlfp16ps %tmm4, %tmm5, %tmm6
+
+// CHECK:      tcmmrlfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda]
+               tcmmrlfp16ps %tmm1, %tmm2, %tmm3
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s
new file mode 100644
index 0000000..18dce99
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK:      tcmmimfp16ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5]
+               tcmmimfp16ps tmm6, tmm5, tmm4
+
+// CHECK:      tcmmimfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda]
+               tcmmimfp16ps tmm3, tmm2, tmm1
+
+// CHECK:      tcmmrlfp16ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5]
+               tcmmrlfp16ps tmm6, tmm5, tmm4
+
+// CHECK:      tcmmrlfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda]
+               tcmmrlfp16ps tmm3, tmm2, tmm1