From 391fa371fdfbc5ea4d4a924aebb27cb77d483da4 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 19 Sep 2021 13:20:36 -0700
Subject: [PATCH] [X86] Remove Commutable flag from mpsadbw intrinsics.

Unlike psadbw, mpsadbw is not commutable: its two source operands play
different roles in how it operates on 4-byte blocks, so swapping them
changes the result. We had already marked it as not commutable for
MachineIR, but it was still marked commutable in the TableGen'd isel
patterns.

Fixes PR51908.
---
 llvm/include/llvm/IR/IntrinsicsX86.td         |  4 +--
 llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll  | 35 +++++++++++++++++++--------
 llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll | 22 +++++++++++------
 3 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 2601f96..1b941e8 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -792,7 +792,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_mpsadbw          : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
-                    [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
+                    [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 // Test instruction with bitwise comparison.
@@ -1779,7 +1779,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_v32i8_ty], [IntrNoMem]>;
   def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
       Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
-                                   llvm_i8_ty], [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
+                                   llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 29b32e7..ca67da6 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -731,18 +731,33 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
 }
 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
 
-; FIXME: We shouldn't commute this operation to fold the load.
+; We shouldn't commute this operation to fold the load.
 define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_mpsadbw_load_op0:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vmpsadbw $7, (%eax), %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x42,0x00,0x07]
-; X86-NEXT:    retl # encoding: [0xc3]
+; X86-AVX-LABEL: test_x86_avx2_mpsadbw_load_op0:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vmovdqa (%eax), %ymm1 # encoding: [0xc5,0xfd,0x6f,0x08]
+; X86-AVX-NEXT:    vmpsadbw $7, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x42,0xc0,0x07]
+; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_x86_avx2_mpsadbw_load_op0:
-; X64:       # %bb.0:
-; X64-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x42,0x07,0x07]
-; X64-NEXT:    retq # encoding: [0xc3]
+; X86-AVX512VL-LABEL: test_x86_avx2_mpsadbw_load_op0:
+; X86-AVX512VL:       # %bb.0:
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vmovdqa (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x08]
+; X86-AVX512VL-NEXT:    vmpsadbw $7, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x42,0xc0,0x07]
+; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_mpsadbw_load_op0:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqa (%rdi), %ymm1 # encoding: [0xc5,0xfd,0x6f,0x0f]
+; X64-AVX-NEXT:    vmpsadbw $7, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x42,0xc0,0x07]
+; X64-AVX-NEXT:    retq # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_mpsadbw_load_op0:
+; X64-AVX512VL:       # %bb.0:
+; X64-AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0f]
+; X64-AVX512VL-NEXT:    vmpsadbw $7, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x42,0xc0,0x07]
+; X64-AVX512VL-NEXT:    retq # encoding: [0xc3]
   %a0 = load <32 x i8>, <32 x i8>* %ptr
   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 66e7836..3f73922 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -116,39 +116,47 @@ define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
 }
 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
-; FIXME: We shouldn't commute this operation to fold the load.
+; We shouldn't commute this operation to fold the load.
 define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) {
 ; X86-SSE-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    mpsadbw $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x42,0x00,0x07]
+; X86-SSE-NEXT:    movdqa (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x08]
+; X86-SSE-NEXT:    mpsadbw $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x42,0xc8,0x07]
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmpsadbw $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0x00,0x07]
+; X86-AVX1-NEXT:    vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08]
+; X86-AVX1-NEXT:    vmpsadbw $7, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x42,0xc0,0x07]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmpsadbw $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0x00,0x07]
+; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
+; X86-AVX512-NEXT:    vmpsadbw $7, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x42,0xc0,0x07]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    mpsadbw $7, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x42,0x07,0x07]
+; X64-SSE-NEXT:    movdqa (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x0f]
+; X64-SSE-NEXT:    mpsadbw $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x42,0xc8,0x07]
+; X64-SSE-NEXT:    movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0x07,0x07]
+; X64-AVX1-NEXT:    vmovdqa (%rdi), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x0f]
+; X64-AVX1-NEXT:    vmpsadbw $7, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x42,0xc0,0x07]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: test_x86_sse41_mpsadbw_load_op0:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0x07,0x07]
+; X64-AVX512-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
+; X64-AVX512-NEXT:    vmpsadbw $7, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x42,0xc0,0x07]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %a0 = load <16 x i8>, <16 x i8>* %ptr
   %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
-- 
2.7.4
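Note (illustration, not part of the patch): the small C program below is a
minimal sketch of why mpsadbw must not be treated as commutative. It uses the
standard SSE4.1 intrinsic _mm_mpsadbw_epu8 from <smmintrin.h>; the file name
and build command are only examples, and it assumes an SSE4.1-capable x86 host
(build with, say, cc -msse4.1 -o mpsadbw_demo mpsadbw_demo.c).

/* mpsadbw_demo.c: show that swapping the operands of MPSADBW changes the
 * result, unlike PSADBW.  Illustrative sketch only. */
#include <smmintrin.h>  /* SSE4.1: _mm_mpsadbw_epu8 */
#include <stdio.h>
#include <string.h>

int main(void) {
  /* Two arbitrary, different byte vectors. */
  __m128i a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                            8, 9, 10, 11, 12, 13, 14, 15);
  __m128i b = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                            7, 6, 5, 4, 3, 2, 1, 0);

  /* MPSADBW slides 4-byte groups taken from the first operand against a
   * single 4-byte block selected from the second operand by the immediate,
   * so the two operands play different roles. */
  __m128i ab = _mm_mpsadbw_epu8(a, b, 7);
  __m128i ba = _mm_mpsadbw_epu8(b, a, 7);

  printf("swapped operands give a %s result\n",
         memcmp(&ab, &ba, sizeof(ab)) ? "different" : "identical");
  return 0;
}

With these inputs the two calls produce different vectors, which is why the
Commutative flag has to come off the intrinsic definitions above and why the
tests now load the first operand into a register instead of folding it.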