[X86] Improve combineCastedMaskArithmetic to fold (bitcast (vXi1 (and/or/xor X, C))) where C is a constant build vector

author Craig Topper <craig.topper@intel.com>

Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)

committer Craig Topper <craig.topper@intel.com>

Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)
author Craig Topper <craig.topper@intel.com>
Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)
committer Craig Topper <craig.topper@intel.com>
Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 693da1a..22d918f 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30494,6 +30494,24 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
    return DAG.getZExtOrTrunc(V, DL, VT);
  }
  
+// Convert a vXi1 constant build vector to the same width scalar integer.
+static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
+  EVT SrcVT = Op.getValueType();
+  assert(SrcVT.getVectorElementType() == MVT::i1 &&
+         "Expected a vXi1 vector");
+  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+         "Expected a constant build vector");
+
+  APInt Imm(SrcVT.getVectorNumElements(), 0);
+  for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
+    SDValue In = Op.getOperand(Idx);
+    if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
+      Imm.setBit(Idx);
+  }
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
+  return DAG.getConstant(Imm, SDLoc(Op), IntVT);
+}
+
  static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const X86Subtarget &Subtarget) {
@@ -30539,6 +30557,14 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
      return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
                         DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
  
+  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
+  // Most of these have to move a constant from the scalar domain anyway.
+  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
+    RHS = combinevXi1ConstantToInteger(RHS, DAG);
+    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+                       DAG.getBitcast(DstVT, LHS), RHS);
+  }
+
    return SDValue();
  }
  
@@ -30632,13 +30658,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
    if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
        SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
-    APInt Imm(SrcVT.getVectorNumElements(), 0);
-    for (unsigned Idx = 0, e = N0.getNumOperands(); Idx < e; ++Idx) {
-      SDValue In = N0.getOperand(Idx);
-      if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
-        Imm.setBit(Idx);
-    }
-    return DAG.getConstant(Imm, SDLoc(N), VT);
+    return combinevXi1ConstantToInteger(N0, DAG);
    }
  
    // Try to remove bitcasts from input and output of mask arithmetic to
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

index f19f062..f684618 100644 (file)
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -3754,14 +3754,9 @@ declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone
  define i16 @test_kandn(i16 %a0, i16 %a1) {
  ; CHECK-LABEL: test_kandn:
  ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    kandnw %k1, %k0, %k0
-; CHECK-NEXT:    knotw %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    orl $-9, %edi
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    movl %edi, %eax
  ; CHECK-NEXT:    retq
    %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
    %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll

index f035b72..cce2f43 100644 (file)
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -513,11 +513,8 @@ define void @test7(<8 x i1> %mask)  {
  ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
  ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
  ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT:    movb $85, %al
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
  ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    orb $85, %al
  ; KNL-NEXT:    vzeroupper
  ; KNL-NEXT:    retq
  ;
@@ -525,20 +522,16 @@ define void @test7(<8 x i1> %mask)  {
  ; SKX:       ## %bb.0: ## %allocas
  ; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
  ; SKX-NEXT:    vpmovw2m %xmm0, %k0
-; SKX-NEXT:    movb $85, %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    kortestb %k1, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    orb $85, %al
  ; SKX-NEXT:    retq
  ;
  ; AVX512BW-LABEL: test7:
  ; AVX512BW:       ## %bb.0: ## %allocas
  ; AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
  ; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512BW-NEXT:    movb $85, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    korw %k1, %k0, %k0
  ; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    testb %al, %al
+; AVX512BW-NEXT:    orb $85, %al
  ; AVX512BW-NEXT:    vzeroupper
  ; AVX512BW-NEXT:    retq
  ;
@@ -547,9 +540,8 @@ define void @test7(<8 x i1> %mask)  {
  ; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
  ; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
  ; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512DQ-NEXT:    movb $85, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    kortestb %k1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    orb $85, %al
  ; AVX512DQ-NEXT:    vzeroupper
  ; AVX512DQ-NEXT:    retq
  allocas:
diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll

index 99798c7..5cb9dbb 100755 (executable)
--- a/llvm/test/CodeGen/X86/avx512-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-schedule.ll
@@ -7020,18 +7020,16 @@ define void @vcmp_test7(<8 x i1> %mask)  {
  ; GENERIC:       # %bb.0: # %allocas
  ; GENERIC-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
  ; GENERIC-NEXT:    vpmovw2m %xmm0, %k0 # sched: [1:0.33]
-; GENERIC-NEXT:    movb $85, %al # sched: [1:0.33]
-; GENERIC-NEXT:    kmovd %eax, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    kortestb %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    orb $85, %al # sched: [1:0.33]
  ; GENERIC-NEXT:    retq # sched: [1:1.00]
  ;
  ; SKX-LABEL: vcmp_test7:
  ; SKX:       # %bb.0: # %allocas
  ; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
  ; SKX-NEXT:    vpmovw2m %xmm0, %k0 # sched: [1:1.00]
-; SKX-NEXT:    movb $85, %al # sched: [1:0.25]
-; SKX-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
-; SKX-NEXT:    kortestb %k1, %k0 # sched: [3:1.00]
+; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT:    orb $85, %al # sched: [1:0.25]
  ; SKX-NEXT:    retq # sched: [7:1.00]
  allocas:
    %a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
author	Craig Topper <craig.topper@intel.com>
	Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Thu, 8 Feb 2018 22:26:39 +0000 (22:26 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-mask-op.ll		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-schedule.ll		patch \| blob \| history