[X86] Improve optimizeCompareInstr for signed comparisons after BMI/TBM instructions

author Craig Topper <craig.topper@sifive.com>

Wed, 31 Mar 2021 16:41:02 +0000 (09:41 -0700)

committer Craig Topper <craig.topper@sifive.com>

Wed, 31 Mar 2021 16:45:29 +0000 (09:45 -0700)
author Craig Topper <craig.topper@sifive.com>
Wed, 31 Mar 2021 16:41:02 +0000 (09:41 -0700)
committer Craig Topper <craig.topper@sifive.com>
Wed, 31 Mar 2021 16:45:29 +0000 (09:45 -0700)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp

index 9c2970c..5d34912 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3972,8 +3972,10 @@ inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
  
  /// Check whether the definition can be converted
  /// to remove a comparison against zero.
-inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
+inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
+                                    bool &ClearsOverflowFlag) {
    NoSignFlag = false;
+  ClearsOverflowFlag = false;
  
    switch (MI.getOpcode()) {
    default: return false;
@@ -4039,12 +4041,6 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
    case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
    case X86::ANDN32rr:  case X86::ANDN32rm:
    case X86::ANDN64rr:  case X86::ANDN64rm:
-  case X86::BLSI32rr:  case X86::BLSI32rm:
-  case X86::BLSI64rr:  case X86::BLSI64rm:
-  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
-  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
-  case X86::BLSR32rr:  case X86::BLSR32rm:
-  case X86::BLSR64rr:  case X86::BLSR64rm:
    case X86::BZHI32rr:  case X86::BZHI32rm:
    case X86::BZHI64rr:  case X86::BZHI64rm:
    case X86::LZCNT16rr: case X86::LZCNT16rm:
@@ -4056,6 +4052,13 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
    case X86::TZCNT16rr: case X86::TZCNT16rm:
    case X86::TZCNT32rr: case X86::TZCNT32rm:
    case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return true;
+  case X86::BLSI32rr:    case X86::BLSI32rm:
+  case X86::BLSI64rr:    case X86::BLSI64rm:
+  case X86::BLSMSK32rr:  case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:  case X86::BLSMSK64rm:
+  case X86::BLSR32rr:    case X86::BLSR32rm:
+  case X86::BLSR64rr:    case X86::BLSR64rm:
    case X86::BLCFILL32rr: case X86::BLCFILL32rm:
    case X86::BLCFILL64rr: case X86::BLCFILL64rm:
    case X86::BLCI32rr:    case X86::BLCI32rm:
@@ -4074,12 +4077,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
    case X86::T1MSKC64rr:  case X86::T1MSKC64rm:
    case X86::TZMSK32rr:   case X86::TZMSK32rm:
    case X86::TZMSK64rr:   case X86::TZMSK64rm:
+    // These instructions clear the overflow flag just like TEST.
+    // FIXME: These are not the only instructions in this switch that clear the
+    // overflow flag.
+    ClearsOverflowFlag = true;
      return true;
    case X86::BEXTR32rr:   case X86::BEXTR64rr:
    case X86::BEXTR32rm:   case X86::BEXTR64rm:
    case X86::BEXTRI32ri:  case X86::BEXTRI32mi:
    case X86::BEXTRI64ri:  case X86::BEXTRI64mi:
-    // BEXTR doesn't update the sign flag so we can't use it.
+    // BEXTR doesn't update the sign flag so we can't use it. It does clear
+    // the overflow flag, but that's not useful without the sign flag.
      NoSignFlag = true;
      return true;
    }
@@ -4199,8 +4207,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
    // right way.
    bool ShouldUpdateCC = false;
    bool NoSignFlag = false;
+  bool ClearsOverflowFlag = false;
    X86::CondCode NewCC = X86::COND_INVALID;
-  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) {
+  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
      // Scan forward from the use until we hit the use we're looking for or the
      // compare instruction.
      for (MachineBasicBlock::iterator J = MI;; ++J) {
@@ -4312,11 +4321,15 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
        default: break;
        case X86::COND_A: case X86::COND_AE:
        case X86::COND_B: case X86::COND_BE:
+        // CF is used, we can't perform this optimization.
+        return false;
        case X86::COND_G: case X86::COND_GE:
        case X86::COND_L: case X86::COND_LE:
        case X86::COND_O: case X86::COND_NO:
-        // CF and OF are used, we can't perform this optimization.
-        return false;
+        // If OF is used, the instruction needs to clear it like CmpZero does.
+        if (!ClearsOverflowFlag)
+          return false;
+        break;
        case X86::COND_S: case X86::COND_NS:
          // If SF is used, but the instruction doesn't update the SF, then we
          // can't do the optimization.
diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll

index 641b03e..1522d27 100644 (file)
--- a/llvm/test/CodeGen/X86/bmi.ll
+++ b/llvm/test/CodeGen/X86/bmi.ll
@@ -539,11 +539,12 @@ define i32 @blsi32_z2(i32 %a, i32 %b, i32 %c) nounwind {
    ret i32 %t3
  }
  
+; Inspired by PR48768, but using cmovcc instead of setcc. There should be
+; no test instruction.
  define i32 @blsi32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X86-LABEL: blsi32_sle:
  ; X86:       # %bb.0:
  ; X86-NEXT:    blsil {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
  ; X86-NEXT:    cmovlel %eax, %ecx
@@ -554,7 +555,6 @@ define i32 @blsi32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movl %esi, %eax
  ; X64-NEXT:    blsil %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
  ; X64-NEXT:    cmovgl %edx, %eax
  ; X64-NEXT:    retq
    %t0 = sub i32 0, %a
@@ -685,7 +685,6 @@ define i64 @blsi64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movq %rsi, %rax
  ; X64-NEXT:    blsiq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
  ; X64-NEXT:    cmovgq %rdx, %rax
  ; X64-NEXT:    retq
    %t0 = sub i64 0, %a
@@ -776,7 +775,6 @@ define i32 @blsmsk32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X86-LABEL: blsmsk32_sle:
  ; X86:       # %bb.0:
  ; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
  ; X86-NEXT:    cmovlel %eax, %ecx
@@ -787,7 +785,6 @@ define i32 @blsmsk32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movl %esi, %eax
  ; X64-NEXT:    blsmskl %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
  ; X64-NEXT:    cmovgl %edx, %eax
  ; X64-NEXT:    retq
    %t0 = sub i32 %a, 1
@@ -918,7 +915,6 @@ define i64 @blsmsk64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movq %rsi, %rax
  ; X64-NEXT:    blsmskq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
  ; X64-NEXT:    cmovgq %rdx, %rax
  ; X64-NEXT:    retq
    %t0 = sub i64 %a, 1
@@ -1009,7 +1005,6 @@ define i32 @blsr32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X86-LABEL: blsr32_sle:
  ; X86:       # %bb.0:
  ; X86-NEXT:    blsrl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
  ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
  ; X86-NEXT:    cmovlel %eax, %ecx
@@ -1020,7 +1015,6 @@ define i32 @blsr32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movl %esi, %eax
  ; X64-NEXT:    blsrl %edi, %ecx
-; X64-NEXT:    testl %ecx, %ecx
  ; X64-NEXT:    cmovgl %edx, %eax
  ; X64-NEXT:    retq
    %t0 = sub i32 %a, 1
@@ -1151,7 +1145,6 @@ define i64 @blsr64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; X64:       # %bb.0:
  ; X64-NEXT:    movq %rsi, %rax
  ; X64-NEXT:    blsrq %rdi, %rcx
-; X64-NEXT:    testq %rcx, %rcx
  ; X64-NEXT:    cmovgq %rdx, %rax
  ; X64-NEXT:    retq
    %t0 = sub i64 %a, 1
diff --git a/llvm/test/CodeGen/X86/tbm_patterns.ll b/llvm/test/CodeGen/X86/tbm_patterns.ll

index 5f5306a..30e9a26 100644 (file)
--- a/llvm/test/CodeGen/X86/tbm_patterns.ll
+++ b/llvm/test/CodeGen/X86/tbm_patterns.ll
@@ -193,7 +193,6 @@ define i32 @test_x86_tbm_blcfill_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blcfilll %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = add i32 %a, 1
@@ -245,7 +244,6 @@ define i64 @test_x86_tbm_blcfill_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blcfillq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = add i64 %a, 1
@@ -300,7 +298,6 @@ define i32 @test_x86_tbm_blci_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blcil %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = add i32 1, %a
@@ -356,7 +353,6 @@ define i64 @test_x86_tbm_blci_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blciq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = add i64 1, %a
@@ -432,7 +428,6 @@ define i32 @test_x86_tbm_blcic_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blcicl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = xor i32 %a, -1
@@ -488,7 +483,6 @@ define i64 @test_x86_tbm_blcic_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blcicq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = xor i64 %a, -1
@@ -541,7 +535,6 @@ define i32 @test_x86_tbm_blcmsk_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blcmskl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = add i32 %a, 1
@@ -593,7 +586,6 @@ define i64 @test_x86_tbm_blcmsk_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blcmskq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = add i64 %a, 1
@@ -645,7 +637,6 @@ define i32 @test_x86_tbm_blcs_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blcsl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = add i32 %a, 1
@@ -697,7 +688,6 @@ define i64 @test_x86_tbm_blcs_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blcsq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = add i64 %a, 1
@@ -749,7 +739,6 @@ define i32 @test_x86_tbm_blsfill_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blsfilll %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = add i32 %a, -1
@@ -801,7 +790,6 @@ define i64 @test_x86_tbm_blsfill_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blsfillq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = add i64 %a, -1
@@ -856,7 +844,6 @@ define i32 @test_x86_tbm_blsic_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    blsicl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = xor i32 %a, -1
@@ -912,7 +899,6 @@ define i64 @test_x86_tbm_blsic_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    blsicq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = xor i64 %a, -1
@@ -968,7 +954,6 @@ define i32 @test_x86_tbm_t1mskc_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    t1mskcl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = xor i32 %a, -1
@@ -1024,7 +1009,6 @@ define i64 @test_x86_tbm_t1mskc_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    t1mskcq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = xor i64 %a, -1
@@ -1080,7 +1064,6 @@ define i32 @test_x86_tbm_tzmsk_u32_sle(i32 %a, i32 %b, i32 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movl %esi, %eax
  ; CHECK-NEXT:    tzmskl %edi, %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
  ; CHECK-NEXT:    cmovgl %edx, %eax
  ; CHECK-NEXT:    retq
    %t0 = xor i32 %a, -1
@@ -1136,7 +1119,6 @@ define i64 @test_x86_tbm_tzmsk_u64_sle(i64 %a, i64 %b, i64 %c) nounwind {
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    movq %rsi, %rax
  ; CHECK-NEXT:    tzmskq %rdi, %rcx
-; CHECK-NEXT:    testq %rcx, %rcx
  ; CHECK-NEXT:    cmovgq %rdx, %rax
  ; CHECK-NEXT:    retq
    %t0 = xor i64 %a, -1
author	Craig Topper <craig.topper@sifive.com>
	Wed, 31 Mar 2021 16:41:02 +0000 (09:41 -0700)
committer	Craig Topper <craig.topper@sifive.com>
	Wed, 31 Mar 2021 16:45:29 +0000 (09:45 -0700)
llvm/lib/Target/X86/X86InstrInfo.cpp		patch | blob | history
llvm/test/CodeGen/X86/bmi.ll		patch | blob | history
llvm/test/CodeGen/X86/tbm_patterns.ll		patch | blob | history