This fixes the patterns that have or/xor as a root. 'and' is handled differently since they usually have a CMP wrapped around them.
I had to look for uses of the CF flag because all these nodes have non-standard CF flag behavior. A real or/xor would always clear CF. In practice we shouldn't be using the CF flag from these nodes as far as I know.
Differential Revision: https://reviews.llvm.org/D55813
llvm-svn: 349962
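
For illustration, a minimal IR sketch of the shape these patterns now handle (hypothetical function name; it mirrors the blsmsk*_z2 tests updated below). The xor result is only tested against zero, so only ZF is read from the flag-producing node and its non-standard CF is irrelevant, which allows BLSMSK to be selected for the xor directly:

define i32 @blsmsk_select_sketch(i32 %a, i32 %b, i32 %c) nounwind {
  %t0 = sub i32 %a, 1
  %t1 = xor i32 %t0, %a
  %cmp = icmp eq i32 %t1, 0
  %ret = select i1 %cmp, i32 %b, i32 %c
  ret i32 %ret
}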
SDValue &InFlag);
bool tryOptimizeRem8Extend(SDNode *N);
+
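+  // Declared as members (rather than file-local statics) so the PatFrag
+  // predicate code emitted into this class by TableGen can call them.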
+ bool hasNoSignFlagUses(SDValue Flags) const;
+ bool hasNoCarryFlagUses(SDValue Flags) const;
};
}
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate.
-static bool hasNoSignFlagUses(SDValue Flags) {
+bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate.
-static bool hasNoCarryFlagUses(SDValue Flags) {
+bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
// Examine each user of the node.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
// Pattern fragments to auto generate BMI instructions.
//===----------------------------------------------------------------------===//
+def or_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86or_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
+def xor_flag_nocf : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86xor_flag node:$lhs, node:$rhs), [{
+ return hasNoCarryFlagUses(SDValue(N, 1));
+}]>;
+
let Predicates = [HasBMI] in {
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (ineg GR32:$src)),
(BLSI32rr GR32:$src)>;
def : Pat<(and GR64:$src, (ineg GR64:$src)),
(BLSI64rr GR64:$src)>;
+
+ // Versions to match flag producing ops.
+ // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
+ // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
+ def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSMSK32rr GR32:$src)>;
+ def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSMSK64rr GR64:$src)>;
}
multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC,
(TZMSK32rr GR32:$src)>;
def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
(TZMSK64rr GR64:$src)>;
+
+ // Patterns to match flag producing ops.
+ // X86and_flag nodes are rarely created. Those should use CMP+AND. We do
+ // TESTrr matching in PostProcessISelDAG to allow BLSR/BLSI to be formed.
+ def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+ // Extra patterns because opt can optimize the above patterns to this.
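+  // (not (add x, 1)) is (-1 - (x + 1)), i.e. (sub -2, x).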
+ def : Pat<(or_flag_nocf GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(xor_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf GR32:$src, (add GR32:$src, -1)),
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, -1)),
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or_flag_nocf (not GR32:$src), (add GR32:$src, 1)),
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or_flag_nocf (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
} // HasTBM
//===----------------------------------------------------------------------===//
define i32 @blsmsk32_z2(i32 %a, i32 %b, i32 %c) nounwind {
; X86-LABEL: blsmsk32_z2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal -1(%eax), %ecx
-; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: blsmskl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmovel %eax, %ecx
; X64-LABEL: blsmsk32_z2:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal -1(%rdi), %ecx
-; X64-NEXT: xorl %edi, %ecx
+; X64-NEXT: blsmskl %edi, %ecx
; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
%t0 = sub i32 %a, 1
; X64-LABEL: blsmsk64_z2:
; X64: # %bb.0:
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: leaq -1(%rdi), %rcx
-; X64-NEXT: xorq %rdi, %rcx
+; X64-NEXT: blsmskq %rdi, %rcx
; X64-NEXT: cmovneq %rdx, %rax
; X64-NEXT: retq
%t0 = sub i64 %a, 1
; CHECK-LABEL: test_x86_tbm_blci_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 1(%rdi), %ecx
-; CHECK-NEXT: notl %ecx
-; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: blcil %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 1, %a
; CHECK-LABEL: test_x86_tbm_blci_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: leaq 1(%rdi), %rcx
-; CHECK-NEXT: notq %rcx
-; CHECK-NEXT: orq %rdi, %rcx
+; CHECK-NEXT: blciq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 1, %a
; CHECK-LABEL: test_x86_tbm_blcmsk_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 1(%rdi), %ecx
-; CHECK-NEXT: xorl %edi, %ecx
+; CHECK-NEXT: blcmskl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
; CHECK-LABEL: test_x86_tbm_blcmsk_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: leaq 1(%rdi), %rcx
-; CHECK-NEXT: xorq %rdi, %rcx
+; CHECK-NEXT: blcmskq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
; CHECK-LABEL: test_x86_tbm_blcs_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal 1(%rdi), %ecx
-; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: blcsl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
; CHECK-LABEL: test_x86_tbm_blcs_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: leaq 1(%rdi), %rcx
-; CHECK-NEXT: orq %rdi, %rcx
+; CHECK-NEXT: blcsq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
; CHECK-LABEL: test_x86_tbm_blsfill_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: leal -1(%rdi), %ecx
-; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: blsfilll %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, -1
; CHECK-LABEL: test_x86_tbm_blsfill_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: leaq -1(%rdi), %rcx
-; CHECK-NEXT: orq %rdi, %rcx
+; CHECK-NEXT: blsfillq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, -1
; CHECK-LABEL: test_x86_tbm_blsic_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: notl %ecx
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: orl %ecx, %edi
+; CHECK-NEXT: blsicl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
; CHECK-LABEL: test_x86_tbm_blsic_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: notq %rcx
-; CHECK-NEXT: decq %rdi
-; CHECK-NEXT: orq %rcx, %rdi
+; CHECK-NEXT: blsicq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: notl %ecx
-; CHECK-NEXT: incl %edi
-; CHECK-NEXT: orl %ecx, %edi
+; CHECK-NEXT: t1mskcl %edi, %ecx
; CHECK-NEXT: cmovnel %edx, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z2:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: notq %rcx
-; CHECK-NEXT: incq %rdi
-; CHECK-NEXT: orq %rcx, %rdi
+; CHECK-NEXT: t1mskcq %rdi, %rcx
; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1