const TargetRegisterInfo *TRI);
std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII);
std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
/// that may be fused by the processor into a single operation.
class MacroFusion : public ScheduleDAGMutation {
const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
public:
- MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
- : TII(TII), TRI(TRI) {}
+ MacroFusion(const TargetInstrInfo &TII)
+ : TII(TII) {}
void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
- return make_unique<MacroFusion>(*TII, *TRI);
+createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
+ return make_unique<MacroFusion>(*TII);
}
} // namespace llvm
-/// Returns true if \p MI reads a register written by \p Other.
-static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
- const MachineInstr &Other) {
- for (const MachineOperand &MO : MI.uses()) {
- if (!MO.isReg() || !MO.readsReg())
- continue;
-
- unsigned Reg = MO.getReg();
- if (Other.modifiesRegister(Reg, &TRI))
- return true;
- }
- return false;
-}
-
/// \brief Callback from DAG postProcessing to create cluster edges to encourage
/// fused operations.
void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
if (!Branch)
return;
- for (SUnit &SU : DAG->SUnits) {
- // SUnits with successors can't be schedule in front of the ExitSU.
- if (!SU.Succs.empty())
- continue;
- // We only care if the node writes to a register that the branch reads.
- MachineInstr *Pred = SU.getInstr();
- if (!HasDataDep(TRI, *Branch, *Pred))
+ for (SDep &PredDep : ExitSU.Preds) {
+ if (PredDep.isWeak())
continue;
-
- if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
+ SUnit &SU = *PredDep.getSUnit();
+ MachineInstr &Pred = *SU.getInstr();
+ if (!TII.shouldScheduleAdjacent(Pred, *Branch))
continue;
// Create a single weak edge from SU to ExitSU. The only effect is to cause
(void)Success;
assert(Success && "No DAG nodes should be reachable from ExitSU");
+ // Adjust latency of data deps between the nodes.
+ for (SDep &PredDep : ExitSU.Preds) {
+ if (PredDep.getSUnit() == &SU)
+ PredDep.setLatency(0);
+ }
+ for (SDep &SuccDep : SU.Succs) {
+ if (SuccDep.getSUnit() == &ExitSU)
+ SuccDep.setLatency(0);
+ }
+
DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
break;
}
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
}
if (EnableMacroFusion)
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
return DAG;
}
void ScheduleDAGInstrs::addSchedBarrierDeps() {
MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
ExitSU.setInstr(ExitMI);
- bool AllDepKnown = ExitMI &&
- (ExitMI->isCall() || ExitMI->isBarrier());
- if (ExitMI && AllDepKnown) {
- // If it's a call or a barrier, add dependencies on the defs and uses of
- // instruction.
+ // Add dependencies on the defs and uses of the instruction.
+ if (ExitMI) {
for (const MachineOperand &MO : ExitMI->operands()) {
if (!MO.isReg() || MO.isDef()) continue;
unsigned Reg = MO.getReg();
addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
}
}
- } else {
+ }
+ if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
// For others, e.g. fallthrough, conditional branch, assume the exit
// uses all the registers that are livein to the successor blocks.
- assert(Uses.empty() && "Uses in set before adding deps?");
for (const MachineBasicBlock *Succ : BB->successors()) {
for (const auto &LI : Succ->liveins()) {
if (!Uses.contains(LI.PhysReg))
; Next BB.
; CHECK: [[LOOP:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP]]
;
; Next BB.
; Next BB.
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: ; %for.end
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: bl _somethingElse
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
-; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
+; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: lsl w0, [[SUM]], #3
;
; Sum is merged with the returned register.
; CHECK: add [[VA_BASE:x[0-9]+]], sp, #16
-; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
; CHECK-NEXT: cmp w1, #1
+; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
+; CHECK-NEXT: mov [[SUM:w0]], wzr
; CHECK-NEXT: b.lt [[IFEND_LABEL:LBB[0-9_]+]]
-; CHECK: mov [[SUM:w0]], wzr
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: ldr [[VA_ADDR:x[0-9]+]], [sp, #8]
; CHECK-NEXT: add [[NEXT_VA_ADDR:x[0-9]+]], [[VA_ADDR]], #8
; CHECK-NEXT: str [[NEXT_VA_ADDR]], [sp, #8]
; CHECK-NEXT: ldr [[VA_VAL:w[0-9]+]], {{\[}}[[VA_ADDR]]]
-; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
; CHECK-NEXT: sub w1, w1, #1
+; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
+; DISABLE-NEXT: b [[IFEND_LABEL]]
;
-; DISABLE-NEXT: b
; DISABLE: [[ELSE_LABEL]]: ; %if.else
; DISABLE: lsl w0, w1, #1
;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE: lsl w0, w1, #1
-; ENABLE-NEXT: ret
-;
; CHECK: [[IFEND_LABEL]]:
; Epilogue code.
; CHECK: add sp, sp, #16
; CHECK-NEXT: ret
+;
+; ENABLE: [[ELSE_LABEL]]: ; %if.else
+; ENABLE-NEXT: lsl w0, w1, #1
+; ENABLE_NEXT: ret
define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
entry:
%ap = alloca i8*, align 8
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; Inline asm statement.
-; CHECK: add x19, x19, #1
; CHECK: sub [[IV]], [[IV]], #1
-; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
+; CHECK: add x19, x19, #1
+; CHECK: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: mov w0, wzr
; Epilogue code.
-; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
+; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
target triple = "arm64-apple-ios"
for.inc:
; CHECK_LABEL: %for.inc
-; CHECK: add
-; CHECK-NEXT: cmp
-; CHECK: b.le
+; CHECK: cmp
+; CHECK-NEXT: add
+; CHECK-NEXT: b.le
; CHECK_LABEL: %for.cond.cleanup
%inc = add nsw i32 %x.015, 1
%cmp1 = icmp sgt i32 %x.015, %px
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
+; GCN: s_mov_b64 vcc, -1{{$}}
+; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
-; GCN-NEXT: s_mov_b64 vcc, -1{{$}}
; GCN-NEXT: s_cbranch_vccz [[RET]]
; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN: [[LONG_BR_DEST0]]
; GCN: s_cmp_eq_u32
+; GCN-NEXT: ; implicit-def
; GCN-NEXT: s_cbranch_scc0
; GCN: s_setpc_b64
; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
; GCN-LABEL: {{^}}preserve_condition_undef_flag:
; GCN-NOT: vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v2, 2
+; IDXMODE: v_mov_b32_e32 v3, 3
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
+; IDXMODE: v_mov_b32_e32 v0,
+; IDXMODE: v_mov_b32_e32 v1,
+; IDXMODE: v_mov_b32_e32 v2,
+; IDXMODE: v_mov_b32_e32 v3,
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
+; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
; GCN-NOT: m0
-; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
; IDXMODE: s_set_gpr_idx_off
; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; BB#1: ; %bb
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
-;CHECK: store
;CHECK: v_cmp
+;CHECK: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
; CHECK: push
;
; DISABLE: tst{{(\.w)?}} r2, #1
+; DISABLE-NEXT: vst1.64
; DISABLE-NEXT: beq [[BB13:LBB[0-9_]+]]
;
; CHECK: bl{{x?}} _pow
; CHECK: mflr {{[0-9]+}}
;
; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
;
; Loop preheader
; CHECK: mflr {{[0-9]+}}
;
; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std
+; DISABLE-NEXT: std
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: bl somethingElse
; ENABLE-DAG: li [[IV:[0-9]+]], 10
; ENABLE-DAG: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
;
-; DISABLE: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
; DISABLE: cmplwi 0, 3, 0
+; DISABLE-NEXT: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
; DISABLE: li [[IV:[0-9]+]], 10
;
define i32 @test_inlineasm(i32 %a) nounwind {
entry:
;CHECK-LABEL: test_inlineasm:
+;CHECK: cmp
;CHECK: sethi
;CHECK: !NO_APP
-;CHECK-NEXT: cmp
;CHECK-NEXT: ble
;CHECK-NEXT: mov
tail call void asm sideeffect "sethi 0, %g0", ""() nounwind
define void @f2(i8 *%src) {
; CHECK-LABEL: f2:
; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
-; CHECK: mvi 0(%r2), 0
; CHECK: tmll [[REG]], 1
+; CHECK: mvi 0(%r2), 0
; CHECK: ber %r14
; CHECK: br %r14
entry: