ScoreBrackets.dump();
});
- // Assume VCCZ is correct at basic block boundaries, unless and until we need
- // to handle cases where that is not true.
+ // Track the correctness of vccz through this basic block. There are two
+ // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
bool VCCZCorrect = true;
+ if (ST->hasReadVCCZBug()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc and then issued an smem load.
+ VCCZCorrect = false;
+ } else if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc_lo or vcc_hi.
+ VCCZCorrect = false;
+ }
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
continue;
}
- // We might need to restore vccz to its correct value for either of two
- // different reasons; see ST->hasReadVCCZBug() and
- // ST->partialVCCWritesUpdateVCCZ().
- bool RestoreVCCZ = false;
- if (readsVCCZ(Inst)) {
- if (!VCCZCorrect) {
- // Restore vccz if it's not known to be correct already.
- RestoreVCCZ = true;
- } else if (ST->hasReadVCCZBug()) {
+ // Generate an s_waitcnt instruction to be placed before Inst, if needed.
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ OldWaitcntInstr = nullptr;
+
+ // Restore vccz if it's not known to be correct already.
+ bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+
+ // Don't examine operands unless we need to track vccz correctness.
+ if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI)) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ if (!ST->partialVCCWritesUpdateVCCZ())
+ VCCZCorrect = false;
+ } else if (Inst.definesRegister(AMDGPU::VCC)) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
// corrupt vccz bit, we need to:
// operations to complete.
// 2. Restore the correct value of vccz by writing the current value
// of vcc back to vcc.
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ if (ST->hasReadVCCZBug() &&
+ ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- // Restore vccz if there's an outstanding smem read, which could
- // complete and clobber vccz at any time.
- RestoreVCCZ = true;
+ // A write to vcc while there's an outstanding smem read leaves vccz
+ // liable to be clobbered as soon as any read completes.
+ VCCZCorrect = false;
+ } else {
+ // Writes to vcc will fix any incorrect value in vccz.
+ VCCZCorrect = true;
}
}
}
const Value *Ptr = Memop->getValue();
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
}
- }
-
- if (!ST->partialVCCWritesUpdateVCCZ()) {
- if (Inst.definesRegister(AMDGPU::VCC_LO) ||
- Inst.definesRegister(AMDGPU::VCC_HI)) {
- // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ if (ST->hasReadVCCZBug()) {
+ // This smem read could complete and clobber vccz at any time.
VCCZCorrect = false;
- } else if (Inst.definesRegister(AMDGPU::VCC)) {
- // Writes to vcc will fix any incorrect value in vccz.
- VCCZCorrect = true;
}
}
- // Generate an s_waitcnt instruction to be placed before
- // cur_Inst, if needed.
- Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
- OldWaitcntInstr = nullptr;
-
updateEventWaitcntAfter(Inst, &ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB1_2
; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB2_2
; SI-NEXT: ; %bb.3: ; %Flow
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: BB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB2_6
; SI-NEXT: BB2_7: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
# CHECK-LABEL: name: vcc_def_pred
# CHECK: bb.1:
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
# CHECK: S_CBRANCH_VCCZ %bb.2, implicit $vcc
name: vcc_def_pred
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_def_wait_nop_use
body: |
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: def_load_wait_nop_use
body: |