ScoreBrackets.dump();
});
+ // Assume VCCZ is correct at basic block boundaries, unless and until we need
+ // to handle cases where that is not true.
+ bool VCCZCorrect = true;
+
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
continue;
}
- bool VCCZBugWorkAround = false;
+ // We might need to restore vccz to its correct value for either of two
+ // different reasons; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
+ bool RestoreVCCZ = false;
if (readsVCCZ(Inst)) {
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
- ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- if (ST->hasReadVCCZBug())
- VCCZBugWorkAround = true;
+ if (!VCCZCorrect)
+ RestoreVCCZ = true;
+ else if (ST->hasReadVCCZBug()) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may
+ // corrupt the vccz bit, so when we detect that an instruction may read
+ // from a corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+ // operations to complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ RestoreVCCZ = true;
+ }
}
}
}
}
+ if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ // Writes to vcc will fix it.
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI))
+ VCCZCorrect = false;
+ else if (Inst.definesRegister(AMDGPU::VCC))
+ VCCZCorrect = true;
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
- if (VCCZBugWorkAround) {
+ if (RestoreVCCZ) {
// Restore the vccz bit. Any time a value is written to vcc, the vccz
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
TRI->getVCC())
.addReg(TRI->getVCC());
+ VCCZCorrect = true;
Modified = true;
}
S_ENDPGM 0
...
+---
+# Test that after reloading vcc spilled to a vgpr, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_vgpr
+# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_vgpr
+body: |
+ bb.0:
+ $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
+ $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ bb.1:
+
+...
+---
+# Test that after reloading vcc spilled to memory, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: reload_vcc_from_mem
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: reload_vcc_from_mem
+body: |
+ bb.0:
+ $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+ $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+ $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+ $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc_lo, we insert any necessary
+# instructions to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc_lo
+# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc_lo
+body: |
+ bb.0:
+ INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ bb.1:
+
+...
+---
+# Test that after inline asm that defines vcc, no unnecessary instructions are
+# inserted to fix vccz.
+
+# CHECK-LABEL: name: inlineasm_def_vcc
+# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+
+name: inlineasm_def_vcc
+body: |
+ bb.0:
+ INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ bb.1:
+
+...