// The isPseudo check really shouldn't be here, but unfortunately there are
// some negative lit tests that depend on being able to continue through
// here even when pseudo instructions haven't been lowered.
- if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+ //
+ // We also overestimate branch sizes with the offset bug.
+ if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
+ (!STI.hasOffset3fBug() || !MI->isBranch())) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;
raw_svector_ostream CodeStream(CodeBytes);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(TBB);
if (BytesAdded)
- *BytesAdded = 4;
+ *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
fixImplicitOperands(*CondBr);
if (BytesAdded)
- *BytesAdded = 4;
+ *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
CondReg.setIsKill(Cond[1].isKill());
if (BytesAdded)
- *BytesAdded = 8;
+ *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
return 2;
}
// If we have a definitive size, we can use it. Otherwise we need to inspect
// the operands to know the size.
- if (isFixedSize(MI))
- return DescSize;
+ if (isFixedSize(MI)) {
+ unsigned Size = DescSize;
+
+ // If we hit the buggy offset, an extra nop will be inserted in MC so
+ // estimate the worst case.
+ if (MI.isBranch() && ST.hasOffset3fBug())
+ Size += 4;
+
+ return Size;
+ }
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that coud ever be literals.
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
+
+; For gfx1010, overestimate the branch size in case we need to insert
+; a nop for the buggy offset.
+
+; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
+; GFX1030: s_cmp_lg_u32
+; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: s_cmp_lg_u32
+; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:BB[0-9]+_[0-9]+]]
+; GFX1010: s_getpc_b64
+; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1010: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_scc1
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+ %cmp0 = icmp eq i32 %cnd0, 0
+ br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+ %val = call i32 asm sideeffect
+ "s_mov_b32 $0, 0
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "=s"() ; 20 * 12 = 240
+ call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+ %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
+ br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+ store volatile i32 %cnd0, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: v_cmp_eq_u32
+; GFX1010: s_and_saveexec_b32
+; GFX1010-NEXT: s_cbranch_execnz [[RELAX_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: s_getpc_b64
+; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GCN-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_execz
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define void @long_forward_exec_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+ %cmp0 = icmp eq i32 %cnd0, 0
+ br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+ %val = call i32 asm sideeffect
+ "v_mov_b32 $0, 0
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "=v"() ; 20 * 12 = 240
+ call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+ %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
+ br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+ store volatile i32 %cnd0, i32 addrspace(1)* %arg
+ ret void
+}
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg)