#include <algorithm>
#include <bitset>
+#include <set>
#include <stack>
#include <vector>
}
}
+/* GFX11 */
+/* Returns the vdst wait count associated with an instruction: 0 for VMEM, FLAT-like, DS
+ * and EXP instructions, the wait_vdst field for LDSDIR, bits 12..15 of the immediate for
+ * s_waitcnt_depctr, and 15 for everything else.
+ */
+unsigned
+parse_vdst_wait(aco_ptr<Instruction>& instr)
+{
+   const bool counts_as_zero =
+      instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP();
+   if (counts_as_zero)
+      return 0;
+
+   if (instr->isLDSDIR())
+      return instr->ldsdir().wait_vdst;
+
+   if (instr->opcode == aco_opcode::s_waitcnt_depctr)
+      return (instr->sopp().imm >> 12) & 0xf;
+
+   /* Any other instruction: report the default count of 15. */
+   return 15;
+}
+
+/* State shared across the whole LdsDirectVALUHazard backwards search. */
+struct LdsDirectVALUHazardGlobalState {
+   /* Lowest wait_vdst found necessary so far; defaults to 15. */
+   unsigned wait_vdst{15};
+   /* Register that VALU definitions and operands are compared against. */
+   PhysReg vgpr;
+   /* Indices of the loop header blocks that were already visited. */
+   std::set<unsigned> loop_headers_visited;
+};
+
+/* State updated per instruction during the LdsDirectVALUHazard backwards search. */
+struct LdsDirectVALUHazardBlockState {
+   /* Number of VALU instructions seen so far that do not use the checked VGPR. */
+   unsigned num_valu{0};
+   /* Set once a transcendental VALU instruction has been seen. */
+   bool has_trans{false};
+};
+
+/* Processes a single instruction for the LdsDirectVALUHazard search.
+ *
+ * A VALU (or VINTERP_INREG) instruction that reads or writes global_state.vgpr lowers
+ * global_state.wait_vdst; any other VALU instruction is counted in block_state.num_valu.
+ * Returns true when the instruction uses the VGPR, when it implies a vdst wait of 0, or
+ * when num_valu has reached the current wait_vdst.
+ * NOTE(review): presumably a true return ends search_backwards() on this path - confirm.
+ */
+bool
+handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
+                                    LdsDirectVALUHazardBlockState& block_state,
+                                    aco_ptr<Instruction>& instr)
+{
+   const bool is_valu = instr->isVALU() || instr->isVINTERP_INREG();
+   if (is_valu) {
+      const instr_class op_class = instr_info.classes[static_cast<int>(instr->opcode)];
+      if (op_class == instr_class::valu_transcendental32 ||
+          op_class == instr_class::valu_double_transcendental)
+         block_state.has_trans = true;
+
+      /* Check every definition and every non-constant operand against the VGPR. */
+      bool touches_vgpr = false;
+      for (Definition& def : instr->definitions) {
+         if (regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1))
+            touches_vgpr = true;
+      }
+      for (Operand& op : instr->operands) {
+         if (!op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1))
+            touches_vgpr = true;
+      }
+
+      if (touches_vgpr) {
+         /* Transcendentals execute in parallel to other VALU and va_vdst count becomes unusable */
+         const unsigned required = block_state.has_trans ? 0 : block_state.num_valu;
+         global_state.wait_vdst = MIN2(global_state.wait_vdst, required);
+         return true;
+      }
+
+      ++block_state.num_valu;
+   }
+
+   return parse_vdst_wait(instr) == 0 || block_state.num_valu >= global_state.wait_vdst;
+}
+
+/* Block callback of the LdsDirectVALUHazard search.
+ *
+ * Records loop headers in global_state.loop_headers_visited and returns false for a loop
+ * header that was already recorded; returns true for every other block.
+ */
+bool
+handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
+                                    LdsDirectVALUHazardBlockState& block_state, Block* block)
+{
+   if (!(block->kind & block_kind_loop_header))
+      return true;
+
+   /* insert() reports through .second whether the index was newly added. */
+   return global_state.loop_headers_visited.insert(block->index).second;
+}
+
+/* Computes the wait_vdst value for an LDSDIR instruction by searching backwards from it.
+ * Returns 0 immediately if the instruction's wait_vdst is already 0, otherwise the
+ * wait_vdst left in the search state once search_backwards() returns.
+ */
+unsigned
+handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
+{
+   /* LdsDirectVALUHazard
+    * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
+    */
+   const unsigned requested_wait = instr->ldsdir().wait_vdst;
+   if (requested_wait == 0)
+      return 0; /* already the lowest possible value */
+
+   LdsDirectVALUHazardGlobalState global_state;
+   global_state.vgpr = instr->definitions[0].physReg();
+   global_state.wait_vdst = requested_wait;
+
+   LdsDirectVALUHazardBlockState block_state;
+   search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
+                    &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
+      state, global_state, block_state);
+   return global_state.wait_vdst;
+}
+
void
handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
} else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
ctx.has_Vcmpx = false;
}
+
+ if (instr->isLDSDIR()) {
+ unsigned count = handle_lds_direct_valu_hazard(state, instr);
+ LDSDIR_instruction* ldsdir = &instr->ldsdir();
+ ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
+ }
}
template <typename Ctx>
bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 6;
//! lds_direct_load v42 ; ce10002a
- bld.ldsdir(aco_opcode::lds_direct_load, dst, op);
+ bld.ldsdir(aco_opcode::lds_direct_load, dst, op).instr->ldsdir().wait_vdst = 0;
//! lds_param_load v42, attr56.x wait_vdst:8 ; ce08e02a
bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 8;
//! lds_param_load v42, attr56.x ; ce00e02a
- bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0);
+ bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0).instr->ldsdir().wait_vdst = 0;
//! lds_param_load v42, attr34.y ; ce00892a
- bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1);
+ bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1).instr->ldsdir().wait_vdst = 0;
//! lds_param_load v42, attr12.z ; ce00322a
- bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2);
+ bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2).instr->ldsdir().wait_vdst = 0;
finish_assembler_test();
END_TEST
finish_insert_nops_test();
END_TEST
+
+BEGIN_TEST(insert_nops.lds_direct_valu)
+ if (!setup_cs(NULL, GFX11))
+ return;
+
+ /* WaW: a VALU write of v0 directly precedes the LDSDIR write of v0 */
+ //>> p_unit_test 0
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* WaR: a VALU read of v0 directly precedes the LDSDIR write of v0 */
+ //! p_unit_test 1
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* No hazard: the preceding VALU only writes v1. */
+ //! p_unit_test 2
+ //! v1: %0:v[1] = v_mov_b32 0
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* multiple hazards, the nearest one should be considered */
+ //! p_unit_test 3
+ //! v1: %0:v[1] = v_mov_b32 %0:v[0]
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* independent VALU instructions in between increase wait_vdst */
+ //! p_unit_test 4
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v_nop
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 5
+ //! v1: %0:v[0] = v_mov_b32 0
+ //; for i in range(10): insert_pattern('v_nop')
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ for (unsigned i = 0; i < 10; i++)
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 6
+ //! v1: %0:v[0] = v_mov_b32 0
+ //; for i in range(20): insert_pattern('v_nop')
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ for (unsigned i = 0; i < 20; i++)
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* a transcendental between the hazard and the LDSDIR requires wait_vdst=0 */
+ //! p_unit_test 7
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v_nop
+ //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.vop1(aco_opcode::v_nop);
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ //! p_unit_test 8
+ //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
+ //! v_nop
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* a transcendental is fine if it comes before the hazardous instruction */
+ //! p_unit_test 9
+ //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v_nop
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
+ bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.vop1(aco_opcode::v_nop);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* non-VALU instructions do not increase wait_vdst */
+ //! p_unit_test 10
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! s1: %0:m0 = s_mov_b32 0
+ //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ /* consider instructions which already wait on vdst */
+ //! p_unit_test 11
+ //! v1: %0:v[0] = v_mov_b32 0
+ //! v_nop
+ //! s_waitcnt_depctr va_vdst(0)
+ //! v1: %0:v[0] = lds_direct_load %0:m0
+ bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
+ bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+ bld.vop1(aco_opcode::v_nop);
+ bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
+ bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
+
+ finish_insert_nops_test();
+END_TEST