Similar to p_constaddr but targeting BBs.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22096>
enum amd_gfx_level gfx_level;
std::vector<std::pair<int, SOPP_instruction*>> branches;
std::map<unsigned, constaddr_info> constaddrs;
+ std::map<unsigned, constaddr_info> resumeaddrs;
std::vector<struct aco_symbol>* symbols;
const int16_t* opcode;
// TODO: keep track of branch instructions referring blocks
assert(instr->operands[1].isConstant());
/* in case it's an inline constant, make it a literal */
instr->operands[1] = Operand::literal32(instr->operands[1].constantValue());
+ } else if (instr->opcode == aco_opcode::p_resumeaddr_getpc) {
+ ctx.resumeaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;
+
+ instr->opcode = aco_opcode::s_getpc_b64;
+ instr->operands.pop_back();
+ } else if (instr->opcode == aco_opcode::p_resumeaddr_addlo) {
+ ctx.resumeaddrs[instr->operands[2].constantValue()].add_literal = out.size() + 1;
+
+ instr->opcode = aco_opcode::s_add_u32;
+ instr->operands.pop_back();
+ assert(instr->operands[1].isConstant());
+ /* in case it's an inline constant, make it a literal */
+ instr->operands[1] = Operand::literal32(instr->operands[1].constantValue());
} else if (instr->opcode == aco_opcode::p_load_symbol) {
assert(instr->operands[0].isConstant());
assert(ctx.symbols);
if (info.add_literal >= insert_before)
info.add_literal += insert_count;
}
+ for (auto& constaddr : ctx.resumeaddrs) {
+ constaddr_info& info = constaddr.second;
+ if (info.getpc_end >= insert_before)
+ info.getpc_end += insert_count;
+ if (info.add_literal >= insert_before)
+ info.add_literal += insert_count;
+ }
if (ctx.symbols) {
for (auto& symbol : *ctx.symbols) {
constaddr_info& info = constaddr.second;
out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
}
+ for (auto& addr : ctx.resumeaddrs) {
+ constaddr_info& info = addr.second;
+ const Block& block = ctx.program->blocks[out[info.add_literal]];
+ assert(block.kind & block_kind_resume);
+ out[info.add_literal] = (block.offset - info.getpc_end) * 4u;
+ }
}
unsigned
bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
break;
+ case nir_intrinsic_load_resume_shader_address_amd: {
+ bld.pseudo(aco_opcode::p_resume_shader_address,
+ Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
+ Operand::c32(nir_intrinsic_call_idx(instr)));
+ break;
+ }
case nir_intrinsic_overwrite_vs_arguments_amd: {
ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
block_kind_discard_early_exit = 1 << 11,
block_kind_uses_discard = 1 << 12,
block_kind_needs_lowering = 1 << 13,
+ block_kind_resume = 1 << 14,
block_kind_export_end = 1 << 15,
};
std::vector<unsigned> logical_succs;
std::vector<unsigned> linear_succs;
RegisterDemand register_demand = RegisterDemand();
+ uint32_t kind = 0;
+ int32_t logical_idom = -1;
+ int32_t linear_idom = -1;
uint16_t loop_nest_depth = 0;
uint16_t divergent_if_logical_depth = 0;
uint16_t uniform_if_depth = 0;
- uint16_t kind = 0;
- int logical_idom = -1;
- int linear_idom = -1;
/* this information is needed for predecessors to blocks with phis when
* moving out of ssa */
/* s_addc_u32 not needed because the program is in a 32-bit VA range */
break;
}
+ case aco_opcode::p_resume_shader_address: {
+ /* Find index of resume block. */
+ unsigned resume_idx = instr->operands[0].constantValue();
+ unsigned resume_block_idx = 0;
+ for (Block& resume_block : program->blocks) {
+ if (resume_block.kind & block_kind_resume) {
+ if (resume_idx == 0) {
+ resume_block_idx = resume_block.index;
+ break;
+ }
+ resume_idx--;
+ }
+ }
+ assert(resume_block_idx != 0);
+ unsigned id = instr->definitions[0].tempId();
+ PhysReg reg = instr->definitions[0].physReg();
+ bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
+ bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
+ Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
+ /* s_addc_u32 not needed because the program is in a 32-bit VA range */
+ break;
+ }
case aco_opcode::p_extract: {
assert(instr->operands[1].isConstant());
assert(instr->operands[2].isConstant());
opcode("p_elect")
opcode("p_constaddr")
+opcode("p_resume_shader_address")
# These don't have to be pseudo-ops, but it makes optimization easier to only
# have to consider two instructions.
( -1, -1, -1, 0x2d, 0x36, 0x2e, "s_mul_hi_i32"),
# actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2.
( -1, -1, -1, -1, -1, -1, "p_constaddr_addlo"),
+ ( -1, -1, -1, -1, -1, -1, "p_resumeaddr_addlo"),
}
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, cls) in default_class(SOP2, InstrClass.Salu):
opcode(name, gfx7, gfx9, gfx10, gfx11, Format.SOP2, cls)
( -1, -1, -1, -1, -1, 0x4d, "s_sendmsg_rtn_b64"),
# actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
( -1, -1, -1, -1, -1, -1, "p_constaddr_getpc"),
+ ( -1, -1, -1, -1, -1, -1, "p_resumeaddr_getpc"),
( -1, -1, -1, -1, -1, -1, "p_load_symbol"),
}
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, cls) in default_class(SOP1, InstrClass.Salu):