* instructions may use the same address. */
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
*NOPs = 1;
- } else if (program->xnack_enabled) {
+ } else if (program->dev.xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
*NOPs = 1;
ctx.smem_clause = false;
ctx.smem_write = false;
- if (program->xnack_enabled) {
+ if (program->dev.xnack_enabled) {
BITSET_ZERO(ctx.smem_clause_read_write);
BITSET_ZERO(ctx.smem_clause_write);
}
} else {
ctx.smem_clause = true;
- if (program->xnack_enabled) {
+ if (program->dev.xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant()) {
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
/* We need one pair of shared VGPRs:
* Note, that these have twice the allocation granularity of normal VGPRs */
- ctx->program->config->num_shared_vgprs = 2 * ctx->program->vgpr_alloc_granule;
+ ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
} else {
Builder bld(ctx->program, ctx->block);
if (dst.regClass() == v2b) {
- if (ctx->program->has_16bank_lds) {
+ if (ctx->program->dev.has_16bank_lds) {
assert(ctx->options->chip_class <= GFX8);
Builder::Result interp_p1 =
bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
bld.m0(prim_mask), idx, component);
- if (ctx->program->has_16bank_lds)
+ if (ctx->program->dev.has_16bank_lds)
interp_p1.instr->operands[0].setLateKill(true);
bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
/* We need to store the primitive IDs in LDS */
unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
- ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule);
+ ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->dev.lds_encoding_granule);
}
}
unsigned total_lds_bytes = esgs_ring_bytes + ngg_emit_bytes + ngg_gs_scratch_bytes;
assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
- ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->lds_encoding_granule);
+ ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule);
/* Make sure we have enough room for emitted GS vertices */
if (nir->info.gs.vertices_out)
ctx->args->shader_info->tcs.num_patches = ctx->tcs_num_patches;
ctx->args->shader_info->tcs.num_lds_blocks = lds_size;
- ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->lds_encoding_granule);
+ ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->dev.lds_encoding_granule);
}
void
break;
}
case MESA_SHADER_COMPUTE: {
- ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.cs.shared_size, ctx->program->lds_encoding_granule);
+ ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.cs.shared_size, ctx->program->dev.lds_encoding_granule);
break;
}
case MESA_SHADER_VERTEX: {
}
/* Make sure we fit the available LDS space. */
- assert((ctx->program->config->lds_size * ctx->program->lds_encoding_granule) <= ctx->program->lds_limit);
+ assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) <= ctx->program->dev.lds_limit);
}
void
nir_index_ssa_defs(func);
}
-void
-setup_xnack(Program *program)
-{
- switch (program->family) {
- /* GFX8 APUs */
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- /* GFX9 APUS */
- case CHIP_RAVEN:
- case CHIP_RAVEN2:
- case CHIP_RENOIR:
- program->xnack_enabled = true;
- break;
- default:
- break;
- }
-}
-
} /* end namespace */
void init_context(isel_context *ctx, nir_shader *shader)
ctx.block->loop_nest_depth = 0;
ctx.block->kind = block_kind_top_level;
- setup_xnack(program);
- program->sram_ecc_enabled = args->options->family == CHIP_ARCTURUS;
- /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
- program->has_fast_fma32 = program->chip_class >= GFX9;
- if (args->options->family == CHIP_TAHITI || args->options->family == CHIP_CARRIZO || args->options->family == CHIP_HAWAII)
- program->has_fast_fma32 = true;
-
return ctx;
}
program->wave_size = info->wave_size;
program->lane_mask = program->wave_size == 32 ? s1 : s2;
- program->lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
- program->lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->lds_encoding_granule;
- program->lds_limit = chip_class >= GFX7 ? 65536 : 32768;
+ program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
+ program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
+ program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
- program->has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
+ program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
- program->vgpr_limit = 256;
- program->physical_vgprs = 256;
- program->vgpr_alloc_granule = 4;
+ program->dev.vgpr_limit = 256;
+ program->dev.physical_vgprs = 256;
+ program->dev.vgpr_alloc_granule = 4;
if (chip_class >= GFX10) {
- program->physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
- program->physical_vgprs = program->wave_size == 32 ? 1024 : 512;
- program->sgpr_alloc_granule = 128;
- program->sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
+ program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
+ program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
+ program->dev.sgpr_alloc_granule = 128;
+ program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
if (chip_class >= GFX10_3)
- program->vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
+ program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
else
- program->vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
+ program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
} else if (program->chip_class >= GFX8) {
- program->physical_sgprs = 800;
- program->sgpr_alloc_granule = 16;
- program->sgpr_limit = 102;
+ program->dev.physical_sgprs = 800;
+ program->dev.sgpr_alloc_granule = 16;
+ program->dev.sgpr_limit = 102;
if (family == CHIP_TONGA || family == CHIP_ICELAND)
- program->sgpr_alloc_granule = 96; /* workaround hardware bug */
+ program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
} else {
- program->physical_sgprs = 512;
- program->sgpr_alloc_granule = 8;
- program->sgpr_limit = 104;
+ program->dev.physical_sgprs = 512;
+ program->dev.sgpr_alloc_granule = 8;
+ program->dev.sgpr_limit = 104;
}
+ program->dev.max_wave64_per_simd = 10;
+ if (program->chip_class >= GFX10_3)
+ program->dev.max_wave64_per_simd = 16;
+ else if (program->chip_class == GFX10)
+ program->dev.max_wave64_per_simd = 20;
+ else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
+ program->dev.max_wave64_per_simd = 8;
+
+ program->dev.simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
+
+ switch (program->family) {
+ /* GFX8 APUs */
+ case CHIP_CARRIZO:
+ case CHIP_STONEY:
+ /* GFX9 APUS */
+ case CHIP_RAVEN:
+ case CHIP_RAVEN2:
+ case CHIP_RENOIR:
+ program->dev.xnack_enabled = true;
+ break;
+ default:
+ break;
+ }
+
+ program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
+ /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
+ program->dev.has_fast_fma32 = program->chip_class >= GFX9;
+ if (program->family == CHIP_TAHITI ||
+ program->family == CHIP_CARRIZO ||
+ program->family == CHIP_HAWAII)
+ program->dev.has_fast_fma32 = true;
+
program->wgp_mode = wgp_mode;
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
num_statistics
};
+struct DeviceInfo {
+ uint16_t lds_encoding_granule;
+ uint16_t lds_alloc_granule;
+ uint32_t lds_limit; /* in bytes */
+ bool has_16bank_lds;
+ uint16_t physical_sgprs;
+ uint16_t physical_vgprs;
+ uint16_t vgpr_limit;
+ uint16_t sgpr_limit;
+ uint16_t sgpr_alloc_granule;
+ uint16_t vgpr_alloc_granule; /* must be power of two */
+ unsigned max_wave64_per_simd;
+ unsigned simd_per_cu;
+ bool has_fast_fma32 = false;
+ bool xnack_enabled = false;
+ bool sram_ecc_enabled = false;
+};
+
class Program final {
public:
float_mode next_fp_mode;
struct radv_shader_info *info;
enum chip_class chip_class;
enum radeon_family family;
+ DeviceInfo dev;
unsigned wave_size;
RegClass lane_mask;
Stage stage;
Temp scratch_offset;
uint16_t min_waves = 0;
- uint16_t lds_encoding_granule;
- uint16_t lds_alloc_granule;
- uint32_t lds_limit; /* in bytes */
- bool has_16bank_lds;
- uint16_t vgpr_limit;
- uint16_t sgpr_limit;
- uint16_t physical_sgprs;
- uint16_t physical_vgprs;
- uint16_t sgpr_alloc_granule;
- uint16_t vgpr_alloc_granule; /* must be power of two */
unsigned workgroup_size; /* if known; otherwise UINT_MAX */
bool wgp_mode;
-
- bool xnack_enabled = false;
- bool sram_ecc_enabled = false;
- bool has_fast_fma32 = false;
bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */
bool needs_vcc = false;
{
if (program->chip_class >= GFX10) {
assert(!program->needs_flat_scr);
- assert(!program->xnack_enabled);
+ assert(!program->dev.xnack_enabled);
return 0;
} else if (program->chip_class >= GFX8) {
if (program->needs_flat_scr)
return 6;
- else if (program->xnack_enabled)
+ else if (program->dev.xnack_enabled)
return 4;
else if (program->needs_vcc)
return 2;
else
return 0;
} else {
- assert(!program->xnack_enabled);
+ assert(!program->dev.xnack_enabled);
if (program->needs_flat_scr)
return 4;
else if (program->needs_vcc)
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
{
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
- uint16_t granule = program->sgpr_alloc_granule;
+ uint16_t granule = program->dev.sgpr_alloc_granule;
return ALIGN_NPOT(std::max(sgprs, granule), granule);
}
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
{
- assert(addressable_vgprs <= program->vgpr_limit);
- uint16_t granule = program->vgpr_alloc_granule;
+ assert(addressable_vgprs <= program->dev.vgpr_limit);
+ uint16_t granule = program->dev.vgpr_alloc_granule;
return align(std::max(addressable_vgprs, granule), granule);
}
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
{
/* it's not possible to allocate more than 128 SGPRs */
- uint16_t sgprs = std::min(program->physical_sgprs / waves, 128);
- sgprs = round_down(sgprs, program->sgpr_alloc_granule);
+ uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
+ sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule);
sgprs -= get_extra_sgprs(program);
- return std::min(sgprs, program->sgpr_limit);
+ return std::min(sgprs, program->dev.sgpr_limit);
}
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
{
- uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
+ uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
vgprs -= program->config->num_shared_vgprs / 2;
- return std::min(vgprs, program->vgpr_limit);
+ return std::min(vgprs, program->dev.vgpr_limit);
}
void calc_min_waves(Program* program)
{
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
-
- unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
- unsigned simd_per_cu_wgp = program->wgp_mode ? simd_per_cu * 2 : simd_per_cu;
-
+ unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
- unsigned max_waves_per_simd = program->chip_class == GFX10 ? 20 : 10;
- if (program->chip_class >= GFX10_3)
- max_waves_per_simd = 16;
- else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
- max_waves_per_simd = 8;
- if (program->wave_size == 32)
- max_waves_per_simd *= 2;
-
- unsigned simd_per_cu = program->chip_class >= GFX10 ? 2 : 4;
-
- unsigned simd_per_cu_wgp = program->wgp_mode ? simd_per_cu * 2 : simd_per_cu;
- unsigned lds_limit = program->wgp_mode ? program->lds_limit * 2 : program->lds_limit;
+ unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
+ unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
+ unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
assert(program->min_waves >= 1);
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
program->num_waves = 0;
program->max_reg_demand = new_demand;
} else {
- program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
+ program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
- program->num_waves = std::min<uint16_t>(program->num_waves, program->physical_vgprs / vgpr_demand);
+ program->num_waves = std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
program->max_waves = max_waves_per_simd;
/* adjust max_waves for workgroup and LDS limits */
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
if (program->config->lds_size) {
- unsigned lds = program->config->lds_size * program->lds_encoding_granule;
- lds = align(lds, program->lds_alloc_granule);
+ unsigned lds = program->config->lds_size * program->dev.lds_encoding_granule;
+ lds = align(lds, program->dev.lds_alloc_granule);
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
}
if (waves_per_workgroup > 1 && program->chip_class < GFX10)
(block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10);
if (need_fma && instr->definitions[0].isPrecise())
return;
- if (need_fma && mad32 && !ctx.program->has_fast_fma32)
+ if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32)
return;
Instruction* mul_instr = nullptr;
case aco_opcode::global_load_short_d16:
case aco_opcode::ds_read_u8_d16:
case aco_opcode::ds_read_u16_d16:
- if (chip >= GFX9 && !program->sram_ecc_enabled)
+ if (chip >= GFX9 && !program->dev.sram_ecc_enabled)
return std::make_pair(2u, 2u);
else
return std::make_pair(2u, 4u);
* improves performance of Thrones of Britannia significantly and doesn't
* seem to hurt anything else. */
//TODO: account for possible uneven num_waves on GFX10+
- unsigned wave_fac = program->physical_vgprs / 256;
+ unsigned wave_fac = program->dev.physical_vgprs / 256;
if (program->num_waves <= 5 * wave_fac)
ctx.num_waves = program->num_waves;
else if (demand.vgpr >= 29)
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi:
- return program->sram_ecc_enabled ? 4 : 2;
+ return program->dev.sram_ecc_enabled ? 4 : 2;
case aco_opcode::v_mad_f16:
case aco_opcode::v_mad_u16:
case aco_opcode::v_mad_i16: