program->config->float_mode = program->blocks[0].fp_mode.val;
/* addition on GFX6-8 requires a carry-out (we use VCC) */
program->needs_vcc = program->gfx_level <= GFX8;
- program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
+ program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
if (gfx_level >= GFX10) {
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
- program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
program->dev.sgpr_alloc_granule = 128;
program->dev.sgpr_limit =
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
- if (gfx_level == GFX10_3)
- program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
- else
- program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
+
+ if (family == CHIP_GFX1100 || family == CHIP_GFX1101) {
+ program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
+ program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
+ } else {
+ program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
+ if (gfx_level >= GFX10_3)
+ program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
+ else
+ program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
+ }
} else if (program->gfx_level >= GFX8) {
program->dev.physical_sgprs = 800;
program->dev.sgpr_alloc_granule = 16;
uint16_t vgpr_limit;
uint16_t sgpr_limit;
uint16_t sgpr_alloc_granule;
- uint16_t vgpr_alloc_granule; /* must be power of two */
+ uint16_t vgpr_alloc_granule;
unsigned max_wave64_per_simd;
unsigned simd_per_cu;
bool has_fast_fma32 = false;
{
assert(addressable_vgprs <= program->dev.vgpr_limit);
uint16_t granule = program->dev.vgpr_alloc_granule;
- return align(std::max(addressable_vgprs, granule), granule);
+ return ALIGN_NPOT(std::max(addressable_vgprs, granule), granule);
}
unsigned
/* Returns a VGPR count derived from the given number of waves: the physical
 * VGPR file size divided by `waves`, rounded down to a multiple of the VGPR
 * allocation granule, reduced by half of the configured shared VGPR count,
 * and finally clamped to the device VGPR limit.
 *
 * NOTE(review): `waves` is used as a divisor, so callers presumably never
 * pass 0 -- confirm at the call sites.
 */
uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
/* Removed form: rounded down with a bit mask, which only yields a multiple of
 * the granule when the granule is a power of two. */
- uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
/* Added form: divide-then-multiply rounds down to a multiple of the granule
 * for any granule value, including non-power-of-two ones such as 24 or 12. */
+ uint16_t vgprs = program->dev.physical_vgprs / waves;
+ vgprs = vgprs / program->dev.vgpr_alloc_granule * program->dev.vgpr_alloc_granule;
/* Subtract half of the shared VGPR count.
 * NOTE(review): this is unsigned 16-bit arithmetic; presumably
 * num_shared_vgprs / 2 never exceeds vgprs here -- confirm. */
vgprs -= program->config->num_shared_vgprs / 2;
/* Never report more than the device's VGPR limit. */
return std::min(vgprs, program->dev.vgpr_limit);
}
} /* end for BB */
/* num_gpr = rnd_up(max_used_gpr + 1) */
- program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
+ program->config->num_vgprs =
+ std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
program->progress = CompilationProgress::after_ra;