* fit. For smaller workgroup sizes, we follow the blob and use the
* smaller threadsize.
*/
- if (compiler->gpu_id < 600) {
+ if (compiler->gen < 6) {
return v->local_size_variable ||
threads_per_wg >
compiler->threadsize_base * compiler->max_waves;
*/
unsigned regs_count =
info->max_reg + 1 +
- (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+ (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
unsigned reg_independent_max_waves =
* same block (since we can't propagate address register values
* across blocks currently)
*/
- if (compiler->gpu_id < 600)
+ if (compiler->gen < 6)
return false;
/* NOTE in the special try_swap_mad_two_srcs() case we can be
compiler->dev = dev;
compiler->gpu_id = gpu_id;
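+ /* Major generation, ie. gpu_id 630 -> gen 6, 540 -> gen 5: */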
+ compiler->gen = gpu_id / 100;
compiler->robust_ubo_access = robust_ubo_access;
/* All known GPUs have 32k local memory (aka shared) */
compiler->wave_granularity = 2;
compiler->max_waves = 16;
- if (compiler->gpu_id >= 600) {
+ if (compiler->gen >= 6) {
compiler->samgq_workaround = true;
/* a6xx split the pipeline state into geometry and fragment state, in
* order to let the VS run ahead of the FS. As a result there are now
compiler->max_const_safe = 256;
}
- if (compiler->gpu_id >= 600) {
+ if (compiler->gen >= 6) {
compiler->reg_size_vec4 =
fd_dev_info(compiler->gpu_id)->a6xx.reg_size_vec4;
- } else if (compiler->gpu_id >= 400) {
+ } else if (compiler->gen >= 4) {
/* On a4xx-a5xx, using r24.x and above requires using the smallest
* threadsize.
*/
compiler->reg_size_vec4 = 96;
}
- if (compiler->gpu_id >= 600) {
+ if (compiler->gen >= 6) {
compiler->threadsize_base = 64;
- } else if (compiler->gpu_id >= 400) {
+ } else if (compiler->gen >= 4) {
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
* 1.1 subgroupSize which is 32.
*/
compiler->threadsize_base = 32;
} else {
compiler->threadsize_base = 8;
}
- if (compiler->gpu_id >= 400) {
+ if (compiler->gen >= 4) {
/* need special handling for "flat" */
compiler->flat_bypass = true;
compiler->levels_add_one = false;
struct ir3_compiler {
struct fd_device *dev;
uint32_t gpu_id;
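+ /* major GPU generation, ie. gpu_id / 100 (6 for a6xx) */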
+ uint8_t gen;
uint32_t shader_count;
struct disk_cache *disk_cache;
struct ir3_block *b = ctx->block;
struct ir3_instruction *barrier = ir3_BAR(b);
barrier->cat7.g = true;
- if (ctx->compiler->gpu_id < 600)
+ if (ctx->compiler->gen < 6)
barrier->cat7.l = true;
barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
barrier->barrier_class = IR3_BARRIER_EVERYTHING;
barrier->cat7.g = true;
}
- if (ctx->compiler->gpu_id > 600) {
+ if (ctx->compiler->gen >= 6) {
if (modes & nir_var_mem_ssbo) {
barrier->cat7.l = true;
}
case nir_intrinsic_memory_barrier_buffer:
barrier = ir3_FENCE(b);
barrier->cat7.g = true;
- if (ctx->compiler->gpu_id > 600)
+ if (ctx->compiler->gen >= 6)
barrier->cat7.l = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
break;
case nir_intrinsic_memory_barrier_shared:
barrier = ir3_FENCE(b);
- if (ctx->compiler->gpu_id < 600)
+ if (ctx->compiler->gen < 6)
barrier->cat7.l = true;
barrier->cat7.r = true;
barrier->cat7.w = true;
sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
break;
case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
- if (ctx->compiler->gpu_id < 600)
+ if (ctx->compiler->gen < 6)
sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
break;
case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
break;
case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
- if (ctx->compiler->gpu_id < 600)
+ if (ctx->compiler->gen < 6)
sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
break;
default:
* out, we guarantee that all exit paths flow into the stream-
* out instructions.
*/
- if ((ctx->compiler->gpu_id < 500) &&
+ if ((ctx->compiler->gen < 5) &&
(ctx->so->shader->stream_output.num_outputs > 0) &&
!ctx->so->binning_pass) {
debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
* need to make sure not to remove any inputs that are used by
* the nonbinning VS.
*/
- if (ctx->compiler->gpu_id >= 600 && so->binning_pass &&
+ if (ctx->compiler->gen >= 6 && so->binning_pass &&
so->type == MESA_SHADER_VERTEX) {
for (int i = 0; i < ctx->ninputs; i++) {
struct ir3_instruction *in = ctx->inputs[i];
array_insert(ctx->block, ctx->block->keeps, end);
/* at this point, for binning pass, throw away unneeded outputs: */
- if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+ if (so->binning_pass && (ctx->compiler->gen < 6))
fixup_binning_pass(ctx, end);
}
* that the uniform/constant layout for BS and VS matches, so that
* we can re-use the same VS_CONST state group.
*/
- if (so->binning_pass && (ctx->compiler->gpu_id >= 600)) {
+ if (so->binning_pass && (ctx->compiler->gen >= 6)) {
fixup_binning_pass(ctx, find_end(ctx->so->ir));
/* cleanup the result of removing unneeded outputs: */
while (IR3_PASS(ir, ir3_dce, so)) {
*
* Note that VS inputs are expected to be full precision.
*/
- bool pre_assign_inputs = (ir->compiler->gpu_id >= 600) &&
+ bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
(ir->type == MESA_SHADER_VERTEX) &&
so->binning_pass;
{
struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
- if (compiler->gpu_id >= 400) {
+ if (compiler->gen >= 4) {
if (so->type == MESA_SHADER_VERTEX) {
ctx->astc_srgb = so->key.vastc_srgb;
} else if (so->type == MESA_SHADER_FRAGMENT) {
}
}
- if (compiler->gpu_id >= 600) {
+ if (compiler->gen >= 6) {
ctx->funcs = &ir3_a6xx_funcs;
- } else if (compiler->gpu_id >= 400) {
+ } else if (compiler->gen >= 4) {
ctx->funcs = &ir3_a4xx_funcs;
}
/* Enable the texture pre-fetch feature only from a4xx onwards. But
* only enable it on generations that have been tested:
*/
- if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600))
+ if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gen >= 6))
NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);
NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true);
regmask_set(&state->needs_sy, n->dsts[0]);
} else if (is_atomic(n->opc)) {
if (n->flags & IR3_INSTR_G) {
- if (ctx->compiler->gpu_id >= 600) {
+ if (ctx->compiler->gen >= 6) {
/* New encoding, returns result via second src: */
regmask_set(&state->needs_sy, n->srcs[2]);
} else {
* a6xx.
*/
- if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
+ if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
(last->repeat == 0)) {
/* the previous cat2/cat3 instruction can encode at most 3 nops: */
}
}
- assert(ctx->early_input_release || ctx->compiler->gpu_id > 500);
+ assert(ctx->early_input_release || ctx->compiler->gen >= 5);
/* process each block: */
do {
/* a5xx+ is known to support swz, which enables us to swap two registers
* in-place. If unsupported we emulate it using the xor trick.
*/
- if (compiler->gpu_id < 500) {
+ if (compiler->gen < 5) {
/* Shared regs only exist since a5xx, so we don't have to provide a
* fallback path for them.
*/
const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
- if (compiler->gpu_id >= 600)
+ if (compiler->gen >= 6)
return &options_a6xx;
return &options;
}
.lower_tg4_offsets = true,
};
- if (compiler->gpu_id >= 400) {
+ if (compiler->gen >= 4) {
/* a4xx seems to have *no* sam.p */
tex_options.lower_txp = ~0; /* lower all txp */
} else {
OPT_V(s, nir_lower_tex, &tex_options);
OPT_V(s, nir_lower_load_const_to_scalar);
- if (compiler->gpu_id < 500)
+ if (compiler->gen < 5)
OPT_V(s, ir3_nir_lower_tg4_to_tex);
ir3_optimize_loop(compiler, s);
NIR_PASS_V(s, nir_lower_fb_read);
}
- if (compiler->gpu_id >= 600 && s->info.stage == MESA_SHADER_FRAGMENT &&
+ if (compiler->gen >= 6 && s->info.stage == MESA_SHADER_FRAGMENT &&
!(ir3_shader_debug & IR3_DBG_NOFP16)) {
NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
}
/* UBO offset lowering has to come after we've decided what will
* be left as load_ubo
*/
- if (so->shader->compiler->gpu_id >= 600)
+ if (so->shader->compiler->gen >= 6)
progress |= OPT(s, nir_lower_ubo_vec4);
OPT_V(s, ir3_nir_lower_io_offsets, so->shader->compiler->gpu_id);
ir3_nir_scan_driver_consts(nir, const_state);
- if ((compiler->gpu_id < 500) && (v->shader->stream_output.num_outputs > 0)) {
+ if ((compiler->gen < 5) && (v->shader->stream_output.num_outputs > 0)) {
const_state->num_driver_params =
MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
}
if (const_state->num_driver_params > 0) {
/* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */
- if (v->type == MESA_SHADER_VERTEX && compiler->gpu_id >= 600)
+ if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6)
constoff = MAX2(constoff, 1);
const_state->offsets.driver_param = constoff;
}
constoff += const_state->num_driver_params / 4;
- if ((v->type == MESA_SHADER_VERTEX) && (compiler->gpu_id < 500) &&
+ if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) &&
v->shader->stream_output.num_outputs > 0) {
const_state->offsets.tfbo = constoff;
constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
* uploads are in units of 4 dwords. Round it up here to make calculations
* regarding the shared constlen simpler.
*/
- if (compiler->gpu_id >= 400)
+ if (compiler->gen >= 4)
v->constlen = align(v->constlen, 4);
/* Use the per-wave layout by default on a6xx for compute shaders. It
* should result in better performance when loads/stores are to a uniform
* index.
*/
- v->pvtmem_per_wave = compiler->gpu_id >= 600 && !info->multi_dword_ldp_stp &&
+ v->pvtmem_per_wave = compiler->gen >= 6 && !info->multi_dword_ldp_stp &&
v->type == MESA_SHADER_COMPUTE;
fixup_regfootprint(v);
v->nonbinning = nonbinning;
v->key = *key;
v->type = shader->type;
- v->mergedregs = shader->compiler->gpu_id >= 600;
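+ /* gen6+ uses the merged half/full register file: */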
+ v->mergedregs = shader->compiler->gen >= 6;
if (!v->binning_pass)
v->const_state = rzalloc_size(v, sizeof(*v->const_state));
* on older HW.
*/
key->msaa = info->fs.uses_sample_qualifier ||
- (shader->compiler->gpu_id < 600 &&
+ (shader->compiler->gen < 6 &&
(BITSET_TEST(info->system_values_read,
SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID) ||
BITSET_TEST(info->system_values_read,
* a6xx and the total limit. The frag limit on a6xx only matters for a
* single stage, so it's always satisfied with the first variant.
*/
- if (compiler->gpu_id >= 600) {
+ if (compiler->gen >= 6) {
trimmed |=
trim_constlens(constlens, MESA_SHADER_VERTEX, MESA_SHADER_GEOMETRY,
compiler->max_const_geom, compiler->max_const_safe);
if (!v->shader)
return 0;
- if (v->shader->compiler->gpu_id < 500)
+ if (v->shader->compiler->gen < 5)
return v->branchstack;
if (v->branchstack > 0) {
}
} else if (instr->opc == OPC_DEMOTE) {
return OPC_KILL;
- } else if ((instr->block->shader->compiler->gpu_id > 600) &&
+ } else if ((instr->block->shader->compiler->gen >= 6) &&
is_atomic(instr->opc) && (instr->flags & IR3_INSTR_G)) {
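+ /* relies on OPC_ATOMIC_B_* being laid out in the same order as OPC_ATOMIC_*: */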
return instr->opc - OPC_ATOMIC_ADD + OPC_ATOMIC_B_ADD;
- } else if (s->compiler->gpu_id >= 600) {
+ } else if (s->compiler->gen >= 6) {
if (instr->opc == OPC_RESINFO) {
return OPC_RESINFO_B;
} else if (instr->opc == OPC_LDIB) {