nir_variable *clip_vertex_var;
nir_variable *clipdist_neg_mask_var;
bool has_clipdist;
+
+ /* outputs */
+ nir_ssa_def *outputs[VARYING_SLOT_MAX][4];
+ nir_ssa_def *outputs_16bit_lo[16][4];
+ nir_ssa_def *outputs_16bit_hi[16][4];
+ shader_output_types output_types;
} lower_ngg_nogs_state;
typedef struct
nir_store_output(b, prim_id, nir_imm_zero(b, 1, 32),
.base = st->options->primitive_id_location,
.src_type = nir_type_uint32, .io_semantics = io_sem);
+
+ /* Update outputs_written to reflect that the pass added a new output. */
+ b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID;
}
static void
unreachable("Should be VS or TES.");
}
-static bool
-do_ngg_nogs_store_output_to_lds(nir_builder *b, nir_instr *instr, void *state)
+static void
+ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
{
- lower_ngg_nogs_state *st = (lower_ngg_nogs_state *)state;
+ if (!s->outputs[VARYING_SLOT_EDGE][0])
+ return;
- if (instr->type != nir_instr_type_intrinsic)
- return false;
+ /* clamp user edge flag to 1 for later bit operations */
+ nir_ssa_def *edgeflag = s->outputs[VARYING_SLOT_EDGE][0];
+ edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
- nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic != nir_intrinsic_store_output)
- return false;
+ /* The user edge flag is stored at the beginning of the vertex if streamout is not enabled. */
+ unsigned offset = 0;
+ if (s->streamout_enabled) {
+ unsigned packed_location =
+ util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
+ offset = packed_location * 16;
+ }
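+
+ /* Outputs are laid out in LDS as one 16-byte (vec4) slice per written slot,
+ * so the edge flag lands right after the slots below VARYING_SLOT_EDGE.
+ */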
- /* no indirect output */
- assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
+ nir_ssa_def *tid = nir_load_local_invocation_index(b);
+ nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
- b->cursor = nir_before_instr(instr);
+ nir_store_shared(b, edgeflag, addr, .base = offset);
+}
- nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
- unsigned component = nir_intrinsic_component(intrin);
- unsigned write_mask = nir_intrinsic_write_mask(intrin);
- nir_ssa_def *store_val = intrin->src[0].ssa;
+static void
+ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
+{
+ nir_xfb_info *info = b->shader->xfb_info;
- if (sem.location == VARYING_SLOT_EDGE) {
- if (st->has_user_edgeflags) {
- /* clamp user edge flag to 1 for latter bit operations */
- store_val = nir_umin(b, store_val, nir_imm_int(b, 1));
- /* remove instr after cursor point to the new node */
- nir_instr_remove(instr);
+ uint64_t xfb_outputs = 0;
+ unsigned xfb_outputs_16bit = 0;
+ uint8_t xfb_mask[VARYING_SLOT_MAX] = {0};
+ uint8_t xfb_mask_16bit_lo[16] = {0};
+ uint8_t xfb_mask_16bit_hi[16] = {0};
+
+ /* Get XFB output mask for each slot. */
+ for (int i = 0; i < info->output_count; i++) {
+ nir_xfb_output_info *out = info->outputs + i;
+
+ if (out->location < VARYING_SLOT_VAR0_16BIT) {
+ xfb_outputs |= BITFIELD64_BIT(out->location);
+ xfb_mask[out->location] |= out->component_mask;
} else {
- /* remove the edge flag output anyway as it should not be passed to next stage */
- nir_instr_remove(instr);
- return true;
+ unsigned index = out->location - VARYING_SLOT_VAR0_16BIT;
+ xfb_outputs_16bit |= BITFIELD_BIT(index);
+
+ if (out->high_16bits)
+ xfb_mask_16bit_hi[index] |= out->component_mask;
+ else
+ xfb_mask_16bit_lo[index] |= out->component_mask;
}
- } else {
- write_mask = nir_instr_xfb_write_mask(intrin) >> component;
- if (!(write_mask && st->streamout_enabled))
- return false;
}
- /* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
- unsigned offset = 0;
- if (st->streamout_enabled) {
+ nir_ssa_def *tid = nir_load_local_invocation_index(b);
+ nir_ssa_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
+
+ u_foreach_bit64(slot, xfb_outputs) {
unsigned packed_location =
- util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(sem.location));
- offset = packed_location * 16 + component * 4;
+ util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(slot));
+
+ unsigned mask = xfb_mask[slot];
+ while (mask) {
+ int start, count;
+ u_bit_scan_consecutive_range(&mask, &start, &count);
+ /* Outputs here are guaranteed to be 32-bit.
+ *
+ * 64-bit outputs have already been lowered to two 32-bit outputs. As for
+ * 16-bit outputs: Vulkan does not allow streamout outputs smaller than
+ * 32 bits, and OpenGL puts 16-bit outputs in VARYING_SLOT_VAR0_16BIT.
+ */
+ nir_ssa_def *store_val = nir_vec(b, &s->outputs[slot][start], (unsigned)count);
+ nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
+ }
}
- nir_ssa_def *tid = nir_load_local_invocation_index(b);
- nir_ssa_def *addr = pervertex_lds_addr(b, tid, st->pervertex_lds_bytes);
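+ /* 16-bit slots are packed in LDS after all 32-bit slots. */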
+ unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
+ u_foreach_bit(slot, xfb_outputs_16bit) {
+ unsigned packed_location = num_32bit_outputs +
+ util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
- nir_store_shared(b, store_val, addr, .base = offset, .write_mask = write_mask);
+ unsigned mask_lo = xfb_mask_16bit_lo[slot];
+ unsigned mask_hi = xfb_mask_16bit_hi[slot];
- return true;
-}
+ nir_ssa_def **outputs_lo = s->outputs_16bit_lo[slot];
+ nir_ssa_def **outputs_hi = s->outputs_16bit_hi[slot];
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
-static void
-ngg_nogs_store_all_outputs_to_lds(nir_shader *shader, lower_ngg_nogs_state *st)
-{
- nir_shader_instructions_pass(shader, do_ngg_nogs_store_output_to_lds,
- nir_metadata_block_index | nir_metadata_dominance, st);
+ unsigned mask = mask_lo | mask_hi;
+ while (mask) {
+ int start, count;
+ u_bit_scan_consecutive_range(&mask, &start, &count);
+
+ nir_ssa_def *values[4] = {0};
+ for (int c = start; c < start + count; ++c) {
+ nir_ssa_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
+ nir_ssa_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
+
+ /* pack the lo/hi 16-bit halves into one 32-bit value, 64 bit has been lowered */
+ values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
+ }
+
+ nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
+ nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
+ }
+ }
}
static void
return pervertex_lds_bytes;
}
-static unsigned
-gather_vs_outputs(nir_builder *b, struct exec_list *cf_list, vs_output *outputs,
- const uint8_t *vs_output_param_offset)
+static void
+ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
{
- uint64_t output_mask32 = 0;
- nir_ssa_def *outputs32[VARYING_SLOT_MAX][4] = {0};
-
- unsigned output_mask16_lo = 0;
- unsigned output_mask16_hi = 0;
- nir_ssa_def *outputs16_lo[16][4];
- nir_ssa_def *outputs16_hi[16][4];
-
/* Assume:
* - the shader used nir_lower_io_to_temporaries
* - 64-bit outputs are lowered
* - no indirect indexing is present
*/
- struct nir_cf_node *first_node = exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
+ struct nir_cf_node *first_node =
+ exec_node_data(nir_cf_node, exec_list_get_head(cf_list), node);
+
for (nir_block *block = nir_cf_node_cf_tree_first(first_node); block != NULL;
block = nir_block_cf_tree_next(block)) {
nir_foreach_instr_safe (instr, block) {
assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
- unsigned slot = nir_intrinsic_io_semantics(intrin).location;
- if (vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
- continue;
-
- bool is_hi = nir_intrinsic_io_semantics(intrin).high_16bits;
- bool is_16bit = slot >= VARYING_SLOT_VAR0_16BIT;
+ nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+ unsigned slot = sem.location;
- u_foreach_bit (i, nir_intrinsic_write_mask(intrin)) {
- unsigned comp = nir_intrinsic_component(intrin) + i;
- nir_ssa_def *chan = nir_channel(b, intrin->src[0].ssa, i);
- if (is_16bit && is_hi)
- outputs16_hi[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
- else if (is_16bit)
- outputs16_lo[slot - VARYING_SLOT_VAR0_16BIT][comp] = chan;
- else
- outputs32[slot][comp] = chan;
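+ /* Select the per-slot value and type arrays that this store writes. */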
+ nir_ssa_def **output;
+ nir_alu_type *type;
+ if (slot >= VARYING_SLOT_VAR0_16BIT) {
+ unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
+ if (sem.high_16bits) {
+ output = s->outputs_16bit_hi[index];
+ type = s->output_types.types_16bit_hi[index];
+ } else {
+ output = s->outputs_16bit_lo[index];
+ type = s->output_types.types_16bit_lo[index];
+ }
+ } else {
+ output = s->outputs[slot];
+ type = s->output_types.types[slot];
}
- if (is_16bit && is_hi)
- output_mask16_hi |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
- else if (is_16bit)
- output_mask16_lo |= BITFIELD_BIT(slot - VARYING_SLOT_VAR0_16BIT);
- else
- output_mask32 |= BITFIELD64_BIT(slot);
+ unsigned component = nir_intrinsic_component(intrin);
+ unsigned write_mask = nir_intrinsic_write_mask(intrin);
+ nir_alu_type src_type = nir_intrinsic_src_type(intrin);
- if (slot >= VARYING_SLOT_VAR0 || !(BITFIELD64_BIT(slot) & POS_EXPORT_MASK))
- nir_instr_remove(&intrin->instr);
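+ /* Record each written channel's value and source type for later passes. */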
+ u_foreach_bit (i, write_mask) {
+ unsigned c = component + i;
+ output[c] = nir_channel(b, intrin->src[0].ssa, i);
+ type[c] = src_type;
+ }
+
+ /* Always remove the edge flag output, as it must not be passed to the next stage. */
+ bool is_edge_slot = slot == VARYING_SLOT_EDGE;
+ /* On GFX11, remove non-position-export slots, as their values are written to buffer memory. */
+ bool is_pos_export_slot = slot < VARYING_SLOT_MAX && (BITFIELD64_BIT(slot) & POS_EXPORT_MASK);
+ if (is_edge_slot || (s->options->gfx_level >= GFX11 && !is_pos_export_slot))
+ nir_instr_remove(instr);
}
}
+}
+static unsigned
+gather_vs_outputs(nir_builder *b, vs_output *outputs, lower_ngg_nogs_state *s)
+{
unsigned num_outputs = 0;
- u_foreach_bit64 (i, output_mask32) {
- outputs[num_outputs].slot = i;
- for (unsigned j = 0; j < 4; j++) {
- nir_ssa_def *chan = outputs32[i][j];
+ u_foreach_bit64 (slot, b->shader->info.outputs_written) {
+ if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
+ continue;
+
+ /* Skip this output if no channel was ever written. */
+ if (!s->outputs[slot][0] && !s->outputs[slot][1] &&
+ !s->outputs[slot][2] && !s->outputs[slot][3])
+ continue;
+
+ outputs[num_outputs].slot = slot;
+ for (int i = 0; i < 4; i++) {
+ nir_ssa_def *chan = s->outputs[slot][i];
/* RADV implements 16-bit outputs as 32-bit with VARYING_SLOT_VAR0-31. */
- outputs[num_outputs].chan[j] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
+ outputs[num_outputs].chan[i] = chan && chan->bit_size == 16 ? nir_u2u32(b, chan) : chan;
}
num_outputs++;
}
- if (output_mask16_lo | output_mask16_hi) {
+ u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
+ unsigned slot = VARYING_SLOT_VAR0_16BIT + i;
+ if (s->options->vs_output_param_offset[slot] > AC_EXP_PARAM_OFFSET_31)
+ continue;
+
+ /* Skip this output if no channel was ever written. */
+ if (!s->outputs_16bit_lo[i][0] && !s->outputs_16bit_lo[i][1] &&
+ !s->outputs_16bit_lo[i][2] && !s->outputs_16bit_lo[i][3] &&
+ !s->outputs_16bit_hi[i][0] && !s->outputs_16bit_hi[i][1] &&
+ !s->outputs_16bit_hi[i][2] && !s->outputs_16bit_hi[i][3])
+ continue;
+
+ vs_output *output = &outputs[num_outputs++];
+ output->slot = slot;
+
+ nir_ssa_def **output_lo = s->outputs_16bit_lo[i];
+ nir_ssa_def **output_hi = s->outputs_16bit_hi[i];
nir_ssa_def *undef = nir_ssa_undef(b, 1, 16);
- u_foreach_bit (i, output_mask16_lo | output_mask16_hi) {
- vs_output *output = &outputs[num_outputs++];
-
- output->slot = i + VARYING_SLOT_VAR0_16BIT;
- for (unsigned j = 0; j < 4; j++) {
- nir_ssa_def *lo = output_mask16_lo & BITFIELD_BIT(i) ? outputs16_lo[i][j] : NULL;
- nir_ssa_def *hi = output_mask16_hi & BITFIELD_BIT(i) ? outputs16_hi[i][j] : NULL;
- if (lo || hi)
- output->chan[j] = nir_pack_32_2x16_split(b, lo ? lo : undef, hi ? hi : undef);
- else
- output->chan[j] = NULL;
- }
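+ /* Pack the lo/hi 16-bit halves into 32-bit export channels. */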
+ for (int j = 0; j < 4; j++) {
+ nir_ssa_def *lo = output_lo[j] ? output_lo[j] : undef;
+ nir_ssa_def *hi = output_hi[j] ? output_hi[j] : undef;
+ if (output_lo[j] || output_hi[j])
+ output->chan[j] = nir_pack_32_2x16_split(b, lo, hi);
+ else
+ output->chan[j] = NULL;
}
}
}
nir_pop_if(b, if_es_thread);
+ if (options->can_cull) {
+ /* Replace uniforms. */
+ apply_reusable_variables(b, &state);
+
+ /* Remove the redundant position output. */
+ remove_extra_pos_outputs(shader, &state);
+
+ /* After looking at the performance in apps, e.g. Doom Eternal and The Witcher 3,
+ * it seems that it's best to always put the position export at the end, and
+ * then let ACO schedule it up (slightly) only when early prim export is used.
+ */
+ b->cursor = nir_before_instr(&export_vertex_instr->instr);
+
+ nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
+ nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
+ nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = state.position_store_base,
+ .component = 0, .io_semantics = io_sem, .src_type = nir_type_float32);
+ }
+
+ /* Gather output data and types. */
+ b->cursor = nir_after_cf_list(&if_es_thread->then_list);
+ ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
+
+ if (state.has_user_edgeflags)
+ ngg_nogs_store_edgeflag_to_lds(b, &state);
+
if (state.streamout_enabled) {
/* TODO: support culling after streamout. */
assert(!options->can_cull);
- ngg_nogs_build_streamout(b, &state);
- }
+ ngg_nogs_store_xfb_outputs_to_lds(b, &state);
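+
+ /* The streamout code emitted at the end of the shader reads these
+ * values back from LDS.
+ */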
- if (state.streamout_enabled || has_user_edgeflags) {
- ngg_nogs_store_all_outputs_to_lds(shader, &state);
b->cursor = nir_after_cf_list(&impl->body);
+ ngg_nogs_build_streamout(b, &state);
}
/* Take care of late primitive export */
if (!state.early_prim_export) {
+ b->cursor = nir_after_cf_list(&impl->body);
emit_ngg_nogs_prim_export(b, &state, nir_load_var(b, prim_exp_arg_var));
}
vs_output outputs[64];
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
- unsigned num_outputs =
- gather_vs_outputs(b, &if_es_thread->then_list, outputs, options->vs_output_param_offset);
+ unsigned num_outputs = gather_vs_outputs(b, outputs, &state);
if (num_outputs) {
b->cursor = nir_after_cf_node(&if_es_thread->cf_node);
}
}
- if (options->can_cull) {
- /* Replace uniforms. */
- apply_reusable_variables(b, &state);
-
- /* Remove the redundant position output. */
- remove_extra_pos_outputs(shader, &state);
-
- /* After looking at the performance in apps eg. Doom Eternal, and The Witcher 3,
- * it seems that it's best to put the position export always at the end, and
- * then let ACO schedule it up (slightly) only when early prim export is used.
- */
- b->cursor = nir_before_instr(&export_vertex_instr->instr);
-
- nir_ssa_def *pos_val = nir_load_var(b, state.position_value_var);
- nir_io_semantics io_sem = { .location = VARYING_SLOT_POS, .num_slots = 1 };
- nir_store_output(b, pos_val, nir_imm_int(b, 0), .base = state.position_store_base,
- .component = 0, .io_semantics = io_sem);
- }
-
nir_metadata_preserve(impl, nir_metadata_none);
nir_validate_shader(shader, "after emitting NGG VS/TES");