nir_ssa_def *buffer_offsets_ret[4],
nir_ssa_def *emit_prim_ret[4])
{
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
+
/* For radeonsi, which passes this value by arg when VS. Streamout needs an
 * accurate num-vert-per-prim to write the correct amount of data to the buffer.
 */
workgroup_buffer_sizes[buffer] =
nir_bcsel(b, buffer_valid, inc_buffer_size, nir_imm_int(b, 0));
} else
- workgroup_buffer_sizes[buffer] = nir_ssa_undef(b, 1, 32);
+ workgroup_buffer_sizes[buffer] = undef;
}
nir_ssa_def *ordered_id = nir_load_ordered_id_amd(b);
nir_ssa_def *emit_prim[4];
memcpy(emit_prim, gen_prim, 4 * sizeof(nir_ssa_def *));
+ nir_ssa_def *any_overflow = nir_imm_bool(b, false);
+ nir_ssa_def *overflow_amount[4] = {undef, undef, undef, undef};
+
for (unsigned buffer = 0; buffer < 4; buffer++) {
if (!(info->buffers_written & BITFIELD_BIT(buffer)))
continue;
nir_ssa_def *remain_prim = nir_idiv(b, remain_size, prim_stride_ret[buffer]);
nir_ssa_def *overflow = nir_ilt(b, buffer_size, buffer_offset);
+ any_overflow = nir_ior(b, any_overflow, overflow);
+ overflow_amount[buffer] = nir_imax(b, nir_imm_int(b, 0),
+ nir_isub(b, buffer_offset, buffer_size));
+
unsigned stream = info->buffer_to_stream[buffer];
/* when the previous workgroup overflows, we can't emit any primitives */
emit_prim[stream] = nir_bcsel(
nir_store_shared(b, buffer_offset, scratch_base, .base = buffer * 4);
}
- /* No need to fixup the global buffer offset once we overflowed,
- * because following workgroups overflow for sure.
+ /* We have to fix up the streamout offsets if we overflowed because they determine
+ * the vertex count for DrawTransformFeedback.
*/
+ nir_if *if_any_overflow = nir_push_if(b, any_overflow);
+ {
+ nir_build_xfb_counter_sub_amd(b, nir_vec(b, overflow_amount, 4),
+ /* mask of buffers to update */
+ .write_mask = info->buffers_written);
+ }
+ nir_pop_if(b, if_any_overflow);
/* Save to LDS for being accessed by other waves in this workgroup. */
for (unsigned stream = 0; stream < 4; stream++) {
result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
break;
}
+ case nir_intrinsic_xfb_counter_sub_amd: {
+ /* Must be called in a single lane of a workgroup. */
+ LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+ LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
+ LLVMValueRef sub_vec = get_src(ctx, instr->src[0]);
+ unsigned write_mask = nir_intrinsic_write_mask(instr);
+
+ for (unsigned i = 0; i < instr->num_components; i++) {
+ if (write_mask & (1 << i)) {
+ LLVMValueRef value =
+ LLVMBuildExtractElement(ctx->ac.builder, sub_vec,
+ LLVMConstInt(ctx->ac.i32, i, false), "");
+
+ LLVMValueRef gds_ptr =
+ ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+ LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
+ LLVMAtomicOrderingMonotonic, false);
+ }
+ }
+ break;
+ }
case nir_intrinsic_export_amd: {
unsigned flags = nir_intrinsic_flags(instr);
unsigned target = nir_intrinsic_base(instr);
# An ID for each workgroup ordered by primitive sequence
system_value("ordered_id_amd", 1)
-# Add to global streamout buffer counter in specified order
+# Add src1 to global streamout buffer offsets in the specified order
# src[] = { ordered_id, counter }
# WRITE_MASK = mask for counter channel to update
intrinsic("ordered_xfb_counter_add_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
+# Subtract from global streamout buffer offsets. Used to fix up the offsets
+# when we overflow streamout buffers.
+# src[] = { offsets }
+# WRITE_MASK = mask of offsets to subtract
+intrinsic("xfb_counter_sub_amd", src_comp=[0], indices=[WRITE_MASK], bit_sizes=[32])
# Provoking vertex index in a primitive
system_value("provoking_vtx_in_prim_amd", 1)