amd: add nir_intrinsic_xfb_counter_sub_amd and fix overflowed streamout offsets
author: Marek Olšák <marek.olsak@amd.com>
Tue, 28 Feb 2023 04:07:02 +0000 (23:07 -0500)
committer: Marge Bot <emma+marge@anholt.net>
Tue, 7 Mar 2023 22:08:47 +0000 (22:08 +0000)
Fixes: 5ec79f989988ba - ac/nir/ngg: nogs support streamout

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21584>

src/amd/common/ac_nir_lower_ngg.c
src/amd/compiler/aco_instruction_selection.cpp
src/amd/llvm/ac_nir_to_llvm.c
src/compiler/nir/nir_divergence_analysis.c
src/compiler/nir/nir_intrinsics.py

index 7cbacec..2d13364 100644 (file)
@@ -1752,6 +1752,8 @@ ngg_build_streamout_buffer_info(nir_builder *b,
                                 nir_ssa_def *buffer_offsets_ret[4],
                                 nir_ssa_def *emit_prim_ret[4])
 {
+   nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
+
   /* For radeonsi, which passes this value by arg for VS. Streamout needs an
    * accurate num-vert-per-prim for writing the correct amount of data to the
    * buffer.
    */
@@ -1785,7 +1787,7 @@ ngg_build_streamout_buffer_info(nir_builder *b,
             workgroup_buffer_sizes[buffer] =
                nir_bcsel(b, buffer_valid, inc_buffer_size, nir_imm_int(b, 0));
          } else
-            workgroup_buffer_sizes[buffer] = nir_ssa_undef(b, 1, 32);
+            workgroup_buffer_sizes[buffer] = undef;
       }
 
       nir_ssa_def *ordered_id = nir_load_ordered_id_amd(b);
@@ -1801,6 +1803,9 @@ ngg_build_streamout_buffer_info(nir_builder *b,
       nir_ssa_def *emit_prim[4];
       memcpy(emit_prim, gen_prim, 4 * sizeof(nir_ssa_def *));
 
+      nir_ssa_def *any_overflow = nir_imm_bool(b, false);
+      nir_ssa_def *overflow_amount[4] = {undef, undef, undef, undef};
+
       for (unsigned buffer = 0; buffer < 4; buffer++) {
          if (!(info->buffers_written & BITFIELD_BIT(buffer)))
             continue;
@@ -1811,6 +1816,10 @@ ngg_build_streamout_buffer_info(nir_builder *b,
          nir_ssa_def *remain_prim = nir_idiv(b, remain_size, prim_stride_ret[buffer]);
          nir_ssa_def *overflow = nir_ilt(b, buffer_size, buffer_offset);
 
+         any_overflow = nir_ior(b, any_overflow, overflow);
+         overflow_amount[buffer] = nir_imax(b, nir_imm_int(b, 0),
+                                            nir_isub(b, buffer_offset, buffer_size));
+
          unsigned stream = info->buffer_to_stream[buffer];
          /* when previous workgroup overflow, we can't emit any primitive */
          emit_prim[stream] = nir_bcsel(
@@ -1822,9 +1831,16 @@ ngg_build_streamout_buffer_info(nir_builder *b,
          nir_store_shared(b, buffer_offset, scratch_base, .base = buffer * 4);
       }
 
-      /* No need to fixup the global buffer offset once we overflowed,
-       * because following workgroups overflow for sure.
+      /* We have to fix up the streamout offsets if we overflowed because they determine
+       * the vertex count for DrawTransformFeedback.
        */
+      nir_if *if_any_overflow = nir_push_if(b, any_overflow);
+      {
+         nir_build_xfb_counter_sub_amd(b, nir_vec(b, overflow_amount, 4),
+                                       /* mask of buffers to update */
+                                       .write_mask = info->buffers_written);
+      }
+      nir_pop_if(b, if_any_overflow);
 
       /* Save to LDS for being accessed by other waves in this workgroup. */
       for (unsigned stream = 0; stream < 4; stream++) {
index c2f2423..73971c5 100644 (file)
@@ -9132,6 +9132,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       emit_split_vector(ctx, dst, instr->num_components);
       break;
    }
+   case nir_intrinsic_xfb_counter_sub_amd:
+      /* TODO: implement this */
+      break;
    case nir_intrinsic_memory_barrier_buffer: {
       wait_imm wait;
       wait.lgkm = 0;
index 0389682..3e109fc 100644 (file)
@@ -4232,6 +4232,27 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
       break;
    }
+   case nir_intrinsic_xfb_counter_sub_amd: {
+      /* must be called in a single lane of a workgroup. */
+      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+      LLVMValueRef gdsbase = LLVMBuildIntToPtr(ctx->ac.builder, ctx->ac.i32_0, gdsptr, "");
+      LLVMValueRef sub_vec = get_src(ctx, instr->src[0]);
+      unsigned write_mask = nir_intrinsic_write_mask(instr);
+
+      for (unsigned i = 0; i < instr->num_components; i++) {
+         if (write_mask & (1 << i)) {
+            LLVMValueRef value =
+               LLVMBuildExtractElement(ctx->ac.builder, sub_vec,
+                                       LLVMConstInt(ctx->ac.i32, i, false), "");
+
+            LLVMValueRef gds_ptr =
+               ac_build_gep_ptr(&ctx->ac, ctx->ac.i32, gdsbase, LLVMConstInt(ctx->ac.i32, i, 0));
+            LLVMBuildAtomicRMW(ctx->ac.builder, LLVMAtomicRMWBinOpSub, gds_ptr, value,
+                               LLVMAtomicOrderingMonotonic, false);
+         }
+      }
+      break;
+   }
    case nir_intrinsic_export_amd: {
       unsigned flags = nir_intrinsic_flags(instr);
       unsigned target = nir_intrinsic_base(instr);
index e57d2bc..78087e9 100644 (file)
@@ -672,6 +672,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_topology_id_intel:
    case nir_intrinsic_load_scratch_base_ptr:
    case nir_intrinsic_ordered_xfb_counter_add_amd:
+   case nir_intrinsic_xfb_counter_sub_amd:
    case nir_intrinsic_load_stack:
    case nir_intrinsic_load_ray_launch_id:
    case nir_intrinsic_load_ray_instance_custom_index:
index 30ee381..29ba0c7 100644 (file)
@@ -1529,10 +1529,15 @@ intrinsic("load_streamout_buffer_amd", dest_comp=4, indices=[BASE], bit_sizes=[3
 # An ID for each workgroup ordered by primitive sequence
 system_value("ordered_id_amd", 1)
 
-# Add to global streamout buffer counter in specified order
+# Add src1 to global streamout buffer offsets in the specified order
 # src[] = { ordered_id, counter }
 # WRITE_MASK = mask for counter channel to update
 intrinsic("ordered_xfb_counter_add_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32])
+# Subtract from global streamout buffer offsets. Used to fix up the offsets
+# when we overflow streamout buffers.
+# src[] = { offsets }
+# WRITE_MASK = mask of offsets to subtract
+intrinsic("xfb_counter_sub_amd", src_comp=[0], indices=[WRITE_MASK], bit_sizes=[32])
 
 # Provoking vertex index in a primitive
 system_value("provoking_vtx_in_prim_amd", 1)