nir,ac,radv: add primitive count add intrinsics

author Qiang Yu <yuq825@gmail.com>

Thu, 16 Jun 2022 07:54:54 +0000 (15:54 +0800)

committer Marge Bot <emma+marge@anholt.net>

Tue, 25 Oct 2022 12:58:43 +0000 (12:58 +0000)
author Qiang Yu <yuq825@gmail.com>
Thu, 16 Jun 2022 07:54:54 +0000 (15:54 +0800)
committer Marge Bot <emma+marge@anholt.net>
Tue, 25 Oct 2022 12:58:43 +0000 (12:58 +0000)
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c

index 742c4b5..4a47460 100644 (file)
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -456,11 +456,11 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *st, nir_ssa_def
           {
              /* Number of active GS threads. Each has 1 output primitive. */
              nir_ssa_def *num_gs_threads = nir_bit_count(b, nir_ballot(b, 1, st->wave_size, nir_imm_bool(b, true)));
-            /* Activate only 1 lane and add the number of primitives to GDS. */
+            /* Activate only 1 lane and add the number of primitives to the query result. */
              nir_if *if_elected = nir_push_if(b, nir_elect(b, 1));
              {
                 /* Add to stream 0 primitive generated counter. */
-               nir_gds_atomic_add_amd(b, 32, num_gs_threads, nir_imm_int(b, 4), nir_imm_int(b, 0x100));
+               nir_atomic_add_gen_prim_count_amd(b, num_gs_threads, .stream_id = 0);
              }
              nir_pop_if(b, if_elected);
           }
@@ -2107,28 +2107,21 @@ ngg_gs_shader_query(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_st
        num_prims_in_wave = nir_reduce(b, prm_cnt, .reduction_op = nir_op_iadd);
     }
  
-   /* Store the query result to GDS using an atomic add. */
+   /* Add this wave's primitive count to the query result using an atomic add. */
     nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1));
     {
-      /* GDS counters:
-       *   offset 0         - pipeline statistics counter for all streams
-       *   offset 4|8|12|16 - generated primitive counter for stream 0|1|2|3
-       */
-
        nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled);
        {
           /* Add all streams' number to the same counter. */
-         nir_gds_atomic_add_amd(b, 32, num_prims_in_wave, nir_imm_int(b, 0),
-                                nir_imm_int(b, 0x100));
+         nir_atomic_add_gs_emit_prim_count_amd(b, num_prims_in_wave);
        }
        nir_pop_if(b, if_pipeline_query);
  
        nir_if *if_prim_gen_query = nir_push_if(b, prim_gen_query_enabled);
        {
           /* Add to the counter for this stream. */
-         nir_gds_atomic_add_amd(b, 32, num_prims_in_wave,
-                                nir_imm_int(b, 4 + nir_intrinsic_stream_id(intrin) * 4),
-                                nir_imm_int(b, 0x100));
+         nir_atomic_add_gen_prim_count_amd(
+            b, num_prims_in_wave, .stream_id = nir_intrinsic_stream_id(intrin));
        }
        nir_pop_if(b, if_prim_gen_query);
     }
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c

index 36b0c65..56396e9 100644 (file)
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -4425,6 +4425,21 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
        result = ac_build_gather_values(&ctx->ac, global_count, instr->num_components);
        break;
     }
+   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
+      ctx->abi->atomic_add_prim_count(ctx->abi, ~0U, get_src(ctx, instr->src[0]),
+                                      ac_prim_count_gs_emit);
+      break;
+   case nir_intrinsic_atomic_add_gen_prim_count_amd:
+   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
+      LLVMValueRef prim_count = get_src(ctx, instr->src[0]);
+      unsigned stream = nir_intrinsic_stream_id(instr);
+      enum ac_prim_count count_type =
+         instr->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
+         ac_prim_count_gen : ac_prim_count_xfb;
+
+      ctx->abi->atomic_add_prim_count(ctx->abi, stream, prim_count, count_type);
+      break;
+   }
     default:
        fprintf(stderr, "Unknown intrinsic: ");
        nir_print_instr(&instr->instr, stderr);
diff --git a/src/amd/llvm/ac_shader_abi.h b/src/amd/llvm/ac_shader_abi.h

index a1fe0b8..00f32c9 100644 (file)
--- a/src/amd/llvm/ac_shader_abi.h
+++ b/src/amd/llvm/ac_shader_abi.h
@@ -34,6 +34,12 @@
  
  #define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
  
+enum ac_prim_count {
+   ac_prim_count_gs_emit,
+   ac_prim_count_gen,
+   ac_prim_count_xfb,
+};
+
  /* Document the shader ABI during compilation. This is what allows radeonsi and
   * radv to share a compiler backend.
   */
@@ -69,6 +75,9 @@ struct ac_shader_abi {
     void (*emit_vertex_with_counter)(struct ac_shader_abi *abi, unsigned stream,
                                      LLVMValueRef vertexidx, LLVMValueRef *addrs);
  
+   void (*atomic_add_prim_count)(struct ac_shader_abi *abi, unsigned stream,
+                                 LLVMValueRef prim_count, enum ac_prim_count count_type);
+
     LLVMValueRef (*load_inputs)(struct ac_shader_abi *abi,
                                 unsigned driver_location, unsigned component,
                                 unsigned num_components, unsigned vertex_index,
diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c

index 742f05c..4212fc3 100644 (file)
--- a/src/amd/vulkan/radv_nir_lower_abi.c
+++ b/src/amd/vulkan/radv_nir_lower_abi.c
@@ -78,11 +78,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
     b->cursor = nir_before_instr(instr);
  
     nir_ssa_def *replacement = NULL;
+   bool progress = true;
  
     switch (intrin->intrinsic) {
     case nir_intrinsic_load_ring_tess_factors_amd:
-      if (s->use_llvm)
+      if (s->use_llvm) {
+         progress = false;
           break;
+      }
  
        replacement = load_ring(b, RING_HS_TESS_FACTOR, s);
        break;
@@ -90,8 +93,10 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
        replacement = ac_nir_load_arg(b, &s->args->ac, s->args->ac.tcs_factor_offset);
        break;
     case nir_intrinsic_load_ring_tess_offchip_amd:
-      if (s->use_llvm)
+      if (s->use_llvm) {
+         progress = false;
           break;
+      }
  
        replacement = load_ring(b, RING_HS_TESS_OFFCHIP, s);
        break;
@@ -111,8 +116,10 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
        }
        break;
     case nir_intrinsic_load_ring_esgs_amd:
-      if (s->use_llvm)
+      if (s->use_llvm) {
+         progress = false;
           break;
+      }
  
        replacement = load_ring(b, stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS, s);
        break;
@@ -322,14 +329,34 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
        replacement = nir_imm_int(b, provoking_vertex);
        break;
     }
+
+   /* GDS counters:
+    *   offset 0         - pipeline statistics counter for all streams
+    *   offset 4|8|12|16 - generated primitive counter for stream 0|1|2|3
+    */
+   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
+      nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa, nir_imm_int(b, 0), nir_imm_int(b, 0x100));
+      break;
+   case nir_intrinsic_atomic_add_gen_prim_count_amd:
+      nir_gds_atomic_add_amd(b, 32, intrin->src[0].ssa,
+                             nir_imm_int(b, 4 + nir_intrinsic_stream_id(intrin) * 4),
+                             nir_imm_int(b, 0x100));
+      break;
+   case nir_intrinsic_atomic_add_xfb_prim_count_amd:
+      /* No-op for RADV. */
+      break;
+
     default:
+      progress = false;
        break;
     }
  
-   if (!replacement)
+   if (!progress)
        return false;
  
-   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
+   if (replacement)
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
+
     nir_instr_remove(instr);
     nir_instr_free(instr);
  
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py

index bf6b205..d094401 100644 (file)
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1457,6 +1457,16 @@ intrinsic("ordered_xfb_counter_add_amd", dest_comp=0, src_comp=[1, 0], indices=[
  # Provoking vertex index in a primitive
  system_value("provoking_vtx_in_prim_amd", 1)
  
+# Atomically add the current wave's primitive count to the query result
+#   * GS emitted primitive is a primitive emitted by any GS stream
+#   * generated primitive is a primitive that has been produced for that stream by VS/TES/GS
+#   * streamout primitive is a primitive that has been written to the xfb buffer; may differ
+#     from generated primitive when the xfb buffer is too small to hold more primitives
+# src[] = { primitive_count }.
+intrinsic("atomic_add_gs_emit_prim_count_amd", [1])
+intrinsic("atomic_add_gen_prim_count_amd", [1], indices=[STREAM_ID])
+intrinsic("atomic_add_xfb_prim_count_amd", [1], indices=[STREAM_ID])
+
  # V3D-specific instrinc for tile buffer color reads.
  #
  # The hardware requires that we read the samples and components of a pixel
author	Qiang Yu <yuq825@gmail.com>
	Thu, 16 Jun 2022 07:54:54 +0000 (15:54 +0800)
committer	Marge Bot <emma+marge@anholt.net>
	Tue, 25 Oct 2022 12:58:43 +0000 (12:58 +0000)
src/amd/common/ac_nir_lower_ngg.c		patch \| blob \| history
src/amd/llvm/ac_nir_to_llvm.c		patch \| blob \| history
src/amd/llvm/ac_shader_abi.h		patch \| blob \| history
src/amd/vulkan/radv_nir_lower_abi.c		patch \| blob \| history
src/compiler/nir/nir_intrinsics.py		patch \| blob \| history