ac/nir/ngg: save and restore output bit size for gs
authorQiang Yu <yuq825@gmail.com>
Sat, 23 Jul 2022 08:22:49 +0000 (16:22 +0800)
committerMarge Bot <emma+marge@anholt.net>
Fri, 26 Aug 2022 05:50:30 +0000 (05:50 +0000)
radeonsi does not have io nir variables, so need to save output
bit size when lower store_output intrinsic.

Acked-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17651>

src/amd/common/ac_nir_lower_ngg.c

index 9f6e0e4..32f75b1 100644 (file)
@@ -84,6 +84,7 @@ typedef struct
 
 typedef struct
 {
+   nir_function_impl *impl;
    nir_variable *output_vars[VARYING_SLOT_MAX][4];
    nir_variable *current_clear_primflag_idx_var;
    int const_out_vtxcnt[4];
@@ -1712,7 +1713,15 @@ lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg
       info->stream = stream;
       info->components_mask |= BITFIELD_BIT(component_offset + comp);
 
-      nir_variable *var = s->output_vars[location][component_offset + comp];
+      unsigned component = component_offset + comp;
+      nir_variable *var = s->output_vars[location][component];
+      if (!var) {
+         var = nir_local_variable_create(
+            s->impl, glsl_uintN_t_type(store_val->bit_size), "output");
+         s->output_vars[location][component] = var;
+      }
+      assert(glsl_base_type_bit_size(glsl_get_base_type(var->type)) == store_val->bit_size);
+
       nir_store_var(b, var, nir_channel(b, store_val, comp), 0x1u);
    }
 
@@ -1747,10 +1756,22 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
          u_bit_scan_consecutive_range(&mask, &start, &count);
          nir_ssa_def *values[4] = {0};
          for (int c = start; c < start + count; ++c) {
+            nir_variable *var = s->output_vars[slot][c];
+            if (!var) {
+               /* no one write to this output before */
+               values[c - start] = nir_ssa_undef(b, 1, 32);
+               continue;
+            }
+
             /* Load output from variable. */
-            values[c - start] = nir_load_var(b, s->output_vars[slot][c]);
+            nir_ssa_def *val = nir_load_var(b, var);
+
+            /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
+            unsigned bit_size = glsl_base_type_bit_size(glsl_get_base_type(var->type));
+            values[c - start] = bit_size == 32 ? val : nir_u2u32(b, val);
+
             /* Clear the variable (it is undefined after emit_vertex) */
-            nir_store_var(b, s->output_vars[slot][c], nir_ssa_undef(b, 1, 32), 0x1);
+            nir_store_var(b, s->output_vars[slot][c], nir_ssa_undef(b, 1, bit_size), 0x1);
          }
 
          nir_ssa_def *store_val = nir_vec(b, values, (unsigned)count);
@@ -1893,16 +1914,6 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
       exported_out_vtx_lds_addr = ngg_gs_out_vertex_addr(b, nir_u2u32(b, exported_vtx_idx), s);
    }
 
-   /* Remember proper bit sizes of output variables. */
-   uint8_t out_bitsizes[VARYING_SLOT_MAX];
-   memset(out_bitsizes, 32, VARYING_SLOT_MAX);
-   nir_foreach_shader_out_variable(var, b->shader) {
-      /* Check 8/16-bit. All others should be lowered to 32-bit already. */
-      unsigned bit_size = glsl_base_type_bit_size(glsl_get_base_type(glsl_without_array(var->type)));
-      if (bit_size == 8 || bit_size == 16)
-         out_bitsizes[var->data.location] = bit_size;
-   }
-
    for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
       if (!(b->shader->info.outputs_written & BITFIELD64_BIT(slot)))
          continue;
@@ -1923,13 +1934,21 @@ ngg_gs_export_vertices(nir_builder *b, nir_ssa_def *max_num_out_vtx, nir_ssa_def
                             .base = packed_location * 16 + start * 4,
                             .align_mul = 4);
 
-         /* Convert to the expected bit size of the output variable. */
-         if (out_bitsizes[slot] != 32)
-            load = nir_u2u(b, load, out_bitsizes[slot]);
+         for (int i = 0; i < count; i++) {
+            nir_variable *var = s->output_vars[slot][start + i];
+            assert(var);
 
-         nir_store_output(b, load, nir_imm_int(b, 0), .base = info->base,
-                          .io_semantics = io_sem, .component = start,
-                          .write_mask = BITFIELD_MASK(count));
+            nir_ssa_def *val = nir_channel(b, load, i);
+
+            /* Convert to the expected bit size of the output variable. */
+            unsigned bit_size = glsl_base_type_bit_size(glsl_get_base_type(var->type));
+            if (bit_size != 32)
+               val = nir_u2u(b, val, bit_size);
+
+            nir_store_output(b, val, nir_imm_int(b, 0), .base = info->base,
+                             .io_semantics = io_sem, .component = start + i,
+                             .write_mask = 1);
+         }
       }
    }
 
@@ -2043,6 +2062,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader,
    assert(impl);
 
    lower_ngg_gs_state state = {
+      .impl = impl,
       .max_num_waves = DIV_ROUND_UP(max_workgroup_size, wave_size),
       .wave_size = wave_size,
       .lds_addr_gs_out_vtx = esgs_ring_lds_bytes,
@@ -2088,13 +2108,6 @@ ac_nir_lower_ngg_gs(nir_shader *shader,
    /* Wrap the GS control flow. */
    nir_if *if_gs_thread = nir_push_if(b, nir_has_input_primitive_amd(b));
 
-   /* Create and initialize output variables */
-   for (unsigned slot = 0; slot < VARYING_SLOT_MAX; ++slot) {
-      for (unsigned comp = 0; comp < 4; ++comp) {
-         state.output_vars[slot][comp] = nir_local_variable_create(impl, glsl_uint_type(), "output");
-      }
-   }
-
    nir_cf_reinsert(&extracted, b->cursor);
    b->cursor = nir_after_cf_list(&if_gs_thread->then_list);
    nir_pop_if(b, if_gs_thread);