freedreno+turnip: Upload large shader constants as a UBO.

author Eric Anholt <eric@anholt.net>
Tue, 7 Jul 2020 18:56:35 +0000 (11:56 -0700)

committer Eric Anholt <eric@anholt.net>
Mon, 16 Nov 2020 21:55:41 +0000 (13:55 -0800)
diff --git a/.gitlab-ci/deqp-freedreno-a307-fails.txt b/.gitlab-ci/deqp-freedreno-a307-fails.txt

index 060d10c..fa6a12d 100644 (file)
--- a/.gitlab-ci/deqp-freedreno-a307-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a307-fails.txt
@@ -388,10 +388,6 @@ dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp,Fail
  dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail
  dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail
  dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail
  dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail
  dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail
  dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt

index 4d8c2a6..2a555e2 100644 (file)
--- a/.gitlab-ci/deqp-freedreno-a630-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt
@@ -1,8 +1,4 @@
  
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail
  dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail
  dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail
  dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail
diff --git a/src/freedreno/computerator/ir3_asm.c b/src/freedreno/computerator/ir3_asm.c

index e1e845a..a976bed 100644 (file)
--- a/src/freedreno/computerator/ir3_asm.c
+++ b/src/freedreno/computerator/ir3_asm.c
@@ -42,7 +42,7 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
         kernel->base.num_bufs = kernel->info.num_bufs;
         memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
  
-       unsigned sz = v->info.sizedwords * 4;
+       unsigned sz = v->info.size;
  
         v->bo = fd_bo_new(c->dev, sz,
                         DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c

index 84aa8eb..2f2612d 100644 (file)
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -942,15 +942,24 @@ void * ir3_assemble(struct ir3_shader_variant *v)
          * doesn't try to decode the following data as instructions (such as the
          * next stage's shader in turnip)
          */
-       info->sizedwords = MAX2(v->instrlen * compiler->instr_align,
-                       instr_count + 4) * sizeof(instr_t) / 4;
+       info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) *
+               sizeof(instr_t);
+       info->sizedwords = info->size / 4;
+
+       if (v->constant_data_size) {
+               /* Make sure that where we're about to place the constant_data is safe
+                * to indirectly upload from.
+                */
+               info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16);
+               info->size = info->constant_data_offset + v->constant_data_size;
+       }
  
         /* Pad out the size so that when turnip uploads the shaders in
          * sequence, the starting offset of the next one is properly aligned.
          */
-       info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4);
+       info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
  
-       ptr = dwords = rzalloc_size(v, 4 * info->sizedwords);
+       ptr = dwords = rzalloc_size(v, info->size);
  
         foreach_block (block, &shader->block_list) {
                 unsigned sfu_delay = 0;
@@ -1003,6 +1012,14 @@ void * ir3_assemble(struct ir3_shader_variant *v)
                 }
         }
  
+       /* Append the NIR constant data after the end of the program.  This lets
+        * us emit it as an indirect load, while avoiding creating another BO.
+        */
+       if (v->constant_data_size)
+               memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
+       ralloc_free(v->constant_data);
+       v->constant_data = NULL;
+
         return ptr;
  
  fail:
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h

index cb42636..262f2a2 100644 (file)
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -45,6 +45,13 @@ struct ir3_block;
  
  struct ir3_info {
         void *data;              /* used internally in ir3 assembler */
+       /* Size in bytes of the shader binary, including NIR constants and
+        * padding
+        */
+       uint32_t size;
+       /* byte offset from start of the shader to the NIR constant data. */
+       uint32_t constant_data_offset;
+       /* Size in dwords of the instructions. */
         uint16_t sizedwords;
         uint16_t instrs_count;   /* expanded to account for rpt's */
         uint16_t nops_count;     /* # of nop instructions, including nopN */
diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c

index 7872671..29a2c8c 100644 (file)
--- a/src/freedreno/ir3/ir3_disk_cache.c
+++ b/src/freedreno/ir3/ir3_disk_cache.c
@@ -126,8 +126,8 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
          * pointers need special handling:
          */
  
-       v->bin = rzalloc_size(v, 4 * v->info.sizedwords);
-       blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords);
+       v->bin = rzalloc_size(v, v->info.size);
+       blob_copy_bytes(blob, v->bin, v->info.size);
  
         if (!v->binning_pass) {
                 blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
@@ -147,7 +147,9 @@ store_variant(struct blob *blob, struct ir3_shader_variant *v)
          * pointers need special handling:
          */
  
-       blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords);
+       blob_write_bytes(blob, v->bin, v->info.size);
+
+       /* No saving constant_data, it's already baked into bin at this point. */
  
         if (!v->binning_pass) {
                 blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c

index 29ab296..d6d891a 100644 (file)
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -495,11 +495,25 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
                 progress |= OPT(s, nir_lower_tex, &tex_options);
         }
  
+       /* Move large constant variables to the constants attached to the NIR
+        * shader, which we will expose to the shader as a UBO.  This generates
+        * amuls, so we need to clean those up after.
+        *
+        * Passing no size_align, we would get packed values, which if we end up
+        * having to load with LDC would result in extra reads to unpack from
+        * straddling loads.  Align everything to vec4 to avoid that, though we
+        * could theoretically do better.
+        */
+       OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
+       OPT_V(s, ir3_nir_lower_load_constant, so);
+
         if (!so->binning_pass)
                 OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
  
         progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
  
+       OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
         /* UBO offset lowering has to come after we've decided what will
          * be left as load_ubo
          */
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h

index d716e53..17dc4aa 100644 (file)
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -59,6 +59,7 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
  
  void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                 struct ir3_const_state *const_state);
+bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
  void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
  bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
  bool ir3_nir_fixup_load_uniform(nir_shader *nir);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c

index 8e7f9aa..a1c06b9 100644 (file)
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -530,3 +530,94 @@ ir3_nir_fixup_load_uniform(nir_shader *nir)
                         fixup_load_uniform_filter, fixup_load_uniform_instr,
                         NULL);
  }
+static nir_ssa_def *
+ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
+{
+       struct ir3_const_state *const_state = data;
+       nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+       /* Rewrite this load_constant as a load_ubo reading from the UBO that
+        * backs the shader's constant data, and return the replacement value.
+        *
+        * Pick a UBO index to use as our constant data.  Skip UBO 0 since that's
+        * reserved for gallium's cb0.
+        *
+        * The index is allocated only once, on the first load we lower (while
+        * constant_data_ubo is still -1); every later load reuses the index
+        * recorded in const_state.
+        */
+       if (const_state->constant_data_ubo == -1) {
+               if (b->shader->info.num_ubos == 0)
+                       b->shader->info.num_ubos++;
+               const_state->constant_data_ubo = b->shader->info.num_ubos++;
+       }
+
+       unsigned num_components = instr->num_components;
+       if (nir_dest_bit_size(instr->dest) == 16) {
+               /* We can't do 16b loads -- either from LDC (32-bit only in any of our
+                * traces, and disasm that doesn't look like it really supports it) or
+                * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
+                * automatic 32b-to-16b conversions when we ask for 16b from it).
+                * Instead, we'll load 32b from a UBO and unpack from there.
+                *
+                * Each 32b component carries two 16b values, so fetch half as many
+                * components (rounded up).  After the load is emitted, the result is
+                * bitcast back to 16b and trimmed to the component count that was
+                * originally requested.
+                */
+               num_components = DIV_ROUND_UP(num_components, 2);
+       }
+       unsigned base = nir_intrinsic_base(instr);
+       nir_intrinsic_instr *load =
+               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+       load->num_components = num_components;
+       nir_ssa_dest_init(&load->instr, &load->dest,
+                       load->num_components, 32,
+                       instr->dest.ssa.name);
+
+       load->src[0] = nir_src_for_ssa(nir_imm_int(b,
+                                       const_state->constant_data_ubo));
+       load->src[1] = nir_src_for_ssa(nir_iadd_imm(b,
+                                       nir_ssa_for_src(b, instr->src[0], 1), base));
+
+       nir_intrinsic_set_align(load,
+                       nir_intrinsic_align_mul(instr),
+                       nir_intrinsic_align_offset(instr));
+       nir_intrinsic_set_range_base(load, base);
+       nir_intrinsic_set_range(load, nir_intrinsic_range(instr));
+
+       nir_builder_instr_insert(b, &load->instr);
+
+       nir_ssa_def *result = &load->dest.ssa;
+       if (nir_dest_bit_size(instr->dest) == 16) {
+               result = nir_bitcast_vector(b, result, 16);
+               result = nir_channels(b, result, BITSET_MASK(instr->num_components));
+       }
+
+       return result;
+}
+
+static bool
+ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
+{
+        return (instr->type == nir_instr_type_intrinsic &&
+                nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+}
+
+/* Lowers load_constant intrinsics to UBO accesses so we can run them through
+ * the general "upload to const file or leave as UBO access" code.
+ *
+ * ir3_lower_load_const_filter() above selects the instructions to rewrite:
+ * only intrinsics whose opcode is nir_intrinsic_load_constant match.
+ *
+ * constant_data_ubo is reset to -1 before lowering and is only assigned a
+ * real index by ir3_nir_lower_load_const_instr() when a load is rewritten.
+ *
+ * Returns the progress flag reported by nir_shader_lower_instructions().
+ * When it is set, the variant also receives its own copy of
+ * nir->constant_data (v->constant_data / v->constant_data_size).
+ */
+bool
+ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
+{
+       struct ir3_const_state *const_state = ir3_const_state(v);
+
+       const_state->constant_data_ubo = -1;
+
+       bool progress = nir_shader_lower_instructions(nir,
+                       ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
+                       const_state);
+
+       if (progress) {
+               struct ir3_compiler *compiler = v->shader->compiler;
+
+               /* Save a copy of the NIR constant data to the variant for
+                * inclusion in the final assembly.  The allocation is rounded up
+                * to a multiple of const_upload_unit * 16 bytes; only
+                * nir->constant_data_size bytes are copied into it.
+                */
+               v->constant_data_size = align(nir->constant_data_size,
+                               compiler->const_upload_unit * 4 * sizeof(uint32_t));
+               v->constant_data = rzalloc_size(v, v->constant_data_size);
+               memcpy(v->constant_data, nir->constant_data,
+                               nir->constant_data_size);
+       }
+
+       return progress;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h

index 36aba4f..bba3c62 100644 (file)
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -157,6 +157,9 @@ struct ir3_const_state {
         unsigned num_ubos;
         unsigned num_driver_params;   /* scalar */
  
+       /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+       int32_t constant_data_ubo;
+
         struct {
                 /* user const start at zero */
                 unsigned ubo;
@@ -504,6 +507,12 @@ struct ir3_shader_variant {
         gl_shader_stage type;
         struct ir3_shader *shader;
  
+       /* variant's copy of nir->constant_data (since we don't track the NIR in
+        * the variant, and shader->nir is before the opt pass).  Moves to v->bin
+        * after assembly.
+        */
+       void *constant_data;
+
         /*
          * Below here is serialized when written to disk cache:
          */
@@ -525,6 +534,8 @@ struct ir3_shader_variant {
  
         struct ir3_info info;
  
+       uint32_t constant_data_size;
+
         /* Levels of nesting of flow control:
          */
         unsigned branchstack;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c

index dfcaca9..41d9c81 100644 (file)
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -3013,7 +3013,8 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
  {
     const struct tu_program_descriptor_linkage *link =
        &pipeline->program.link[type];
-   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
+   const struct ir3_const_state *const_state = &link->const_state;
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
  
     if (link->push_consts.count > 0) {
        unsigned num_units = link->push_consts.count;
@@ -3048,9 +3049,14 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
        debug_assert((offset % 16) == 0);
  
        /* Dig out the descriptor from the descriptor state and read the VA from
-       * it.
+       * it.  All our UBOs are bindless with the exception of the NIR
+       * constant_data, which is uploaded once in the pipeline.
         */
-      assert(state->range[i].ubo.bindless);
+      if (!state->range[i].ubo.bindless) {
+         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
+         continue;
+      }
+
        uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
           descriptors_state->dynamic_descriptors :
           descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c

index b2f8c63..dde1123 100644 (file)
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -453,19 +453,61 @@ tu6_emit_xs_config(struct tu_cs *cs,
      */
     size = MIN2(size + base, xs->constlen) - base;
  
-   if (size <= 0)
-      return;
+   if (size > 0) {
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(size));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
  
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                  CP_LOAD_STATE6_0_NUM_UNIT(size));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_array(cs, const_state->immediates, size * 4);
+   }
+
+   if (const_state->constant_data_ubo != -1) {
+      uint64_t iova = binary_iova + xs->info.constant_data_offset;
  
-   tu_cs_emit_array(cs, const_state->immediates, size * 4);
+      /* Upload UBO state for the constant data. */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs,
+                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
+      tu_cs_emit_qw(cs,
+                    iova |
+                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
+
+      /* Upload the constant data to the const file if needed. */
+      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
+
+      for (int i = 0; i < ubo_state->num_enabled; i++) {
+         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
+             ubo_state->range[i].ubo.bindless) {
+            continue;
+         }
+
+         uint32_t start = ubo_state->range[i].start;
+         uint32_t end = ubo_state->range[i].end;
+         uint32_t size = MIN2(end - start,
+                              (16 * xs->constlen) - ubo_state->range[i].offset);
+
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+         tu_cs_emit(cs,
+                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+         tu_cs_emit_qw(cs, iova + start);
+      }
+   }
  }
  
  static void
@@ -1939,12 +1981,12 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
     if (builder) {
        for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
           if (builder->variants[i])
-            size += builder->variants[i]->info.sizedwords;
+            size += builder->variants[i]->info.size / 4;
        }
  
-      size += builder->binning_variant->info.sizedwords;
+      size += builder->binning_variant->info.size / 4;
     } else {
-      size += compute->info.sizedwords;
+      size += compute->info.size / 4;
     }
  
     tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
@@ -2016,12 +2058,12 @@ tu_upload_variant(struct tu_pipeline *pipeline,
        return 0;
  
     /* this expects to get enough alignment because shaders are allocated first
-    * and sizedwords is always aligned correctly
+    * and total size is always aligned correctly
      * note: an assert in tu6_emit_xs_config validates the alignment
      */
-   tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
+   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
  
-   memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
+   memcpy(memory.map, variant->bin, variant->info.size);
     return memory.iova;
  }
  
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c

index 020fbf5..78b7b05 100644 (file)
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
@@ -248,6 +248,16 @@ fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
         OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
  
         for (int i = 0; i < num_ubos; i++) {
+               /* NIR constant data is packed into the end of the shader. */
+               if (i == const_state->constant_data_ubo) {
+                       int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
+                       OUT_RELOC(ring, v->bo,
+                                       v->info.constant_data_offset,
+                                       (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+                                       0);
+                       continue;
+               }
+
                 struct pipe_constant_buffer *cb = &constbuf->cb[i];
  
                 /* If we have user pointers (constbuf 0, aka GL uniforms), upload them
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h

index 2c9c560..4dc36c4 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -107,6 +107,44 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state,
  }
  
  /**
+ * Uploads the referenced subranges of the nir constant_data to the hardware's
+ * constant buffer.
+ *
+ * Only enabled UBO ranges whose block equals the variant's constant_data_ubo
+ * are considered.  Each such range is clamped against 16 * v->constlen and
+ * then passed to emit_const_bo() together with v->bo and the offset
+ * v->info.constant_data_offset + range start.
+ *
+ * NOTE(review): offsets and sizes are compared against 16 * constlen here and
+ * divided by 4 for emit_const_bo(), presumably converting bytes to dwords --
+ * confirm against emit_const_bo().
+ */
+static inline void
+ir3_emit_constant_data(struct fd_screen *screen,
+               const struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+       const struct ir3_const_state *const_state = ir3_const_state(v);
+       const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+       for (unsigned i = 0; i < state->num_enabled; i++) {
+               unsigned ubo = state->range[i].ubo.block;
+               if (ubo != const_state->constant_data_ubo)
+                       continue;
+
+               uint32_t size = state->range[i].end - state->range[i].start;
+
+               /* Pre-a6xx, we might have ranges enabled in the shader that aren't
+                * used in the binning variant.
+                */
+               if (16 * v->constlen <= state->range[i].offset)
+                       continue;
+
+               /* and even if the start of the const buffer is before
+                * first_immediate, the end may not be:
+                */
+               size = MIN2(size, (16 * v->constlen) - state->range[i].offset);
+
+               if (size == 0)
+                       continue;
+
+               emit_const_bo(ring, v, state->range[i].offset / 4,
+                               v->info.constant_data_offset + state->range[i].start,
+                               size / 4, v->bo);
+       }
+}
+
+/**
   * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access
   * outside of these ranges will be done using full UBO accesses in the
   * shader).
@@ -121,8 +159,10 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *
         for (unsigned i = 0; i < state->num_enabled; i++) {
                 assert(!state->range[i].ubo.bindless);
                 unsigned ubo = state->range[i].ubo.block;
-               if (!(constbuf->enabled_mask & (1 << ubo)))
+               if (!(constbuf->enabled_mask & (1 << ubo)) ||
+                               ubo == const_state->constant_data_ubo) {
                         continue;
+               }
                 struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
  
                 uint32_t size = state->range[i].end - state->range[i].start;
@@ -176,6 +216,12 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                 struct fd_bo *bos[params];
  
                 for (uint32_t i = 0; i < params; i++) {
+                       if (i == const_state->constant_data_ubo) {
+                               bos[i] = v->bo;
+                               offsets[i] = v->info.constant_data_offset;
+                               continue;
+                       }
+
                         struct pipe_constant_buffer *cb = &constbuf->cb[i];
  
                         /* If we have user pointers (constbuf 0, aka GL uniforms), upload
@@ -299,6 +345,11 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v
  
         if (size > 0)
                 emit_const_user(ring, v, base, size, const_state->immediates);
+
+       /* NIR constant data has the same lifetime as immediates, so upload it
+        * now, too.
+        */
+       ir3_emit_constant_data(screen, v, ring);
  }
  
  static inline void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c

index cb28ed5..5a79a76 100644 (file)
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -86,9 +86,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
  
         assert(!v->bo);
  
-       unsigned sz = v->info.sizedwords * 4;
-
-       v->bo = fd_bo_new(compiler->dev, sz,
+       v->bo = fd_bo_new(compiler->dev, v->info.size,
                         DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
                         DRM_FREEDRENO_GEM_TYPE_KMEM,
                         "%s:%s", ir3_shader_stage(v), info->name);
@@ -96,7 +94,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
         /* Always include shaders in kernel crash dumps. */
         fd_bo_mark_for_dump(v->bo);
  
-       memcpy(fd_bo_map(v->bo), v->bin, sz);
+       memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
  }
  
  struct ir3_shader_variant *
author	Eric Anholt <eric@anholt.net>
	Tue, 7 Jul 2020 18:56:35 +0000 (11:56 -0700)
committer	Eric Anholt <eric@anholt.net>
	Mon, 16 Nov 2020 21:55:41 +0000 (13:55 -0800)
.gitlab-ci/deqp-freedreno-a307-fails.txt		patch \| blob \| history
.gitlab-ci/deqp-freedreno-a630-fails.txt		patch \| blob \| history
src/freedreno/computerator/ir3_asm.c		patch \| blob \| history
src/freedreno/ir3/ir3.c		patch \| blob \| history
src/freedreno/ir3/ir3.h		patch \| blob \| history
src/freedreno/ir3/ir3_disk_cache.c		patch \| blob \| history
src/freedreno/ir3/ir3_nir.c		patch \| blob \| history
src/freedreno/ir3/ir3_nir.h		patch \| blob \| history
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c		patch \| blob \| history
src/freedreno/ir3/ir3_shader.h		patch \| blob \| history
src/freedreno/vulkan/tu_cmd_buffer.c		patch \| blob \| history
src/freedreno/vulkan/tu_pipeline.c		patch \| blob \| history
src/gallium/drivers/freedreno/a6xx/fd6_const.c		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_const.h		patch \| blob \| history
src/gallium/drivers/freedreno/ir3/ir3_gallium.c		patch \| blob \| history