freedreno+turnip: Upload large shader constants as a UBO.
author Eric Anholt <eric@anholt.net>
Tue, 7 Jul 2020 18:56:35 +0000 (11:56 -0700)
committer Eric Anholt <eric@anholt.net>
Mon, 16 Nov 2020 21:55:41 +0000 (13:55 -0800)
Right now, if the shader indirectly indexes some large constant array, we
see NIR load_consts of its contents (usually from the const file) into
general registers, followed by indirection on the GPRs.  This often results
in register allocation failures, as it's easy to go beyond the ~256 dwords
of registers per invocation.

By moving the large constants to a UBO, we can load an arbitrary number of
them.  In theory they can also be moved to the constant reg file (~2k
dwords), though you're unlikely to hit this path without an indirect load
on your large constant, and we don't yet let indirect UBO loads get moved
to constant regs.

This might not work correctly if we have 16-bit load_constants, but until
other in-flight MRs land we won't see 16-bit temps being lowered to this
path.

This allows 2 kerbal-space-program shaders to compile that previously
would fail, and fixes the new dEQP-VK and -GLES2 tests I wrote that
dynamically index a 40-element temporary array of float/vec2/vec3/vec4
with constant element initializers.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2789
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5810>

15 files changed:
.gitlab-ci/deqp-freedreno-a307-fails.txt
.gitlab-ci/deqp-freedreno-a630-fails.txt
src/freedreno/computerator/ir3_asm.c
src/freedreno/ir3/ir3.c
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_disk_cache.c
src/freedreno/ir3/ir3_nir.c
src/freedreno/ir3/ir3_nir.h
src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
src/freedreno/ir3/ir3_shader.h
src/freedreno/vulkan/tu_cmd_buffer.c
src/freedreno/vulkan/tu_pipeline.c
src/gallium/drivers/freedreno/a6xx/fd6_const.c
src/gallium/drivers/freedreno/ir3/ir3_const.h
src/gallium/drivers/freedreno/ir3/ir3_gallium.c

diff --git a/.gitlab-ci/deqp-freedreno-a307-fails.txt b/.gitlab-ci/deqp-freedreno-a307-fails.txt
index 060d10c..fa6a12d 100644 (file)
@@ -388,10 +388,6 @@ dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail
 dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail
 dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail
 dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt
index 4d8c2a6..2a555e2 100644 (file)
@@ -1,8 +1,4 @@
 
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail
 dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail
diff --git a/src/freedreno/computerator/ir3_asm.c b/src/freedreno/computerator/ir3_asm.c
index e1e845a..a976bed 100644 (file)
@@ -42,7 +42,7 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in)
        kernel->base.num_bufs = kernel->info.num_bufs;
        memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
 
-       unsigned sz = v->info.sizedwords * 4;
+       unsigned sz = v->info.size;
 
        v->bo = fd_bo_new(c->dev, sz,
                        DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 84aa8eb..2f2612d 100644 (file)
@@ -942,15 +942,24 @@ void * ir3_assemble(struct ir3_shader_variant *v)
         * doesn't try to decode the following data as instructions (such as the
         * next stage's shader in turnip)
         */
-       info->sizedwords = MAX2(v->instrlen * compiler->instr_align,
-                       instr_count + 4) * sizeof(instr_t) / 4;
+       info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) *
+               sizeof(instr_t);
+       info->sizedwords = info->size / 4;
+
+       if (v->constant_data_size) {
+               /* Make sure that where we're about to place the constant_data is safe
+                * to indirectly upload from.
+                */
+               info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16);
+               info->size = info->constant_data_offset + v->constant_data_size;
+       }
 
        /* Pad out the size so that when turnip uploads the shaders in
         * sequence, the starting offset of the next one is properly aligned.
         */
-       info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4);
+       info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
 
-       ptr = dwords = rzalloc_size(v, 4 * info->sizedwords);
+       ptr = dwords = rzalloc_size(v, info->size);
 
        foreach_block (block, &shader->block_list) {
                unsigned sfu_delay = 0;
@@ -1003,6 +1012,14 @@ void * ir3_assemble(struct ir3_shader_variant *v)
                }
        }
 
+       /* Append the immediates after the end of the program.  This lets us emit
+        * the immediates as an indirect load, while avoiding creating another BO.
+        */
+       if (v->constant_data_size)
+               memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
+       ralloc_free(v->constant_data);
+       v->constant_data = NULL;
+
        return ptr;
 
 fail:
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index cb42636..262f2a2 100644 (file)
@@ -45,6 +45,13 @@ struct ir3_block;
 
 struct ir3_info {
        void *data;              /* used internally in ir3 assembler */
+       /* Size in bytes of the shader binary, including NIR constants and
+        * padding
+        */
+       uint32_t size;
+       /* byte offset from start of the shader to the NIR constant data. */
+       uint32_t constant_data_offset;
+       /* Size in dwords of the instructions. */
        uint16_t sizedwords;
        uint16_t instrs_count;   /* expanded to account for rpt's */
        uint16_t nops_count;     /* # of nop instructions, including nopN */
diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c
index 7872671..29a2c8c 100644 (file)
@@ -126,8 +126,8 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
         * pointers need special handling:
         */
 
-       v->bin = rzalloc_size(v, 4 * v->info.sizedwords);
-       blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords);
+       v->bin = rzalloc_size(v, v->info.size);
+       blob_copy_bytes(blob, v->bin, v->info.size);
 
        if (!v->binning_pass) {
                blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
@@ -147,7 +147,9 @@ store_variant(struct blob *blob, struct ir3_shader_variant *v)
         * pointers need special handling:
         */
 
-       blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords);
+       blob_write_bytes(blob, v->bin, v->info.size);
+
+       /* No saving constant_data, it's already baked into bin at this point. */
 
        if (!v->binning_pass) {
                blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 29ab296..d6d891a 100644 (file)
@@ -495,11 +495,25 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
                progress |= OPT(s, nir_lower_tex, &tex_options);
        }
 
+       /* Move large constant variables to the constants attached to the NIR
+        * shader, which we will upload in the immediates range.  This generates
+        * amuls, so we need to clean those up after.
+        *
+        * Passing no size_align, we would get packed values, which if we end up
+        * having to load with LDC would result in extra reads to unpack from
+        * straddling loads.  Align everything to vec4 to avoid that, though we
+        * could theoretically do better.
+        */
+       OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
+       OPT_V(s, ir3_nir_lower_load_constant, so);
+
        if (!so->binning_pass)
                OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
 
        progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
 
+       OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
        /* UBO offset lowering has to come after we've decided what will
         * be left as load_ubo
         */
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index d716e53..17dc4aa 100644 (file)
@@ -59,6 +59,7 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
 
 void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                struct ir3_const_state *const_state);
+bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_fixup_load_uniform(nir_shader *nir);
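diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 8e7f9aa..a1c06b9 100644 (file)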
@@ -530,3 +530,94 @@ ir3_nir_fixup_load_uniform(nir_shader *nir)
                        fixup_load_uniform_filter, fixup_load_uniform_instr,
                        NULL);
 }
+static nir_ssa_def *
+ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
+{
+       struct ir3_const_state *const_state = data;
+       nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+       /* Pick a UBO index to use as our constant data.  Skip UBO 0 since that's
+        * reserved for gallium's cb0.
+        */
+       if (const_state->constant_data_ubo == -1) {
+               if (b->shader->info.num_ubos == 0)
+                       b->shader->info.num_ubos++;
+               const_state->constant_data_ubo = b->shader->info.num_ubos++;
+       }
+
+       unsigned num_components = instr->num_components;
+       if (nir_dest_bit_size(instr->dest) == 16) {
+               /* We can't do 16b loads -- either from LDC (32-bit only in any of our
+                * traces, and disasm that doesn't look like it really supports it) or
+                * from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
+                * automatic 32b-to-16b conversions when we ask for 16b from it).
+                * Instead, we'll load 32b from a UBO and unpack from there.
+                */
+               num_components = DIV_ROUND_UP(num_components, 2);
+       }
+       unsigned base = nir_intrinsic_base(instr);
+       nir_intrinsic_instr *load =
+               nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+       load->num_components = num_components;
+       nir_ssa_dest_init(&load->instr, &load->dest,
+                       load->num_components, 32,
+                       instr->dest.ssa.name);
+
+       load->src[0] = nir_src_for_ssa(nir_imm_int(b,
+                                       const_state->constant_data_ubo));
+       load->src[1] = nir_src_for_ssa(nir_iadd_imm(b,
+                                       nir_ssa_for_src(b, instr->src[0], 1), base));
+
+       nir_intrinsic_set_align(load,
+                       nir_intrinsic_align_mul(instr),
+                       nir_intrinsic_align_offset(instr));
+       nir_intrinsic_set_range_base(load, base);
+       nir_intrinsic_set_range(load, nir_intrinsic_range(instr));
+
+       nir_builder_instr_insert(b, &load->instr);
+
+       nir_ssa_def *result = &load->dest.ssa;
+       if (nir_dest_bit_size(instr->dest) == 16) {
+               result = nir_bitcast_vector(b, result, 16);
+               result = nir_channels(b, result, BITSET_MASK(instr->num_components));
+       }
+
+       return result;
+}
+
+static bool
+ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
+{
+        return (instr->type == nir_instr_type_intrinsic &&
+                nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+}
+
+/* Lowers load_constant intrinsics to UBO accesses so we can run them through
+ * the general "upload to const file or leave as UBO access" code.
+ */
+bool
+ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
+{
+       struct ir3_const_state *const_state = ir3_const_state(v);
+
+       const_state->constant_data_ubo = -1;
+
+       bool progress = nir_shader_lower_instructions(nir,
+                       ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
+                       const_state);
+
+       if (progress) {
+               struct ir3_compiler *compiler = v->shader->compiler;
+
+               /* Save a copy of the NIR constant data to the variant for
+                       * inclusion in the final assembly.
+                       */
+               v->constant_data_size = align(nir->constant_data_size,
+                               compiler->const_upload_unit * 4 * sizeof(uint32_t));
+               v->constant_data = rzalloc_size(v, v->constant_data_size);
+               memcpy(v->constant_data, nir->constant_data,
+                               nir->constant_data_size);
+       }
+
+       return progress;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 36aba4f..bba3c62 100644 (file)
@@ -157,6 +157,9 @@ struct ir3_const_state {
        unsigned num_ubos;
        unsigned num_driver_params;   /* scalar */
 
+       /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+       int32_t constant_data_ubo;
+
        struct {
                /* user const start at zero */
                unsigned ubo;
@@ -504,6 +507,12 @@ struct ir3_shader_variant {
        gl_shader_stage type;
        struct ir3_shader *shader;
 
+       /* variant's copy of nir->constant_data (since we don't track the NIR in
+        * the variant, and shader->nir is before the opt pass).  Moves to v->bin
+        * after assembly.
+        */
+       void *constant_data;
+
        /*
         * Below here is serialized when written to disk cache:
         */
@@ -525,6 +534,8 @@ struct ir3_shader_variant {
 
        struct ir3_info info;
 
+       uint32_t constant_data_size;
+
        /* Levels of nesting of flow control:
         */
        unsigned branchstack;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index dfcaca9..41d9c81 100644 (file)
@@ -3013,7 +3013,8 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
 {
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
-   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
+   const struct ir3_const_state *const_state = &link->const_state;
+   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
 
    if (link->push_consts.count > 0) {
       unsigned num_units = link->push_consts.count;
@@ -3048,9 +3049,14 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
       debug_assert((offset % 16) == 0);
 
       /* Dig out the descriptor from the descriptor state and read the VA from
-       * it.
+       * it.  All our UBOs are bindless with the exception of the NIR
+       * constant_data, which is uploaded once in the pipeline.
        */
-      assert(state->range[i].ubo.bindless);
+      if (!state->range[i].ubo.bindless) {
+         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
+         continue;
+      }
+
       uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
          descriptors_state->dynamic_descriptors :
          descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index b2f8c63..dde1123 100644 (file)
@@ -453,19 +453,61 @@ tu6_emit_xs_config(struct tu_cs *cs,
     */
    size = MIN2(size + base, xs->constlen) - base;
 
-   if (size <= 0)
-      return;
+   if (size > 0) {
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(size));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                  CP_LOAD_STATE6_0_NUM_UNIT(size));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      tu_cs_emit_array(cs, const_state->immediates, size * 4);
+   }
+
+   if (const_state->constant_data_ubo != -1) {
+      uint64_t iova = binary_iova + xs->info.constant_data_offset;
 
-   tu_cs_emit_array(cs, const_state->immediates, size * 4);
+      /* Upload UBO state for the constant data. */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs,
+                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
+      tu_cs_emit_qw(cs,
+                    iova |
+                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
+
+      /* Upload the constant data to the const file if needed. */
+      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
+
+      for (int i = 0; i < ubo_state->num_enabled; i++) {
+         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
+             ubo_state->range[i].ubo.bindless) {
+            continue;
+         }
+
+         uint32_t start = ubo_state->range[i].start;
+         uint32_t end = ubo_state->range[i].end;
+         uint32_t size = MIN2(end - start,
+                              (16 * xs->constlen) - ubo_state->range[i].offset);
+
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+         tu_cs_emit(cs,
+                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+         tu_cs_emit_qw(cs, iova + start);
+      }
+   }
 }
 
 static void
@@ -1939,12 +1981,12 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
    if (builder) {
       for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
          if (builder->variants[i])
-            size += builder->variants[i]->info.sizedwords;
+            size += builder->variants[i]->info.size / 4;
       }
 
-      size += builder->binning_variant->info.sizedwords;
+      size += builder->binning_variant->info.size / 4;
    } else {
-      size += compute->info.sizedwords;
+      size += compute->info.size / 4;
    }
 
    tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
@@ -2016,12 +2058,12 @@ tu_upload_variant(struct tu_pipeline *pipeline,
       return 0;
 
    /* this expects to get enough alignment because shaders are allocated first
-    * and sizedwords is always aligned correctly
+    * and total size is always aligned correctly
     * note: an assert in tu6_emit_xs_config validates the alignment
     */
-   tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
+   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
 
-   memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
+   memcpy(memory.map, variant->bin, variant->info.size);
    return memory.iova;
 }
 
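diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
index 020fbf5..78b7b05 100644 (file)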
@@ -248,6 +248,16 @@ fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
        OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
        for (int i = 0; i < num_ubos; i++) {
+               /* NIR constant data is packed into the end of the shader. */
+               if (i == const_state->constant_data_ubo) {
+                       int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
+                       OUT_RELOC(ring, v->bo,
+                                       v->info.constant_data_offset,
+                                       (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+                                       0);
+                       continue;
+               }
+
                struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
                /* If we have user pointers (constbuf 0, aka GL uniforms), upload them
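diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index 2c9c560..4dc36c4 100644 (file)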
@@ -107,6 +107,44 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state,
 }
 
 /**
+ * Uploads the referenced subranges of the nir constant_data to the hardware's
+ * constant buffer.
+ */
+static inline void
+ir3_emit_constant_data(struct fd_screen *screen,
+               const struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+       const struct ir3_const_state *const_state = ir3_const_state(v);
+       const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+       for (unsigned i = 0; i < state->num_enabled; i++) {
+               unsigned ubo = state->range[i].ubo.block;
+               if (ubo != const_state->constant_data_ubo)
+                       continue;
+
+               uint32_t size = state->range[i].end - state->range[i].start;
+
+               /* Pre-a6xx, we might have ranges enabled in the shader that aren't
+                * used in the binning variant.
+                */
+               if (16 * v->constlen <= state->range[i].offset)
+                       continue;
+
+               /* and even if the start of the const buffer is before
+                * first_immediate, the end may not be:
+                */
+               size = MIN2(size, (16 * v->constlen) - state->range[i].offset);
+
+               if (size == 0)
+                       continue;
+
+               emit_const_bo(ring, v, state->range[i].offset / 4,
+                               v->info.constant_data_offset + state->range[i].start,
+                               size / 4, v->bo);
+       }
+}
+
+/**
  * Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access
  * outside of these ranges will be done using full UBO accesses in the
  * shader).
@@ -121,8 +159,10 @@ ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *
        for (unsigned i = 0; i < state->num_enabled; i++) {
                assert(!state->range[i].ubo.bindless);
                unsigned ubo = state->range[i].ubo.block;
-               if (!(constbuf->enabled_mask & (1 << ubo)))
+               if (!(constbuf->enabled_mask & (1 << ubo)) ||
+                               ubo == const_state->constant_data_ubo) {
                        continue;
+               }
                struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
 
                uint32_t size = state->range[i].end - state->range[i].start;
@@ -176,6 +216,12 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                struct fd_bo *bos[params];
 
                for (uint32_t i = 0; i < params; i++) {
+                       if (i == const_state->constant_data_ubo) {
+                               bos[i] = v->bo;
+                               offsets[i] = v->info.constant_data_offset;
+                               continue;
+                       }
+
                        struct pipe_constant_buffer *cb = &constbuf->cb[i];
 
                        /* If we have user pointers (constbuf 0, aka GL uniforms), upload
@@ -299,6 +345,11 @@ ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v
 
        if (size > 0)
                emit_const_user(ring, v, base, size, const_state->immediates);
+
+       /* NIR constant data has the same lifetime as immediates, so upload it
+        * now, too.
+        */
+       ir3_emit_constant_data(screen, v, ring);
 }
 
 static inline void
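diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index cb28ed5..5a79a76 100644 (file)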
@@ -86,9 +86,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
 
        assert(!v->bo);
 
-       unsigned sz = v->info.sizedwords * 4;
-
-       v->bo = fd_bo_new(compiler->dev, sz,
+       v->bo = fd_bo_new(compiler->dev, v->info.size,
                        DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
                        DRM_FREEDRENO_GEM_TYPE_KMEM,
                        "%s:%s", ir3_shader_stage(v), info->name);
@@ -96,7 +94,7 @@ upload_shader_variant(struct ir3_shader_variant *v)
        /* Always include shaders in kernel crash dumps. */
        fd_bo_mark_for_dump(v->bo);
 
-       memcpy(fd_bo_map(v->bo), v->bin, sz);
+       memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
 }
 
 struct ir3_shader_variant *