tu: Fall back to ldg for variable-size inline uniform blocks
author     Connor Abbott <cwabbott0@gmail.com>
           Wed, 30 Nov 2022 16:43:15 +0000 (17:43 +0100)
committer  Marge Bot <emma+marge@anholt.net>
           Mon, 12 Dec 2022 17:38:19 +0000 (17:38 +0000)
With descriptor buffers, we have no way to know how big the descriptor
set actually is, so we have no idea how many constants we can safely
push. If we used a UBO instead, it would still get pushed, because we
normally assume that UBOs can be accessed freely, without any fear of
faults, thanks to their range checking. So we do the easiest thing and
fall back to raw pointer (ldg) loads; performance will fall off a
cliff, but we don't have many better options.
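
In plain C terms, the two strategies behave roughly as sketched below.
These are hypothetical helpers for illustration only, not driver code:
"consts" stands in for the const file, "base" for the block's const
offset in dwords, and "off" for the byte offset into the inline block.

    #include <stdint.h>

    /* Fast path: the block contents themselves were pushed. */
    static uint32_t
    load_inline_pushed(const uint32_t *consts, unsigned base, unsigned off)
    {
       return consts[base + off / 4];
    }

    /* Fallback: only the block's 64-bit address was pushed; do a raw
     * (ldg-style) load through it.
     */
    static uint32_t
    load_inline_ldg(const uint32_t *consts, unsigned base, unsigned off)
    {
       uint64_t addr = (uint64_t)consts[base] |
                       ((uint64_t)consts[base + 1] << 32);
       const uint32_t *ubo = (const uint32_t *)(uintptr_t)addr;
       return ubo[off / 4];
    }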

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19849>

src/freedreno/vulkan/tu_cmd_buffer.c
src/freedreno/vulkan/tu_descriptor_set.c
src/freedreno/vulkan/tu_shader.c
src/freedreno/vulkan/tu_shader.h

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index d60e6b7..7962f5e 100644
@@ -4454,7 +4454,7 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
       dwords += 4 + num_units;
    }
 
-   dwords += 4 * link->tu_const_state.num_inline_ubos;
+   dwords += 8 * link->tu_const_state.num_inline_ubos;
 
    return dwords;
 }
@@ -4492,22 +4492,19 @@ tu6_emit_user_consts(struct tu_cs *cs,
    for (unsigned i = 0; i < link->tu_const_state.num_inline_ubos; i++) {
       const struct tu_inline_ubo *ubo = &link->tu_const_state.ubos[i];
 
-      /* With variable-count inline uniform blocks, we allocate constant space
-       * based on the maximum size specified in the descriptor set layout, but
-       * the actual size may be smaller. In that case we have to avoid
-       * fetching out-of-bounds of the buffer that contains the descriptor
-       * set to avoid faults.
-       */
-      unsigned num_units = MIN2(ubo->size_vec4,
-                                (descriptors->sets[ubo->base]->size - ubo->offset) / 16);
-
-      if (num_units > 0) {
-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
-         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
-               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-               CP_LOAD_STATE6_0_NUM_UNIT(num_units));
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), ubo->push_address ? 7 : 3);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo->const_offset_vec4) |
+            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+            CP_LOAD_STATE6_0_STATE_SRC(ubo->push_address ? SS6_DIRECT : SS6_INDIRECT) |
+            CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+            CP_LOAD_STATE6_0_NUM_UNIT(ubo->size_vec4));
+      if (ubo->push_address) {
+         tu_cs_emit(cs, 0);
+         tu_cs_emit(cs, 0);
+         tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset);
+         tu_cs_emit(cs, 0);
+         tu_cs_emit(cs, 0);
+      } else {
          tu_cs_emit_qw(cs, descriptors->sets[ubo->base]->va + ubo->offset);
       }
    }
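
For reference, the 8-dword worst case in tu6_user_consts_size() above falls
out of the packet shapes emitted here. A minimal sketch (a hypothetical
helper, not part of the driver; the direct-case payload layout is my reading
of the emission above):

    #include <stdbool.h>

    /* Worst-case dword count of one inline-UBO CP_LOAD_STATE6 packet. */
    static unsigned
    inline_ubo_dwords(bool push_address)
    {
       if (push_address) {
          /* 1 pkt7 header + 1 CP_LOAD_STATE6_0 + 2 zeroed indirect-address
           * dwords (unused with SS6_DIRECT) + 1 vec4 of inline payload
           * whose first two dwords hold the descriptor set's address.
           */
          return 1 + 1 + 2 + 4;   /* = 8 */
       } else {
          /* 1 pkt7 header + 1 CP_LOAD_STATE6_0 + the 64-bit indirect
           * source address.
           */
          return 1 + 1 + 2;       /* = 4 */
       }
    }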
diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c
index 5297df6..6e32127 100644
@@ -399,6 +399,8 @@ static void
 sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
                                   const struct tu_descriptor_set_layout *layout)
 {
+   SHA1_UPDATE_VALUE(ctx, layout->has_variable_descriptors);
+
    for (uint16_t i = 0; i < layout->binding_count; i++)
       sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i],
                                                 layout);
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index 5fd5f31..2ee1682 100644
@@ -638,10 +638,12 @@ lower_inline_ubo(nir_builder *b, nir_instr *instr, void *cb_data)
    struct tu_const_state *const_state = &shader->const_state;
 
    unsigned base = UINT_MAX;
+   bool use_load = false;
    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
       if (const_state->ubos[i].base == binding.desc_set &&
           const_state->ubos[i].offset == binding_layout->offset) {
          base = const_state->ubos[i].const_offset_vec4 * 4;
+         use_load = const_state->ubos[i].push_address;
          break;
       }
    }
@@ -659,12 +661,21 @@ lower_inline_ubo(nir_builder *b, nir_instr *instr, void *cb_data)
    nir_ssa_def *offset = intrin->src[1].ssa;
 
    b->cursor = nir_before_instr(instr);
-   nir_ssa_def *uniform =
-      nir_load_uniform(b, intrin->num_components,
-                       intrin->dest.ssa.bit_size,
-                       nir_ishr_imm(b, offset, 2), .base = base);
+   nir_ssa_def *val;
+
+   if (use_load) {
+      nir_ssa_def *base_addr =
+         nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
+      val = nir_load_global_ir3(b, intrin->num_components,
+                                intrin->dest.ssa.bit_size,
+                                base_addr, nir_ishr_imm(b, offset, 2));
+   } else {
+      val = nir_load_uniform(b, intrin->num_components,
+                             intrin->dest.ssa.bit_size,
+                             nir_ishr_imm(b, offset, 2), .base = base);
+   }
 
-   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, uniform);
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, val);
    nir_instr_remove(instr);
    return true;
 }
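
A worked example of the offset math in the lowering above, with illustrative
numbers (and assuming load_global_ir3 takes its offset in dwords, matching
the nir_ishr_imm by 2): a vec4 load at byte offset 32 from a block whose
constants start at const_offset_vec4 = 10.

    unsigned base       = 10 * 4;         /* const_offset_vec4 in dwords */
    unsigned off_bytes  = 32;
    unsigned off_dwords = off_bytes >> 2; /* = 8, the ishr_imm(.., 2)    */
    /* use_load == false: load_uniform reads c[base + 8 .. base + 11],
     * i.e. c[48..51].
     * use_load == true: load_uniform reads the vec2 address from
     * c[40..41], then load_global_ir3 reads 4 dwords at that address
     * plus 8 dwords.
     */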
@@ -767,13 +778,34 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
          if (binding->size == 0)
             continue;
 
+         /* If we don't know the size at compile time due to a variable
+          * descriptor count, then with descriptor buffers we cannot know
+          * how much space the real inline uniform block takes up. In that
+          * case we fall back to pushing the address and using ldg. This is
+          * slower than setting up a descriptor, but setting up our own
+          * descriptor with descriptor_buffer is also painful: it has to be
+          * done on the GPU, and it doesn't prevent the UBO from getting
+          * pushed anyway and faulting if an out-of-bounds access is hidden
+          * behind an if and never dynamically executed. Given the small max
+          * size, there shouldn't be much reason to use variable size anyway.
+          */
+         bool push_address = desc_layout->has_variable_descriptors &&
+            b == desc_layout->binding_count - 1;
+
+         if (push_address) {
+            perf_debug(dev,
+                       "falling back to ldg for variable-sized inline "
+                       "uniform block");
+         }
+
          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
-         unsigned size_vec4 = DIV_ROUND_UP(binding->size, 16);
+         unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
          const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
             .base = set,
             .offset = binding->offset,
             .const_offset_vec4 = reserved_consts_vec4,
             .size_vec4 = size_vec4,
+            .push_address = push_address,
          };
 
          reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
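
The const-file reservation shrinks accordingly: a sketch of the accounting
above as a standalone helper (illustrative only; DIV_ROUND_UP redefined here
to keep it self-contained):

    #include <stdbool.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Vec4s of const space one inline block reserves, before alignment
     * to the compiler's upload unit. With push_address only the 64-bit
     * pointer is pushed, so one vec4 suffices regardless of the block's
     * declared maximum size.
     */
    static unsigned
    inline_ubo_size_vec4(unsigned binding_size, bool push_address)
    {
       return push_address ? 1 : DIV_ROUND_UP(binding_size, 16);
    }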
diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h
index a4015cf..51f87f7 100644
@@ -18,6 +18,9 @@ struct tu_inline_ubo
    unsigned base;
    unsigned offset;
 
+   /* If true, push the base address instead of the contents */
+   bool push_address;
+
    /* Push it to this location in the const file, in vec4s */
    unsigned const_offset_vec4;