tu: Support higher descriptor set count for A7XX
authorMark Collins <mark@igalia.com>
Tue, 3 Oct 2023 18:22:49 +0000 (18:22 +0000)
committerMarge Bot <emma+marge@anholt.net>
Sat, 7 Oct 2023 18:15:02 +0000 (18:15 +0000)
Allow the descriptor set count to vary at runtime depending on the
specific GPU, so that A7XX exposes 7 usable descriptor sets with one
additional set reserved for dynamic offsets.

Passing VK-CTS: dEQP-VK.binding_model.*

Signed-off-by: Mark Collins <mark@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25534>
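
For reference, a minimal standalone sketch (not turnip code) of the sizing
scheme this change introduces: one hardware set is reserved for dynamic
descriptors, the application-visible count is max_sets - 1, and the
bindless-invalidate mask covers every hardware set. The struct and helper
names below are hypothetical, and deriving the mask from max_sets is an
inference from the hardcoded 0x1f (5 sets, A6XX) and 0xff (8 sets, A7XX)
values in the diff.

   #include <assert.h>
   #include <stdint.h>

   struct fake_physical_device {
      uint32_t max_sets;         /* hardware sets: 5 on A6XX, 8 on A7XX */
      uint32_t usable_sets;      /* sets exposed to the application */
      int32_t  reserved_set_idx; /* last set, kept for dynamic descriptors */
   };

   static void
   init_set_counts(struct fake_physical_device *dev)
   {
      /* One hardware set is always reserved for dynamic offsets, so the
       * usable count is max_sets - 1 and the reserved set is the last one. */
      dev->usable_sets = dev->max_sets - 1;
      dev->reserved_set_idx = (int32_t)dev->max_sets - 1;
   }

   static uint32_t
   bindless_invalidate_mask(const struct fake_physical_device *dev)
   {
      /* Mask with one bit per hardware set, reserved set included:
       * 5 sets -> 0x1f (A6XX), 8 sets -> 0xff (A7XX). */
      return (1u << dev->max_sets) - 1;
   }

   int main(void)
   {
      struct fake_physical_device a6xx = { .max_sets = 5 };
      struct fake_physical_device a7xx = { .max_sets = 8 };
      init_set_counts(&a6xx);
      init_set_counts(&a7xx);
      assert(a6xx.usable_sets == 4 && bindless_invalidate_mask(&a6xx) == 0x1f);
      assert(a7xx.usable_sets == 7 && bindless_invalidate_mask(&a7xx) == 0xff);
      return 0;
   }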

src/freedreno/vulkan/tu_clear_blit.cc
src/freedreno/vulkan/tu_cmd_buffer.cc
src/freedreno/vulkan/tu_cmd_buffer.h
src/freedreno/vulkan/tu_descriptor_set.cc
src/freedreno/vulkan/tu_descriptor_set.h
src/freedreno/vulkan/tu_device.cc
src/freedreno/vulkan/tu_device.h
src/freedreno/vulkan/tu_pipeline.cc
src/freedreno/vulkan/tu_shader.cc

index b58b118cadd72b63947cf7cd0d626f7df330a46e..c5b48b773a9f4cae2006c18d951c2343d3637afa 100644 (file)
@@ -825,8 +825,8 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
          .cs_ibo = true,
          .gfx_ibo = true,
          .gfx_shared_const = true,
-         .cs_bindless = 0x1f,
-         .gfx_bindless = 0x1f,));
+         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
 
    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
index e83c7d1ddba38e36c4b6e7f357f1410a3a3d89de..f07e86157357707c9cc709d049354a109cf5efa8 100644 (file)
@@ -183,8 +183,8 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
       tu_emit_event_write<CHIP>(cmd_buffer, cs, FD_CACHE_INVALIDATE);
    if (flushes & TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE) {
       tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
-            .cs_bindless = 0x1f,
-            .gfx_bindless = 0x1f,
+            .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+            .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,
       ));
    }
    if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
@@ -1146,8 +1146,8 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
          .gfx_ibo = true,
          .cs_shared_const = true,
          .gfx_shared_const = true,
-         .cs_bindless = 0x1f,
-         .gfx_bindless = 0x1f,));
+         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
+         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
 
    tu_cs_emit_wfi(cs);
 
@@ -2395,19 +2395,22 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
       tu_cs_emit_array(cs, (const uint32_t*)descriptors_state->set_iova, 2 * descriptors_state->max_sets_bound);
    }
 
-   /* Dynamic descriptors get the last descriptor set. */
+   /* Dynamic descriptors get the reserved descriptor set. */
    if (descriptors_state->dynamic_bound) {
-      tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2);
-      tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]);
+      int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
+      assert(reserved_set_idx >= 0); /* reserved set must be bound */
+
+      tu_cs_emit_pkt4(cs, sp_bindless_base_reg + reserved_set_idx * 2, 2);
+      tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
       if (CHIP == A6XX) {
-         tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2);
-         tu_cs_emit_qw(cs, descriptors_state->set_iova[MAX_SETS]);
+         tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + reserved_set_idx * 2, 2);
+         tu_cs_emit_qw(cs, descriptors_state->set_iova[reserved_set_idx]);
       }
    }
 
    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
-      .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? 0x1f : 0,
-      .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? 0x1f : 0,
+      .cs_bindless = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE ? CHIP == A6XX ? 0x1f : 0xff : 0,
+      .gfx_bindless = bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS ? CHIP == A6XX ? 0x1f : 0xff : 0,
    ));
 
    if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
@@ -2539,6 +2542,7 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
    if (layout->dynamic_offset_size) {
       /* allocate and fill out dynamic descriptor set */
       struct tu_cs_memory dynamic_desc_set;
+      int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
       VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                     layout->dynamic_offset_size / (4 * A6XX_TEX_CONST_DWORDS),
                                     A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
@@ -2549,7 +2553,8 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
 
       memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
              layout->dynamic_offset_size);
-      descriptors_state->set_iova[MAX_SETS] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
+      assert(reserved_set_idx >= 0); /* reserved set must be bound */
+      descriptors_state->set_iova[reserved_set_idx] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
       descriptors_state->dynamic_bound = true;
    }
 
index 3c55af0fd9c572e30957f752e87a401eb3e07bcb..7538ad3a71c9f4714ae4b2f94ed0dda3fd990c4d 100644 (file)
@@ -52,7 +52,7 @@ struct tu_descriptor_state
    struct tu_descriptor_set *sets[MAX_SETS];
    struct tu_descriptor_set push_set;
    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
-   uint64_t set_iova[MAX_SETS + 1];
+   uint64_t set_iova[MAX_SETS];
    uint32_t max_sets_bound;
    bool dynamic_bound;
 };
index 8b21c3a90b101bb82e4a8eb495fcdd00dee94d8a..3dd346bb08017276b75001e1e70129165845f987 100644 (file)
@@ -491,7 +491,6 @@ tu_pipeline_layout_init(struct tu_pipeline_layout *layout)
    unsigned dynamic_offset_size = 0;
 
    for (uint32_t set = 0; set < layout->num_sets; set++) {
-      assert(set < MAX_SETS);
       layout->set[set].dynamic_offset_start = dynamic_offset_size;
 
       if (layout->set[set].layout)
@@ -548,7 +547,7 @@ tu_CreatePipelineLayout(VkDevice _device,
       TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
                      pCreateInfo->pSetLayouts[set]);
 
-      assert(set < MAX_SETS);
+      assert(set < device->physical_device->usable_sets);
       layout->set[set].layout = set_layout;
       if (set_layout)
          vk_descriptor_set_layout_ref(&set_layout->vk);
@@ -1431,7 +1430,7 @@ tu_CreateDescriptorUpdateTemplate(
       /* descriptorSetLayout should be ignored for push descriptors
        * and instead it refers to pipelineLayout and set.
        */
-      assert(pCreateInfo->set < MAX_SETS);
+      assert(pCreateInfo->set < device->physical_device->usable_sets);
       set_layout = pipeline_layout->set[pCreateInfo->set].layout;
    } else {
       TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout,
index 55fe6b9fb6ea350330ac32a7fe5cf99840fff698..c272b084e06d767c128e0d5316aa9df04ecf902c 100644 (file)
 
 #include "vk_descriptor_set_layout.h"
 
-/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic
- * descriptors and input attachments.
+/* The hardware supports up to 8 descriptor sets since A7XX.
+ * Note: This is the maximum across generations, not the maximum for a
+ * particular generation so it should only be used for allocation.
  */
-#define MAX_SETS 4
+#define MAX_SETS 8
 
 /* I have no idea what the maximum size is, but the hardware supports very
  * large numbers of descriptors (at least 2^16). This limit is based on
index 1bdf34164b3736bcca7403ad61a2176a275b286a..d334379963409b4d21b22a1d4b4d303fae4b7630 100644 (file)
@@ -628,6 +628,8 @@ tu_physical_device_init(struct tu_physical_device *device,
 
       device->ccu_offset_bypass = depth_cache_size;
       device->ccu_offset_gmem = device->gmem_size - color_cache_size;
+
+      device->usable_sets = device->reserved_set_idx = device->info->a6xx.max_sets - 1;
       break;
    }
    default:
@@ -1065,7 +1067,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .maxSamplerAllocationCount = 64 * 1024,
       .bufferImageGranularity = 64,          /* A cache line */
       .sparseAddressSpaceSize = 0,
-      .maxBoundDescriptorSets = MAX_SETS,
+      .maxBoundDescriptorSets = pdevice->usable_sets,
       .maxPerStageDescriptorSamplers = max_descriptor_set_size,
       .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size,
       .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size,
@@ -1327,10 +1329,10 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          properties->bufferlessPushDescriptors = true;
          properties->allowSamplerImageViewPostSubmitCreation = true;
          properties->descriptorBufferOffsetAlignment = A6XX_TEX_CONST_DWORDS * 4;
-         properties->maxDescriptorBufferBindings = MAX_SETS;
-         properties->maxResourceDescriptorBufferBindings = MAX_SETS;
-         properties->maxSamplerDescriptorBufferBindings = MAX_SETS;
-         properties->maxEmbeddedImmutableSamplerBindings = MAX_SETS;
+         properties->maxDescriptorBufferBindings = pdevice->usable_sets;
+         properties->maxResourceDescriptorBufferBindings = pdevice->usable_sets;
+         properties->maxSamplerDescriptorBufferBindings = pdevice->usable_sets;
+         properties->maxEmbeddedImmutableSamplerBindings = pdevice->usable_sets;
          properties->maxEmbeddedImmutableSamplers = max_descriptor_set_size;
          properties->bufferCaptureReplayDescriptorDataSize = 0;
          properties->imageCaptureReplayDescriptorDataSize = 0;
index 4f14416856666cadb686cb398b5181a90497b335..ad5ff44e13f215255e3b77362b8c0813314dbbf1 100644 (file)
@@ -89,6 +89,11 @@ struct tu_physical_device
    uint32_t ccu_offset_gmem;
    uint32_t ccu_offset_bypass;
 
+   /* Amount of usable descriptor sets, this excludes any reserved set */
+   uint32_t usable_sets;
+   /* Index of the reserved descriptor set, may be -1 if unset */
+   int32_t reserved_set_idx;
+
    bool has_set_iova;
    uint64_t va_start;
    uint64_t va_size;
index d51e8093a69bd7e59c257cc1fbc81a7582f05247..1f4eb6051b2508775604bd176e6f4791e426d3d7 100644 (file)
@@ -110,7 +110,8 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
 }
 
 static void
-tu6_emit_load_state(struct tu_pipeline *pipeline,
+tu6_emit_load_state(struct tu_device *device,
+                    struct tu_pipeline *pipeline,
                     struct tu_pipeline_layout *layout)
 {
    unsigned size = tu6_load_state_size(pipeline, layout);
@@ -165,7 +166,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
             continue;
          switch (binding->type) {
          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-            base = MAX_SETS;
+            assert(device->physical_device->reserved_set_idx >= 0);
+            base = device->physical_device->reserved_set_idx;
             offset = (layout->set[i].dynamic_offset_start +
                       binding->dynamic_offset_offset) / 4;
             FALLTHROUGH;
@@ -201,7 +203,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
             break;
          }
          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-            base = MAX_SETS;
+            assert(device->physical_device->reserved_set_idx >= 0);
+            base = device->physical_device->reserved_set_idx;
             offset = (layout->set[i].dynamic_offset_start +
                       binding->dynamic_offset_offset) / 4;
             FALLTHROUGH;
@@ -404,19 +407,20 @@ tu6_emit_dynamic_offset(struct tu_cs *cs,
                         const struct tu_shader *shader,
                         struct tu_pipeline_builder *builder)
 {
+   const struct tu_physical_device *phys_dev = cs->device->physical_device;
    if (!xs || shader->const_state.dynamic_offset_loc == UINT32_MAX)
       return;
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS);
+   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
-              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4)));
+              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
    tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
    tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 
-   for (unsigned i = 0; i < MAX_SETS; i++) {
+   for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
       unsigned dynamic_offset_start =
          builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4);
       tu_cs_emit(cs, i < builder->layout.num_sets ? dynamic_offset_start : 0);
@@ -2235,9 +2239,9 @@ tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
          builder->layout.num_sets = MAX2(builder->layout.num_sets,
                                          library->num_sets);
+         assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
          for (unsigned j = 0; j < library->num_sets; j++) {
-            if (library->layouts[i])
-               builder->layout.set[i].layout = library->layouts[i];
+            builder->layout.set[i].layout = library->layouts[i];
          }
 
          builder->layout.push_constant_size = library->push_constant_size;
@@ -3920,7 +3924,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
          /* Blob doesn't preload state on A7XX, likely preloading either
           * doesn't work or doesn't provide benefits.
           */
-         tu6_emit_load_state(*pipeline, &builder->layout);
+         tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
       }
    }
 
@@ -4370,7 +4374,7 @@ tu_compute_pipeline_create(VkDevice device,
       pipeline->local_size[i] = v->local_size[i];
 
    if (CHIP == A6XX) {
-      tu6_emit_load_state(&pipeline->base, layout);
+      tu6_emit_load_state(dev, &pipeline->base, layout);
    }
 
    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
index 6745da7cd249fa598e4dff0eb5bfe86059d09148..a032e0a8e3dd9c36ac9cc7b56d4612f2242ea734 100644 (file)
@@ -167,7 +167,8 @@ lower_load_push_constant(struct tu_device *dev,
 }
 
 static void
-lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
+lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
+                            nir_intrinsic_instr *instr,
                             struct tu_shader *shader,
                             const struct tu_pipeline_layout *layout)
 {
@@ -203,7 +204,8 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
          base = nir_imm_int(b, (layout->set[set].dynamic_offset_start +
             binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
       }
-      set = MAX_SETS;
+      assert(dev->physical_device->reserved_set_idx >= 0);
+      set = dev->physical_device->reserved_set_idx;
       break;
    default:
       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
@@ -288,7 +290,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
    }
 
-   nir_def *results[MAX_SETS + 1] = { NULL };
+   nir_def *results[MAX_SETS] = { NULL };
 
    if (nir_scalar_is_const(scalar_idx)) {
       nir_def *bindless =
@@ -298,7 +300,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
    }
 
    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
-   for (unsigned i = 0; i < MAX_SETS + 1; i++) {
+   for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
       /* if (base_idx == i) { ... */
       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
 
@@ -336,7 +338,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
 
    nir_def *result =
       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
-   for (int i = MAX_SETS; i >= 0; i--) {
+   for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
       nir_pop_if(b, NULL);
       if (info->has_dest)
          result = nir_if_phi(b, results[i], result);
@@ -433,7 +435,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
       return true;
 
    case nir_intrinsic_vulkan_resource_index:
-      lower_vulkan_resource_index(b, instr, shader, layout);
+      lower_vulkan_resource_index(dev, b, instr, shader, layout);
       return true;
    case nir_intrinsic_vulkan_resource_reindex:
       lower_vulkan_resource_reindex(b, instr);
@@ -715,7 +717,8 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
 
    if (layout->independent_sets) {
       const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
-      reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4);
+      assert(dev->physical_device->reserved_set_idx >= 0);
+      reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
    } else {
       const_state->dynamic_offset_loc = UINT32_MAX;
    }