v3dv: implement VK_EXT_inline_uniform_block
authorIago Toral Quiroga <itoral@igalia.com>
Thu, 24 Mar 2022 09:05:17 +0000 (10:05 +0100)
committerMarge Bot <emma+marge@anholt.net>
Mon, 28 Mar 2022 10:44:13 +0000 (10:44 +0000)
Inline uniform blocks store their contents in pool memory rather
than a separate buffer, and are intended to provide a way in which
some platforms may provide more efficient access to the uniform
data, similar to push constants but with more flexible size
constraints.

We implement these in a similar way as push constants: for constant
access we copy the data in the uniform stream (using the new
QUNIFORM_INLINE_UBO_* enums to identify the inline buffer from
which we need to copy) and for indirect access we fall back to
regular UBO access.

Because at NIR level there is no distinction between inline and
regular UBOs and the compiler isn't aware of Vulkan descriptor
sets, we use the UBO index on UBO load intrinsics to identify
inline UBOs, just like we do for push constants. Particularly,
we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for this,
however, unlike push constants, inline buffers are accessed
through descriptor sets, and therefore we need to make sure
they are located in the first slots of the UBO descriptor map.
This means we store them in the first MAX_INLINE_UNIFORM_BUFFERS
slots of the map, with regular UBOs always coming after these
slots.

Reviewed-by: Alejandro PiƱeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15575>

12 files changed:
docs/features.txt
src/broadcom/common/v3d_limits.h
src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c
src/broadcom/vulkan/v3dv_descriptor_set.c
src/broadcom/vulkan/v3dv_device.c
src/broadcom/vulkan/v3dv_limits.h
src/broadcom/vulkan/v3dv_pipeline.c
src/broadcom/vulkan/v3dv_private.h
src/broadcom/vulkan/v3dv_uniforms.c
src/gallium/drivers/v3d/v3d_screen.c

index 1b5dc75..b9976c0 100644 (file)
@@ -483,7 +483,7 @@ Vulkan 1.3 -- all DONE: anv, radv, lvp
   VK_EXT_4444_formats                                   DONE (anv, lvp, radv, tu, v3dv)
   VK_EXT_extended_dynamic_state                         DONE (anv, lvp, radv, tu)
   VK_EXT_extended_dynamic_state2                        DONE (anv, lvp, radv, tu)
-  VK_EXT_inline_uniform_block                           DONE (anv, radv)
+  VK_EXT_inline_uniform_block                           DONE (anv, radv, v3dv)
   VK_EXT_pipeline_creation_cache_control                DONE (anv, radv, v3dv)
   VK_EXT_pipeline_creation_feedback                     DONE (anv, radv, v3dv)
   VK_EXT_private_data                                   DONE (anv, lvp, radv, tu, v3dv)
index 465802c..38993fb 100644 (file)
@@ -67,4 +67,7 @@
 /* Sub-pixel precission bits in the rasterizer */
 #define V3D_COORD_SHIFT 6
 
+/* Size of a cache line */
+#define V3D_NON_COHERENT_ATOM_SIZE 256
+
 #endif /* V3D_LIMITS_H */
index 1afe020..af8201e 100644 (file)
@@ -2638,41 +2638,54 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
                        vir_MOV(c, color_reads_for_sample[component]));
 }
 
+static bool
+try_emit_uniform(struct v3d_compile *c,
+                 int offset,
+                 int num_components,
+                 nir_dest *dest,
+                 enum quniform_contents contents)
+{
+        /* Even though ldunif is strictly 32-bit we can still use it
+         * to load scalar 8-bit/16-bit uniforms so long as their offset
+         * is 32-bit aligned. In this case, ldunif would still load
+         * 32-bit into the destination with the 8-bit/16-bit uniform
+         * data in the LSB and garbage in the MSB, but that is fine
+         * because we should only be accessing the valid bits of the
+         * destination.
+         *
+         * FIXME: if in the future we improve our register allocator to
+         * pack 2 16-bit variables in the MSB and LSB of the same
+         * register then this optimization would not be valid as is,
+         * since the load clobbers the MSB.
+         */
+        if (offset % 4 != 0)
+                return false;
+
+        /* We need dwords */
+        offset = offset / 4;
+
+        for (int i = 0; i < num_components; i++) {
+                ntq_store_dest(c, dest, i,
+                               vir_uniform(c, contents, offset + i));
+        }
+
+        return true;
+}
+
 static void
 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
+        /* We scalarize general TMU access for anything that is not 32-bit. */
+        assert(nir_dest_bit_size(instr->dest) == 32 ||
+               instr->num_components == 1);
+
+        /* Try to emit ldunif if possible, otherwise fallback to general TMU */
         if (nir_src_is_const(instr->src[0])) {
                 int offset = (nir_intrinsic_base(instr) +
                              nir_src_as_uint(instr->src[0]));
 
-                /* Even though ldunif is strictly 32-bit we can still use it
-                 * to load scalar 8-bit/16-bit uniforms so long as their offset
-                 * is * 32-bit aligned. In this case, ldunif would still load
-                 * 32-bit into the destination with the 8-bit/16-bit uniform
-                 * data in the LSB and garbage in the MSB, but that is fine
-                 * because we should only be accessing the valid bits of the
-                 * destination.
-                 *
-                 * FIXME: if in the future we improve our register allocator to
-                 * pack 2 16-bit variables in the MSB and LSB of the same
-                 * register then this optimization would not be valid as is,
-                 * since the load clobbers the MSB.
-                 */
-                if (offset % 4 == 0) {
-                        /* We need dwords */
-                        offset = offset / 4;
-
-                        /* We scalarize general TMU access for anything that
-                         * is not 32-bit.
-                         */
-                        assert(nir_dest_bit_size(instr->dest) == 32 ||
-                               instr->num_components == 1);
-
-                        for (int i = 0; i < instr->num_components; i++) {
-                                ntq_store_dest(c, &instr->dest, i,
-                                               vir_uniform(c, QUNIFORM_UNIFORM,
-                                                           offset + i));
-                        }
+                if (try_emit_uniform(c, offset, instr->num_components,
+                                     &instr->dest, QUNIFORM_UNIFORM)) {
                         return;
                 }
         }
@@ -2680,6 +2693,41 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
         ntq_emit_tmu_general(c, instr, false);
 }
 
+static bool
+ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        if (c->compiler->max_inline_uniform_buffers <= 0)
+                return false;
+
+        /* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline
+         * uniform buffers which we want to handle more like push constants
+         * than regular UBO. OpenGL doesn't implement this feature.
+         */
+        assert(c->key->environment == V3D_ENVIRONMENT_VULKAN);
+        uint32_t index = nir_src_as_uint(instr->src[0]);
+        if (index == 0 || index > c->compiler->max_inline_uniform_buffers)
+                return false;
+
+        /* We scalarize general TMU access for anything that is not 32-bit */
+        assert(nir_dest_bit_size(instr->dest) == 32 ||
+               instr->num_components == 1);
+
+        if (nir_src_is_const(instr->src[1])) {
+                /* Index 0 is reserved for push constants */
+                assert(index > 0);
+                uint32_t inline_index = index - 1;
+                int offset = nir_src_as_uint(instr->src[1]);
+                if (try_emit_uniform(c, offset, instr->num_components,
+                                     &instr->dest,
+                                     QUNIFORM_INLINE_UBO_0 + inline_index)) {
+                        return true;
+                }
+        }
+
+        /* Fallback to regular UBO load */
+        return false;
+}
+
 static void
 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@@ -3199,6 +3247,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_ubo:
+           if (ntq_emit_inline_ubo_load(c, instr))
+                   break;
+           FALLTHROUGH;
         case nir_intrinsic_load_ssbo:
                 if (!ntq_emit_load_unifa(c, instr)) {
                         ntq_emit_tmu_general(c, instr, false);
index c978995..db4a4c4 100644 (file)
@@ -338,6 +338,14 @@ enum quniform_contents {
          * Current value of gl_ViewIndex for Multiview rendering.
          */
         QUNIFORM_VIEW_INDEX,
+
+        /**
+         * Inline uniform buffers
+         */
+         QUNIFORM_INLINE_UBO_0,
+         QUNIFORM_INLINE_UBO_1,
+         QUNIFORM_INLINE_UBO_2,
+         QUNIFORM_INLINE_UBO_3,
 };
 
 static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -574,6 +582,7 @@ enum v3d_compilation_result {
  */
 struct v3d_compiler {
         const struct v3d_device_info *devinfo;
+        uint32_t max_inline_uniform_buffers;
         struct ra_regs *regs;
         struct ra_class *reg_class_any[3];
         struct ra_class *reg_class_r5[3];
@@ -1045,7 +1054,8 @@ vir_has_uniform(struct qinst *inst)
         return inst->uniform != ~0;
 }
 
-const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
+const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
+                                             uint32_t max_inline_uniform_buffers);
 void v3d_compiler_free(const struct v3d_compiler *compiler);
 void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);
 
index 4992a7f..9d4fc58 100644 (file)
@@ -517,13 +517,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
 }
 
 const struct v3d_compiler *
-v3d_compiler_init(const struct v3d_device_info *devinfo)
+v3d_compiler_init(const struct v3d_device_info *devinfo,
+                  uint32_t max_inline_uniform_buffers)
 {
         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
         if (!compiler)
                 return NULL;
 
         compiler->devinfo = devinfo;
+        compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
 
         if (!vir_init_reg_sets(compiler)) {
                 ralloc_free(compiler);
index 1c5b785..72e7eb8 100644 (file)
  * binding layout, and array_index, it returns the map region assigned to it
  * from the descriptor pool bo.
  */
-static void*
+static void *
 descriptor_bo_map(struct v3dv_device *device,
                   struct v3dv_descriptor_set *set,
                   const struct v3dv_descriptor_set_binding_layout *binding_layout,
                   uint32_t array_index)
 {
-   assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
+   /* Inline uniform blocks use BO memory to store UBO contents, not
+    * descriptor data, so their descriptor BO size is 0 even though they
+    * do use BO memory.
+    */
+   uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type);
+   assert(bo_size > 0 ||
+          binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
+
    return set->pool->bo->map +
       set->base_offset + binding_layout->descriptor_offset +
-      array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type);
+      array_index * bo_size;
 }
 
 static bool
@@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
  * It also returns the descriptor type, so the caller could do extra
  * validation or adding extra offsets if the bo contains more that one field.
  */
-static struct v3dv_cl_reloc
+struct v3dv_cl_reloc
 v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
                                       struct v3dv_descriptor_state *descriptor_state,
                                       struct v3dv_descriptor_map *map,
@@ -125,8 +132,10 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
    const struct v3dv_descriptor_set_binding_layout *binding_layout =
       &set->layout->binding[binding_number];
 
-   assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
-   *out_type = binding_layout->type;
+   assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT ||
+          v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
+   if (out_type)
+      *out_type = binding_layout->type;
 
    uint32_t array_index = map->array_index[index];
    assert(array_index < binding_layout->array_size);
@@ -364,6 +373,10 @@ v3dv_CreateDescriptorPool(VkDevice _device,
    uint32_t bo_size = 0;
    uint32_t descriptor_count = 0;
 
+   const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
+      vk_find_struct_const(pCreateInfo->pNext,
+                           DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
+
    assert(pCreateInfo->poolSizeCount > 0);
    for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
       /* Verify supported descriptor type */
@@ -379,6 +392,7 @@ v3dv_CreateDescriptorPool(VkDevice _device,
       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
          break;
       default:
          unreachable("Unimplemented descriptor type");
@@ -386,9 +400,28 @@ v3dv_CreateDescriptorPool(VkDevice _device,
       }
 
       assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0);
-      descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
-      bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
-         pCreateInfo->pPoolSizes[i].descriptorCount;
+      if (pCreateInfo->pPoolSizes[i].type ==
+          VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+         /* Inline uniform blocks are specified to use the descriptor array
+          * size as the size in bytes of the block.
+          */
+         assert(inline_info);
+         descriptor_count++;
+         bo_size += pCreateInfo->pPoolSizes[i].descriptorCount;
+      } else {
+         descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
+         bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
+            pCreateInfo->pPoolSizes[i].descriptorCount;
+      }
+   }
+
+   /* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, make sure we
+    * allocate enough memory to honor that requirement for all our inline
+    * buffers too.
+    */
+   if (inline_info) {
+      bo_size += V3D_NON_COHERENT_ATOM_SIZE *
+                 inline_info->maxInlineUniformBlockBindings;
    }
 
    if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
@@ -599,6 +632,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+      case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
          /* Nothing here, just to keep the descriptor type filtering below */
          break;
       default:
@@ -624,16 +658,36 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
          samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount;
       }
 
-      descriptor_count += binding->descriptorCount;
-      dynamic_offset_count += binding->descriptorCount *
-         set_layout->binding[binding_number].dynamic_offset_count;
-
       set_layout->shader_stages |= binding->stageFlags;
 
-      set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size;
-      set_layout->bo_size +=
-         v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
-         binding->descriptorCount;
+      if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+         dynamic_offset_count += binding->descriptorCount *
+            set_layout->binding[binding_number].dynamic_offset_count;
+
+         descriptor_count += binding->descriptorCount;
+
+         set_layout->binding[binding_number].descriptor_offset =
+            set_layout->bo_size;
+         set_layout->bo_size +=
+            v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
+            binding->descriptorCount;
+      } else {
+         /* We align all our buffers, inline buffers too. We made sure to take
+          * this into account when calculating total BO size requirements at
+          * pool creation time.
+          */
+         set_layout->bo_size = align(set_layout->bo_size,
+                                     V3D_NON_COHERENT_ATOM_SIZE);
+
+         set_layout->binding[binding_number].descriptor_offset =
+            set_layout->bo_size;
+
+         /* Inline uniform blocks are not arrayed, instead descriptorCount
+          * specifies the size of the buffer in bytes.
+          */
+         set_layout->bo_size += binding->descriptorCount;
+         descriptor_count++;
+      }
    }
 
    free(bindings);
@@ -931,6 +985,31 @@ write_buffer_view_descriptor(struct v3dv_device *device,
           sizeof(bview->texture_shader_state));
 }
 
+static void
+write_inline_uniform_descriptor(struct v3dv_device *device,
+                                struct v3dv_descriptor *descriptor,
+                                struct v3dv_descriptor_set *set,
+                                const struct v3dv_descriptor_set_binding_layout *binding_layout,
+                                const void *data,
+                                size_t offset,
+                                size_t size)
+{
+   assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
+   descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
+   descriptor->buffer = NULL;
+
+   void *desc_map = descriptor_bo_map(device, set, binding_layout, 0);
+   memcpy(desc_map + offset, data, size);
+
+   /* Inline uniform buffers allocate BO space in the pool for all inline
+    * buffers it may allocate and then this space is assigned to individual
+    * descriptors when they are written, so we define the range of an inline
+    * buffer as the largest range of data that the client has written to it.
+    */
+   descriptor->offset = 0;
+   descriptor->range = MAX2(descriptor->range, offset + size);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_UpdateDescriptorSets(VkDevice  _device,
                           uint32_t descriptorWriteCount,
@@ -949,9 +1028,20 @@ v3dv_UpdateDescriptorSets(VkDevice  _device,
       struct v3dv_descriptor *descriptor = set->descriptors;
 
       descriptor += binding_layout->descriptor_index;
-      descriptor += writeset->dstArrayElement;
 
-      for (uint32_t j = 0; j < writeset->descriptorCount; ++j) {
+      /* Inline uniform blocks are not arrayed, instead they use dstArrayElement
+       * to specify the byte offset of the uniform update and descriptorCount
+       * to specify the size (in bytes) of the update.
+       */
+      uint32_t descriptor_count;
+      if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+         descriptor += writeset->dstArrayElement;
+         descriptor_count = writeset->descriptorCount;
+      } else {
+         descriptor_count = 1;
+      }
+
+      for (uint32_t j = 0; j < descriptor_count; ++j) {
          switch(writeset->descriptorType) {
 
          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -1006,6 +1096,18 @@ v3dv_UpdateDescriptorSets(VkDevice  _device,
                                          writeset->dstArrayElement + j);
             break;
          }
+         case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
+            const VkWriteDescriptorSetInlineUniformBlock *inline_write =
+               vk_find_struct_const(writeset->pNext,
+                                    WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+            assert(inline_write->dataSize == writeset->descriptorCount);
+            write_inline_uniform_descriptor(device, descriptor, set,
+                                            binding_layout,
+                                            inline_write->pData,
+                                            writeset->dstArrayElement, /* offset */
+                                            inline_write->dataSize);
+            break;
+         }
          default:
             unreachable("unimplemented descriptor type");
             break;
@@ -1032,9 +1134,25 @@ v3dv_UpdateDescriptorSets(VkDevice  _device,
       struct v3dv_descriptor *dst_descriptor = dst_set->descriptors;
 
       src_descriptor += src_binding_layout->descriptor_index;
-      src_descriptor += copyset->srcArrayElement;
-
       dst_descriptor += dst_binding_layout->descriptor_index;
+
+      if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+         /* {src,dst}ArrayElement specifies src/dst start offset and
+          * descriptorCount specifies size (in bytes) to copy.
+          */
+         const void *src_data = src_set->pool->bo->map +
+                                src_set->base_offset +
+                                src_binding_layout->descriptor_offset +
+                                copyset->srcArrayElement;
+         write_inline_uniform_descriptor(device, dst_descriptor, dst_set,
+                                         dst_binding_layout,
+                                         src_data,
+                                         copyset->dstArrayElement,
+                                         copyset->descriptorCount);
+         continue;
+      }
+
+      src_descriptor += copyset->srcArrayElement;
       dst_descriptor += copyset->dstArrayElement;
 
       for (uint32_t j = 0; j < copyset->descriptorCount; j++) {
@@ -1179,8 +1297,7 @@ v3dv_UpdateDescriptorSetWithTemplate(
 
       struct v3dv_descriptor *descriptor =
          set->descriptors +
-         binding_layout->descriptor_index +
-         entry->array_element;
+         binding_layout->descriptor_index;
 
       switch (entry->type) {
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
@@ -1190,7 +1307,8 @@ v3dv_UpdateDescriptorSetWithTemplate(
          for (uint32_t j = 0; j < entry->array_count; j++) {
             const VkDescriptorBufferInfo *info =
                pData + entry->offset + j * entry->stride;
-            write_buffer_descriptor(descriptor + j, entry->type, info);
+            write_buffer_descriptor(descriptor + entry->array_element + j,
+                                    entry->type, info);
          }
          break;
 
@@ -1204,9 +1322,9 @@ v3dv_UpdateDescriptorSetWithTemplate(
                pData + entry->offset + j * entry->stride;
             V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView);
             V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler);
-            write_image_descriptor(device, descriptor + j, entry->type,
-                                   set, binding_layout, iview, sampler,
-                                   entry->array_element + j);
+            write_image_descriptor(device, descriptor + entry->array_element + j,
+                                   entry->type, set, binding_layout, iview,
+                                   sampler, entry->array_element + j);
          }
          break;
 
@@ -1216,12 +1334,22 @@ v3dv_UpdateDescriptorSetWithTemplate(
             const VkBufferView *_bview =
                pData + entry->offset + j * entry->stride;
             V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview);
-            write_buffer_view_descriptor(device, descriptor + j, entry->type,
-                                         set, binding_layout, bview,
+            write_buffer_view_descriptor(device,
+                                         descriptor + entry->array_element + j,
+                                         entry->type, set, binding_layout, bview,
                                          entry->array_element + j);
          }
          break;
 
+      case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
+         write_inline_uniform_descriptor(device, descriptor, set,
+                                         binding_layout,
+                                         pData + entry->offset,
+                                         entry->array_element, /* offset */
+                                         entry->array_count);  /* size */
+         break;
+      }
+
       default:
          unreachable("Unsupported descriptor type");
       }
index b62703b..4205b71 100644 (file)
@@ -153,6 +153,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .EXT_4444_formats                    = true,
       .EXT_color_write_enable              = true,
       .EXT_custom_border_color             = true,
+      .EXT_inline_uniform_block            = true,
       .EXT_external_memory_dma_buf         = true,
       .EXT_host_query_reset                = true,
       .EXT_image_drm_format_modifier       = true,
@@ -812,7 +813,8 @@ physical_device_init(struct v3dv_physical_device *device,
    if (result != VK_SUCCESS)
       goto fail;
 
-   device->compiler = v3d_compiler_init(&device->devinfo);
+   device->compiler = v3d_compiler_init(&device->devinfo,
+                                        MAX_INLINE_UNIFORM_BUFFERS);
    device->next_program_id = 0;
 
    ASSERTED int len =
@@ -1089,6 +1091,20 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
 {
    v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 
+   VkPhysicalDeviceVulkan13Features vk13 = {
+      .inlineUniformBlock  = true,
+      /* Inline buffers work like push constants, so after they are bound
+       * some of their contents may be copied into the uniform stream as soon
+       * as the next draw/dispatch is recorded in the command buffer. This means
+       * that if the client updates the buffer contents after binding it to
+       * a command buffer, the next queue submit of that command buffer may
+       * not use the latest update to the buffer contents, but the data that
+       * was present in the buffer at the time it was bound to the command
+       * buffer.
+       */
+      .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+   };
+
    VkPhysicalDeviceVulkan12Features vk12 = {
       .hostQueryReset = true,
       .uniformAndStorageBuffer8BitAccess = true,
@@ -1173,6 +1189,15 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
+         VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
+            (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
+         features->inlineUniformBlock = vk13.inlineUniformBlock;
+         features->descriptorBindingInlineUniformBlockUpdateAfterBind =
+            vk13.descriptorBindingInlineUniformBlockUpdateAfterBind;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
           VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext;
           features->colorWriteEnable = true;
@@ -1385,7 +1410,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .maxPushConstantsSize                     = MAX_PUSH_CONSTANTS_SIZE,
       .maxMemoryAllocationCount                 = mem_size / page_size,
       .maxSamplerAllocationCount                = 64 * 1024,
-      .bufferImageGranularity                   = 256, /* A cache line */
+      .bufferImageGranularity                   = V3D_NON_COHERENT_ATOM_SIZE,
       .sparseAddressSpaceSize                   = 0,
       .maxBoundDescriptorSets                   = MAX_SETS,
       .maxPerStageDescriptorSamplers            = V3D_MAX_TEXTURE_SAMPLERS,
@@ -1499,7 +1524,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .standardSampleLocations                  = false,
       .optimalBufferCopyOffsetAlignment         = 32,
       .optimalBufferCopyRowPitchAlignment       = 32,
-      .nonCoherentAtomSize                      = 256,
+      .nonCoherentAtomSize                      = V3D_NON_COHERENT_ATOM_SIZE,
    };
 
    *pProperties = (VkPhysicalDeviceProperties) {
@@ -1575,6 +1600,18 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          };
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
+         VkPhysicalDeviceInlineUniformBlockProperties *props =
+            (VkPhysicalDeviceInlineUniformBlockProperties *)ext;
+         props->maxInlineUniformBlockSize = 4096;
+         props->maxPerStageDescriptorInlineUniformBlocks =
+            MAX_INLINE_UNIFORM_BUFFERS;
+         props->maxDescriptorSetInlineUniformBlocks =
+            MAX_INLINE_UNIFORM_BUFFERS;
+         props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 0;
+         props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 0;
+         break;
+      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
          VkPhysicalDeviceProvokingVertexPropertiesEXT *props =
             (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
@@ -2516,7 +2553,7 @@ v3dv_CreateBuffer(VkDevice  _device,
 
    buffer->size = pCreateInfo->size;
    buffer->usage = pCreateInfo->usage;
-   buffer->alignment = 256; /* nonCoherentAtomSize */
+   buffer->alignment = V3D_NON_COHERENT_ATOM_SIZE;
 
    /* Limit allocations to 32-bit */
    const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment);
index 7e67d12..52bf2dd 100644 (file)
@@ -44,6 +44,7 @@
 #define MAX_INPUT_ATTACHMENTS 4
 
 #define MAX_UNIFORM_BUFFERS 12
+#define MAX_INLINE_UNIFORM_BUFFERS 4
 #define MAX_STORAGE_BUFFERS 8
 
 #define MAX_DYNAMIC_UNIFORM_BUFFERS 8
index acfc696..17c8e33 100644 (file)
@@ -465,17 +465,19 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
                    int binding,
                    int array_index,
                    int array_size,
+                   int start_index,
                    uint8_t return_size)
 {
    assert(array_index < array_size);
    assert(return_size == 16 || return_size == 32);
 
-   unsigned index = 0;
-   for (unsigned i = 0; i < map->num_desc; i++) {
-      if (set == map->set[i] &&
-          binding == map->binding[i] &&
-          array_index == map->array_index[i]) {
-         assert(array_size == map->array_size[i]);
+   unsigned index = start_index;
+   for (; index < map->num_desc; index++) {
+      if (map->used[index] &&
+          set == map->set[index] &&
+          binding == map->binding[index] &&
+          array_index == map->array_index[index]) {
+         assert(array_size == map->array_size[index]);
          if (return_size != map->return_size[index]) {
             /* It the return_size is different it means that the same sampler
              * was used for operations with different precision
@@ -485,18 +487,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
             map->return_size[index] = 32;
          }
          return index;
+      } else if (!map->used[index]) {
+         break;
       }
-      index++;
    }
 
-   assert(index == map->num_desc);
+   assert(index < DESCRIPTOR_MAP_SIZE);
+   assert(!map->used[index]);
 
-   map->set[map->num_desc] = set;
-   map->binding[map->num_desc] = binding;
-   map->array_index[map->num_desc] = array_index;
-   map->array_size[map->num_desc] = array_size;
-   map->return_size[map->num_desc] = return_size;
-   map->num_desc++;
+   map->used[index] = true;
+   map->set[index] = set;
+   map->binding[index] = binding;
+   map->array_index[index] = array_index;
+   map->array_size[index] = array_size;
+   map->return_size[index] = return_size;
+   map->num_desc = MAX2(map->num_desc, index + 1);
 
    return index;
 }
@@ -536,8 +541,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
          &pipeline->shared_data->maps[broadcom_stage]->sampler_map :
          &pipeline->shared_data->maps[broadcom_stage]->texture_map;
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+   case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
       return &pipeline->shared_data->maps[broadcom_stage]->ubo_map;
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
       return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map;
    default:
       unreachable("Descriptor type unknown or not having a descriptor map");
@@ -563,31 +571,53 @@ lower_vulkan_resource_index(nir_builder *b,
    struct v3dv_descriptor_set_binding_layout *binding_layout =
       &set_layout->binding[binding];
    unsigned index = 0;
-   const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr);
 
-   switch (desc_type) {
+   switch (binding_layout->type) {
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+   case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
       struct v3dv_descriptor_map *descriptor_map =
-         pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false);
+         pipeline_get_descriptor_map(pipeline, binding_layout->type,
+                                     shader->info.stage, false);
 
       if (!const_val)
          unreachable("non-constant vulkan_resource_index array index");
 
+      /* At compile-time we will need to know if we are processing a UBO load
+       * for an inline or a regular UBO so we can handle inline loads like
+       * push constants. At the NIR level, however, the inline
+       * information is gone, so we rely on the index to make this distinction.
+       * In particular, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for
+       * inline buffers. This means that at the descriptor map level
+       * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1,
+       * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS.
+       */
+      uint32_t start_index = 0;
+      if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+          binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+         start_index = MAX_INLINE_UNIFORM_BUFFERS;
+      }
+
       index = descriptor_map_add(descriptor_map, set, binding,
                                  const_val->u32,
                                  binding_layout->array_size,
+                                 start_index,
                                  32 /* return_size: doesn't really apply for this case */);
 
-      if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
-         /* skip index 0 which is used for push constants */
+      /* We always reserve index 0 for push constants */
+      if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+          binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+          binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
          index++;
       }
+
       break;
    }
 
    default:
-      unreachable("unsupported desc_type for vulkan_resource_index");
+      unreachable("unsupported descriptor type for vulkan_resource_index");
       break;
    }
 
@@ -698,6 +728,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
                          deref->var->data.binding,
                          array_index,
                          binding_layout->array_size,
+                         0,
                          return_size);
 
    if (is_sampler)
@@ -807,6 +838,7 @@ lower_image_deref(nir_builder *b,
                          deref->var->data.binding,
                          array_index,
                          binding_layout->array_size,
+                         0,
                          32 /* return_size: doesn't apply for textures */);
 
    /* Note: we don't need to do anything here in relation to the precision and
@@ -1752,12 +1784,12 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline,
     */
    UNUSED unsigned index =
       descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
-                         -1, -1, -1, 0, 16);
+                         -1, -1, -1, 0, 0, 16);
    assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
 
    index =
       descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
-                         -2, -2, -2, 0, 32);
+                         -2, -2, -2, 0, 0, 32);
    assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
 
    /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
index 3002d21..ff94f1d 100644 (file)
@@ -1353,8 +1353,8 @@ struct v3dv_descriptor {
 
       struct {
          struct v3dv_buffer *buffer;
-         uint32_t offset;
-         uint32_t range;
+         size_t offset;
+         size_t range;
       };
 
       struct v3dv_buffer_view *buffer_view;
@@ -1727,8 +1727,8 @@ struct v3dv_pipeline_layout {
  * FIXME: one alternative would be to allocate the map as big as you need for
  * each descriptor type. That would means more individual allocations.
  */
-#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \
-                                 MAX_UNIFORM_BUFFERS,      \
+#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS,                         \
+                                 MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \
                                  MAX_STORAGE_BUFFERS)
 
 
@@ -1739,6 +1739,7 @@ struct v3dv_descriptor_map {
    int binding[DESCRIPTOR_MAP_SIZE];
    int array_index[DESCRIPTOR_MAP_SIZE];
    int array_size[DESCRIPTOR_MAP_SIZE];
+   bool used[DESCRIPTOR_MAP_SIZE];
 
    /* NOTE: the following is only for sampler, but this is the easier place to
     * put it.
@@ -2073,6 +2074,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
                                    uint32_t index,
                                    uint32_t *dynamic_offset);
 
+struct v3dv_cl_reloc
+v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
+                                      struct v3dv_descriptor_state *descriptor_state,
+                                      struct v3dv_descriptor_map *map,
+                                      struct v3dv_pipeline_layout *pipeline_layout,
+                                      uint32_t index,
+                                      VkDescriptorType *out_type);
+
 const struct v3dv_sampler *
 v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state,
                                 struct v3dv_descriptor_map *map,
index 863f7d0..8f14d01 100644 (file)
@@ -56,7 +56,8 @@ struct state_bo_list {
    struct v3dv_bo *states[MAX_TOTAL_STATES];
 };
 
-#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES)
+#define MAX_TOTAL_UNIFORM_BUFFERS (1 + (MAX_UNIFORM_BUFFERS + \
+                                        MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES)
 #define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES)
 struct buffer_bo_list {
    struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS];
@@ -247,10 +248,12 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
 
    uint32_t dynamic_offset = 0;
 
-   /* For ubos, index is shifted, as 0 is reserved for push constants.
+   /* For ubos, index is shifted, as 0 is reserved for push constants
+    * and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform
+    * buffers.
     */
-   if (content == QUNIFORM_UBO_ADDR &&
-       v3d_unit_data_get_unit(data) == 0) {
+   uint32_t index = v3d_unit_data_get_unit(data);
+   if (content == QUNIFORM_UBO_ADDR && index == 0) {
       /* This calls is to ensure that the push_constant_ubo is
        * updated. It already take into account it is should do the
        * update or not
@@ -266,40 +269,97 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
                                offset + dynamic_offset);
       buffer_bos->ubo[0] = resource->bo;
    } else {
-      uint32_t index =
-         content == QUNIFORM_UBO_ADDR ?
-         v3d_unit_data_get_unit(data) - 1 :
-         data;
+      if (content == QUNIFORM_UBO_ADDR) {
+         /* We reserve index 0 for push constants and artificially increase our
+          * indices by one for that reason; fix that now before accessing the
+          * descriptor map.
+          */
+         assert(index > 0);
+         index--;
+      } else {
+         index = data;
+      }
 
       struct v3dv_descriptor *descriptor =
          v3dv_descriptor_map_get_descriptor(descriptor_state, map,
                                             pipeline->layout,
                                             index, &dynamic_offset);
+
+      /* Inline UBO descriptors store UBO data in descriptor pool memory,
+       * instead of an external buffer.
+       */
       assert(descriptor);
-      assert(descriptor->buffer);
-      assert(descriptor->buffer->mem);
-      assert(descriptor->buffer->mem->bo);
 
       if (content == QUNIFORM_GET_SSBO_SIZE ||
           content == QUNIFORM_GET_UBO_SIZE) {
          cl_aligned_u32(uniforms, descriptor->range);
       } else {
-         cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
-                                  descriptor->buffer->mem_offset +
-                                  descriptor->offset +
-                                  offset + dynamic_offset);
+         /* Inline uniform buffers store their contents in pool memory instead
+          * of an external buffer.
+          */
+         struct v3dv_bo *bo;
+         uint32_t addr;
+         if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+            assert(dynamic_offset == 0);
+            struct v3dv_cl_reloc reloc =
+               v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
+                                                     descriptor_state, map,
+                                                     pipeline->layout, index,
+                                                     NULL);
+            bo = reloc.bo;
+            addr = reloc.bo->offset + reloc.offset + offset;
+         } else {
+            assert(descriptor->buffer);
+            assert(descriptor->buffer->mem);
+            assert(descriptor->buffer->mem->bo);
+
+            bo = descriptor->buffer->mem->bo;
+            addr = bo->offset +
+                   descriptor->buffer->mem_offset +
+                   descriptor->offset +
+                   offset + dynamic_offset;
+         }
+
+         cl_aligned_u32(uniforms, addr);
 
          if (content == QUNIFORM_UBO_ADDR) {
-            assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS);
-            buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo;
+            assert(index < MAX_TOTAL_UNIFORM_BUFFERS);
+            buffer_bos->ubo[index] = bo;
          } else {
             assert(index < MAX_TOTAL_STORAGE_BUFFERS);
-            buffer_bos->ssbo[index] = descriptor->buffer->mem->bo;
+            buffer_bos->ssbo[index] = bo;
          }
       }
    }
 }
 
+static void
+write_inline_uniform(struct v3dv_cl_out **uniforms,
+                     uint32_t index,
+                     uint32_t offset,
+                     struct v3dv_cmd_buffer *cmd_buffer,
+                     struct v3dv_pipeline *pipeline,
+                     enum broadcom_shader_stage stage)
+{
+   assert(index < MAX_INLINE_UNIFORM_BUFFERS);
+
+   struct v3dv_descriptor_state *descriptor_state =
+      v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);
+
+   struct v3dv_descriptor_map *map =
+      &pipeline->shared_data->maps[stage]->ubo_map;
+
+   struct v3dv_cl_reloc reloc =
+      v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
+                                            descriptor_state, map,
+                                            pipeline->layout, index,
+                                            NULL);
+
+   /* Offset comes in 32-bit units */
+   uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset;
+   cl_aligned_u32(uniforms, *addr);
+}
+
 static uint32_t
 get_texture_size_from_image_view(struct v3dv_image_view *image_view,
                                  enum quniform_contents contents,
@@ -432,6 +492,15 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
          cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]);
          break;
 
+      case QUNIFORM_INLINE_UBO_0:
+      case QUNIFORM_INLINE_UBO_1:
+      case QUNIFORM_INLINE_UBO_2:
+      case QUNIFORM_INLINE_UBO_3:
+         write_inline_uniform(&uniforms,
+                              uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data,
+                              cmd_buffer, pipeline, variant->stage);
+         break;
+
       case QUNIFORM_VIEWPORT_X_SCALE:
          cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
          break;
index f3f0828..92af23f 100644 (file)
@@ -184,7 +184,7 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
                 return screen->devinfo.ver >= 40;
 
         case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
-                return 256;
+                return V3D_NON_COHERENT_ATOM_SIZE;
 
         case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
                 if (screen->devinfo.ver < 40)
@@ -872,7 +872,7 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
 
         v3d_resource_screen_init(pscreen);
 
-        screen->compiler = v3d_compiler_init(&screen->devinfo);
+        screen->compiler = v3d_compiler_init(&screen->devinfo, 0);
 
 #ifdef ENABLE_SHADER_CACHE
         v3d_disk_cache_init(screen);