pvr: Write descriptor set addrs table dev addr into shareds
author Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Wed, 7 Dec 2022 16:13:30 +0000 (16:13 +0000)
committer Marge Bot <emma+marge@anholt.net>
Mon, 20 Feb 2023 13:34:02 +0000 (13:34 +0000)
Previously, UBOs and various other buffers, as well as the native
descriptor sets, were DMAed into the shared registers. This added
complexity to register allocation and to various other places. We
also ended up in situations where we wouldn't know the size of a
buffer by the time the shaders were being compiled. It would be
possible to determine the size by inspecting the shader, but that
would introduce more complexity in the compiler. To get things
working sooner and to avoid extra complexity for now, a different
approach was devised.

The driver will write the addresses of the currently bound
descriptor sets into a device buffer, referred to as the descriptor
set addrs table. The dev addr of the table is written into a shared
register. To access a buffer, the shader will first get the address
of the descriptor set from the in-memory table, then get the primary
descriptor from the descriptor set, and finally access the in-memory
buffer with the address it read from the descriptor. Essentially
there are three levels of indirection and all the buffers are in
memory. The shader knows at what offset the primary descriptor is
located based on the descriptor set layout. The descriptor set
addresses could have been written into the shareds directly, but
that would require extra handling on the compiler side, so we opted
to just write the table address instead.
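
As a rough illustration (hypothetical helper names and offsets; the
real loads are emitted by the compiler), a shader-side buffer access
under the new scheme looks something like:

   /* sh_regs holds the shareds; the driver wrote the table's dev
    * addr at table_sh_offset (2 regs for a 64-bit address).
    */
   uint64_t table_addr = read_sh_reg_u64(sh_regs, table_sh_offset);
   /* 1st indirection: fetch the bound set's address from the table. */
   uint64_t set_addr =
      mem_load_u64(table_addr + set_num * sizeof(uint64_t));
   /* 2nd indirection: fetch the primary descriptor from the set; its
    * offset is known from the descriptor set layout.
    */
   uint64_t buffer_addr = mem_load_u64(set_addr + primary_desc_offset);
   /* 3rd indirection: finally access the buffer itself. */
   uint32_t value = mem_load_u32(buffer_addr + element_offset);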

Signed-off-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Reviewed-by: Frank Binns <frank.binns@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21331>

src/imagination/vulkan/pds/pvr_pds.h
src/imagination/vulkan/pds/pvr_xgl_pds.c
src/imagination/vulkan/pvr_cmd_buffer.c
src/imagination/vulkan/pvr_pipeline.c

src/imagination/vulkan/pds/pvr_pds.h
index 7f5396b..ccf95e0 100644
@@ -96,6 +96,10 @@ enum pvr_pds_vertex_attrib_program_type {
    PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT
 };
 
+enum pvr_pds_addr_literal_type {
+   PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
+};
+
 /*****************************************************************************
  Structure definitions
 *****************************************************************************/
@@ -881,6 +885,11 @@ struct pvr_pds_descriptor_set {
                                    */
 };
 
+struct pvr_pds_addr_literal {
+   enum pvr_pds_addr_literal_type type;
+   unsigned int destination;
+};
+
 #define PVR_BUFFER_TYPE_UBO (0)
 #define PVR_BUFFER_TYPE_COMPILE_TIME (1)
 #define PVR_BUFFER_TYPE_BLEND_CONSTS (2)
@@ -914,6 +923,9 @@ struct pvr_pds_descriptor_program_input {
    unsigned int descriptor_set_count;
    struct pvr_pds_descriptor_set descriptor_sets[8];
 
+   unsigned int addr_literal_count;
+   struct pvr_pds_addr_literal addr_literals[8];
+
    /* "State" buffers, including:
     * compile-time constants
     * blend constants
@@ -1002,6 +1014,9 @@ struct pvr_pds_vertex_primary_program_input {
 #define PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_WORKGROUP (13)
 #define PVR_PDS_CONST_MAP_ENTRY_TYPE_COND_RENDER (14)
 
+#define PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER (15)
+#define PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL (16)
+
 /* We pack all the following structs tightly into a buffer using += sizeof(x)
  * offsets, this can lead to data that is not native aligned. Supplying the
  * packed attribute indicates that unaligned accesses may be required, and the
@@ -1135,6 +1150,18 @@ struct pvr_pds_const_map_entry_cond_render {
    uint32_t cond_render_pred_temp;
 } PVR_ALIGNED;
 
+struct pvr_pds_const_map_entry_addr_literal_buffer {
+   uint8_t type;
+   uint8_t const_offset;
+
+   uint32_t size;
+} PVR_ALIGNED;
+
+struct pvr_pds_const_map_entry_addr_literal {
+   uint8_t type;
+   enum pvr_pds_addr_literal_type addr_type;
+} PVR_ALIGNED;
+
 struct pvr_pds_info {
    uint32_t temps_required;
    uint32_t code_size_in_dwords;
src/imagination/vulkan/pds/pvr_xgl_pds.c
index e9360cf..dbd18d6 100644
@@ -1514,6 +1514,13 @@ void pvr_pds_generate_descriptor_upload_program(
    num_consts64 = input_program->descriptor_set_count;
    total_dma_count = input_program->descriptor_set_count;
 
+   /* 1 DOUTD for buffer containing address literals. */
+   if (input_program->addr_literal_count > 0) {
+      num_consts32++;
+      num_consts64++;
+      total_dma_count++;
+   }
+
    pvr_init_pds_const_map_entry_write_state(info, &entry_write_state);
 
    for (unsigned int index = 0; index < input_program->buffer_count; index++) {
@@ -1543,6 +1550,67 @@ void pvr_pds_generate_descriptor_upload_program(
    next_const64 = 0;
    next_const32 = num_consts64 * 2;
 
+   if (input_program->addr_literal_count > 0) {
+      bool last_dma = (++running_dma_count == total_dma_count);
+      bool halt = last_dma && !input_program->secondary_program_present;
+
+      unsigned int size_in_dwords = input_program->addr_literal_count *
+                                    sizeof(uint64_t) / sizeof(uint32_t);
+      unsigned int destination = input_program->addr_literals[0].destination;
+
+      struct pvr_pds_const_map_entry_addr_literal_buffer
+         *addr_literal_buffer_entry;
+
+      addr_literal_buffer_entry = pvr_prepare_next_pds_const_map_entry(
+         &entry_write_state,
+         sizeof(*addr_literal_buffer_entry));
+
+      addr_literal_buffer_entry->type =
+         PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER;
+      addr_literal_buffer_entry->size = size_in_dwords * sizeof(uint32_t);
+      addr_literal_buffer_entry->const_offset = next_const64 * 2;
+
+      for (unsigned int i = 0; i < input_program->addr_literal_count; i++) {
+         struct pvr_pds_const_map_entry_addr_literal *addr_literal_entry;
+
+         /* Check that the destinations for the addr literals are contiguous.
+          * Not supporting non-contiguous ranges, as that would require
+          * either a single large buffer with wasted memory for DMA, or
+          * multiple buffers to DMA.
+          */
+         if (i > 0) {
+            const uint32_t current_addr_literal_destination =
+               input_program->addr_literals[i].destination;
+            const uint32_t previous_addr_literal_destination =
+               input_program->addr_literals[i - 1].destination;
+
+            /* 2 regs to store 64 bits address. */
+            assert(current_addr_literal_destination ==
+                   previous_addr_literal_destination + 2);
+         }
+
+         addr_literal_entry =
+            pvr_prepare_next_pds_const_map_entry(&entry_write_state,
+                                                 sizeof(*addr_literal_entry));
+
+         addr_literal_entry->type = PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL;
+         addr_literal_entry->addr_type = input_program->addr_literals[i].type;
+      }
+
+      PVR_PDS_MODE_TOGGLE(code_section,
+                          instruction,
+                          pvr_encode_burst_cs(&entry_write_state,
+                                              last_dma,
+                                              halt,
+                                              next_const32,
+                                              next_const64,
+                                              size_in_dwords,
+                                              destination));
+
+      next_const64++;
+      next_const32++;
+   }
+
    /* For each descriptor set perform a DOUTD. */
    for (unsigned int descriptor_index = 0;
         descriptor_index < input_program->descriptor_set_count;
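
The burst emitted above amounts to a single DOUTD that copies the
address-literal buffer into consecutive shared registers. A minimal
model of that copy, with a hypothetical helper name (illustrative
only, not part of the patch):

   #include <stdint.h>

   /* Copy size_in_dwords dwords of 64-bit address literals into the
    * shareds, starting at the first literal's destination reg. Each
    * address takes two consecutive 32-bit regs, hence the contiguity
    * assert on the destinations above.
    */
   static void model_addr_literal_doutd(const uint64_t *addr_literals,
                                        unsigned int size_in_dwords,
                                        uint32_t *shareds,
                                        unsigned int destination)
   {
      const uint32_t *src = (const uint32_t *)addr_literals;

      for (unsigned int i = 0; i < size_in_dwords; i++)
         shareds[destination + i] = src[i];
   }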
src/imagination/vulkan/pvr_cmd_buffer.c
index b19221b..ed1ce3d 100644
@@ -3207,13 +3207,208 @@ static VkResult pvr_setup_descriptor_mappings_old(
 }
 
 static VkResult
-pvr_setup_descriptor_mappings_new(uint32_t *const descriptor_data_offset_out)
+pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
+                                     enum pvr_stage_allocation stage,
+                                     pvr_dev_addr_t *addr_out)
 {
-   *descriptor_data_offset_out = ~0;
+   uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
+   const struct pvr_descriptor_state *desc_state;
+   struct pvr_bo *bo;
+   VkResult result;
+
+   switch (stage) {
+   case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
+   case PVR_STAGE_ALLOCATION_FRAGMENT:
+      desc_state = &cmd_buffer->state.gfx_desc_state;
+      break;
+
+   case PVR_STAGE_ALLOCATION_COMPUTE:
+      desc_state = &cmd_buffer->state.compute_desc_state;
+      break;
 
-   pvr_finishme("Implement new desc set path.");
+   default:
+      unreachable("Unsupported stage.");
+      break;
+   }
 
-   return VK_ERROR_UNKNOWN;
+   for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++) {
+      if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
+         bound_desc_sets[set] = PVR_DEV_ADDR_INVALID.addr;
+      } else {
+         bound_desc_sets[set] =
+            desc_state->descriptor_sets[set]->pvr_bo->vma->dev_addr.addr;
+      }
+   }
+
+   result = pvr_cmd_buffer_upload_general(cmd_buffer,
+                                          bound_desc_sets,
+                                          sizeof(bound_desc_sets),
+                                          &bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *addr_out = bo->vma->dev_addr;
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
+                         enum pvr_pds_addr_literal_type addr_literal_type,
+                         enum pvr_stage_allocation stage,
+                         pvr_dev_addr_t *addr_out)
+{
+   VkResult result;
+
+   switch (addr_literal_type) {
+   case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
+      /* TODO: Maybe we want to free the pvr_bo, and only link all the
+       * BOs to the command buffer once the data section has been
+       * written successfully?
+       */
+      result =
+         pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
+      if (result != VK_SUCCESS)
+         return result;
+
+      break;
+   }
+
+   default:
+      unreachable("Invalid add literal type.");
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult pvr_setup_descriptor_mappings_new(
+   struct pvr_cmd_buffer *const cmd_buffer,
+   enum pvr_stage_allocation stage,
+   const struct pvr_stage_allocation_descriptor_state *descriptor_state,
+   uint32_t *const descriptor_data_offset_out)
+{
+   const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
+   const uint8_t *entries;
+   uint32_t *dword_buffer;
+   uint64_t *qword_buffer;
+   struct pvr_bo *pvr_bo;
+   VkResult result;
+
+   if (!pds_info->data_size_in_dwords)
+      return VK_SUCCESS;
+
+   result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                     cmd_buffer->device->heaps.pds_heap,
+                                     pds_info->data_size_in_dwords << 2,
+                                     PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                     &pvr_bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   dword_buffer = (uint32_t *)pvr_bo->bo->map;
+   qword_buffer = (uint64_t *)pvr_bo->bo->map;
+
+   entries = (uint8_t *)pds_info->entries;
+
+   switch (stage) {
+   case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
+   case PVR_STAGE_ALLOCATION_FRAGMENT:
+   case PVR_STAGE_ALLOCATION_COMPUTE:
+      break;
+
+   default:
+      unreachable("Unsupported stage.");
+      break;
+   }
+
+   for (uint32_t i = 0; i < pds_info->entry_count; i++) {
+      const struct pvr_const_map_entry *const entry_header =
+         (struct pvr_const_map_entry *)entries;
+
+      switch (entry_header->type) {
+      case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
+         const struct pvr_const_map_entry_literal32 *const literal =
+            (struct pvr_const_map_entry_literal32 *)entries;
+
+         PVR_WRITE(dword_buffer,
+                   literal->literal_value,
+                   literal->const_offset,
+                   pds_info->data_size_in_dwords);
+
+         entries += sizeof(*literal);
+         break;
+      }
+
+      case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
+         const struct pvr_pds_const_map_entry_addr_literal_buffer
+            *const addr_literal_buffer_entry =
+               (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
+         struct pvr_device *device = cmd_buffer->device;
+         struct pvr_bo *addr_literal_buffer_bo;
+         uint32_t addr_literal_count = 0;
+         uint64_t *addr_literal_buffer;
+
+         result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
+                                           device->heaps.general_heap,
+                                           addr_literal_buffer_entry->size,
+                                           PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                                           &addr_literal_buffer_bo);
+         if (result != VK_SUCCESS)
+            return result;
+
+         addr_literal_buffer = (uint64_t *)addr_literal_buffer_bo->bo->map;
+
+         entries += sizeof(*addr_literal_buffer_entry);
+
+         PVR_WRITE(qword_buffer,
+                   addr_literal_buffer_bo->vma->dev_addr.addr,
+                   addr_literal_buffer_entry->const_offset,
+                   pds_info->data_size_in_dwords);
+
+         for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
+            const struct pvr_const_map_entry *const entry_header =
+               (struct pvr_const_map_entry *)entries;
+            const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
+            pvr_dev_addr_t dev_addr;
+
+            if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
+               break;
+
+            addr_literal =
+               (struct pvr_pds_const_map_entry_addr_literal *)entries;
+
+            result = pvr_process_addr_literal(cmd_buffer,
+                                              addr_literal->addr_type,
+                                              stage,
+                                              &dev_addr);
+            if (result != VK_SUCCESS)
+               return result;
+
+            addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
+
+            entries += sizeof(*addr_literal);
+         }
+
+         assert(addr_literal_count * sizeof(uint64_t) ==
+                addr_literal_buffer_entry->size);
+
+         i += addr_literal_count;
+
+         pvr_bo_cpu_unmap(device, addr_literal_buffer_bo);
+         break;
+      }
+
+      default:
+         unreachable("Unsupported map entry type.");
+      }
+   }
+
+   pvr_bo_cpu_unmap(cmd_buffer->device, pvr_bo);
+
+   *descriptor_data_offset_out =
+      pvr_bo->vma->dev_addr.addr -
+      cmd_buffer->device->heaps.pds_heap->base_addr.addr;
+
+   return VK_SUCCESS;
 }
 
 static VkResult pvr_setup_descriptor_mappings(
@@ -3234,7 +3429,10 @@ static VkResult pvr_setup_descriptor_mappings(
                                                descriptor_data_offset_out);
    }
 
-   return pvr_setup_descriptor_mappings_new(descriptor_data_offset_out);
+   return pvr_setup_descriptor_mappings_new(cmd_buffer,
+                                            stage,
+                                            descriptor_state,
+                                            descriptor_data_offset_out);
 }
 
 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
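
The table uploaded by pvr_cmd_buffer_upload_desc_set_table() above
is just a flat array of 64-bit device addresses, one slot per set; a
sketch of its layout (illustrative only):

   /* One entry per binding slot; unbound slots hold
    * PVR_DEV_ADDR_INVALID.addr so the shader can tell them apart.
    */
   struct desc_set_addrs_table {
      uint64_t set_addrs[PVR_MAX_DESCRIPTOR_SETS];
   };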
src/imagination/vulkan/pvr_pipeline.c
index 9975a4a..0c88aa5 100644
@@ -538,6 +538,15 @@ size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
     *             (pvr_const_map_entry_literal32)
     *
     *  3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
+    *
+    *  4. Max. number of PDS address literals (8) *
+    *         size of entry
+    *             (pvr_const_map_entry_descriptor_set_addrs_table)
+    *
+    *  5. Max. number of address literals with single buffer entry to DOUTD:
+    *         size of entry
+    *             (pvr_pds_const_map_entry_addr_literal_buffer) +
+    *         8 * size of entry (pvr_pds_const_map_entry_addr_literal)
     */
 
    /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
@@ -549,7 +558,9 @@ size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
            PVR_PDS_MAX_BUFFERS *
               (sizeof(struct pvr_const_map_entry_constant_buffer) +
                sizeof(struct pvr_const_map_entry_literal32)) +
-           sizeof(struct pvr_const_map_entry_doutu_address));
+           sizeof(struct pvr_const_map_entry_doutu_address) +
+           sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
+           8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
 }
 
 /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
@@ -636,6 +647,23 @@ static VkResult pvr_pds_descriptor_program_setup_buffers(
    return VK_SUCCESS;
 }
 
+/**
+ * \brief Indicates the layout of shared registers allocated by the driver.
+ *
+ * 'present' fields indicate whether registers were allocated for a given
+ * resource and hence whether it will be present in the shareds.
+ * 'offset' fields indicate at which shared reg the resource starts.
+ */
+struct pvr_sh_reg_layout {
+   /* If this is present, it will always take up 2 sh regs in size and contain
+    * the device address of the descriptor set addrs table.
+    */
+   struct {
+      bool present;
+      uint32_t offset;
+   } descriptor_set_addrs_table;
+};
+
 static VkResult pvr_pds_descriptor_program_create_and_upload(
    struct pvr_device *const device,
    const VkAllocationCallbacks *const allocator,
@@ -644,6 +672,7 @@ static VkResult pvr_pds_descriptor_program_create_and_upload(
    const struct pvr_explicit_constant_usage *const explicit_const_usage,
    const struct pvr_pipeline_layout *const layout,
    enum pvr_stage_allocation stage,
+   const struct pvr_sh_reg_layout *sh_reg_layout,
    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
 {
    const size_t const_entries_size_in_bytes =
@@ -708,8 +737,21 @@ static VkResult pvr_pds_descriptor_program_create_and_upload(
             };
       }
    } else {
-      pvr_finishme("Implement new desc set path.");
-      return VK_ERROR_UNKNOWN;
+      uint32_t addr_literals = 0;
+
+      if (sh_reg_layout->descriptor_set_addrs_table.present) {
+         program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
+            .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
+            .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
+         };
+         addr_literals++;
+      }
+
+      /* TODO: Add support for other allocation types. E.g. blend constants
+       * and push constants.
+       */
+
+      program.addr_literal_count = addr_literals;
    }
 
    entries_buffer = vk_alloc2(&device->vk.alloc,
@@ -1047,6 +1089,55 @@ static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
    vk_object_base_finish(&pipeline->base);
 }
 
+/* How many shared regs it takes to store a pvr_dev_addr_t.
+ * Each shared reg is 32 bits.
+ */
+#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
+   DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
+
+/**
+ * \brief Allocates shared registers.
+ *
+ * \return How many sh regs are required.
+ */
+static uint32_t
+pvr_pipeline_alloc_shareds(const struct pvr_device *device,
+                           const struct pvr_pipeline_layout *layout,
+                           enum pvr_stage_allocation stage,
+                           struct pvr_sh_reg_layout *const sh_reg_layout_out)
+{
+   ASSERTED const uint64_t reserved_shared_size =
+      device->pdevice->dev_runtime_info.reserved_shared_size;
+   ASSERTED const uint64_t max_coeff =
+      device->pdevice->dev_runtime_info.max_coeffs;
+
+   struct pvr_sh_reg_layout reg_layout = { 0 };
+   uint32_t next_free_sh_reg = 0;
+
+   reg_layout.descriptor_set_addrs_table.present =
+      !!(layout->shader_stage_mask & BITFIELD_BIT(stage));
+
+   if (reg_layout.descriptor_set_addrs_table.present) {
+      reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
+      next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
+   }
+
+   /* TODO: Add allocation for blend constants, push constants, and other buffer
+    * types.
+    */
+
+   *sh_reg_layout_out = reg_layout;
+
+   /* FIXME: We might need to take more things into consideration.
+    * See pvr_calc_fscommon_size_and_tiles_in_flight().
+    */
+   assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
+
+   return next_free_sh_reg;
+}
+
+#undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
+
 /******************************************************************************
    Compute pipeline functions
  ******************************************************************************/
@@ -1063,6 +1154,7 @@ static VkResult pvr_compute_pipeline_compile(
    uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
    struct pvr_explicit_constant_usage explicit_const_usage;
    uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
+   struct pvr_sh_reg_layout sh_reg_layout;
    struct rogue_ubo_data ubo_data;
    uint32_t barrier_coefficient;
    uint32_t usc_temps;
@@ -1104,6 +1196,15 @@ static VkResult pvr_compute_pipeline_compile(
       explicit_const_usage = build_info.explicit_conts_usage;
 
    } else {
+      uint32_t sh_count;
+
+      sh_count = pvr_pipeline_alloc_shareds(device,
+                                            compute_pipeline->base.layout,
+                                            PVR_STAGE_ALLOCATION_COMPUTE,
+                                            &sh_reg_layout);
+
+      compute_pipeline->shader_state.const_shared_reg_count = sh_count;
+
       /* FIXME: Compile and upload the shader. */
       /* FIXME: Initialize the shader state and setup build info. */
       abort();
@@ -1117,6 +1218,7 @@ static VkResult pvr_compute_pipeline_compile(
       &explicit_const_usage,
       compute_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_COMPUTE,
+      &sh_reg_layout,
       &compute_pipeline->descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_shader;
@@ -1429,6 +1531,31 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
    struct rogue_build_ctx *ctx;
    VkResult result;
 
+   const bool old_path =
+      pvr_hard_code_shader_required(&device->pdevice->dev_info);
+
+   /* Vars needed for the new path. */
+   /* TODO: These need to be passed into the compiler so that it knows which
+    * shared regs to use to access specific resources.
+    */
+   struct pvr_sh_reg_layout vert_sh_reg_layout;
+   struct pvr_sh_reg_layout frag_sh_reg_layout;
+   uint32_t vert_sh_count = 0;
+   uint32_t frag_sh_count = 0;
+
+   if (!old_path) {
+      vert_sh_count =
+         pvr_pipeline_alloc_shareds(device,
+                                    gfx_pipeline->base.layout,
+                                    PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
+                                    &vert_sh_reg_layout);
+
+      frag_sh_count = pvr_pipeline_alloc_shareds(device,
+                                                 gfx_pipeline->base.layout,
+                                                 PVR_STAGE_ALLOCATION_FRAGMENT,
+                                                 &frag_sh_reg_layout);
+   }
+
    /* Setup shared build context. */
    ctx = rogue_build_context_create(compiler);
    if (!ctx)
@@ -1531,6 +1658,17 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
       pvr_vertex_state_init(gfx_pipeline,
                             &ctx->common_data[MESA_SHADER_VERTEX],
                             &ctx->stage_data.vs);
+
+      if (!old_path) {
+         struct pvr_vertex_shader_state *vertex_state =
+            &gfx_pipeline->shader_state.vertex;
+
+         /* FIXME: For now we just overwrite it but the compiler shouldn't be
+          * returning the sh count since the driver is in charge of allocating
+          * them.
+          */
+         vertex_state->stage_state.const_shared_reg_count = vert_sh_count;
+      }
    }
 
    result = pvr_gpu_upload_usc(device,
@@ -1551,6 +1689,17 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
    } else {
       pvr_fragment_state_init(gfx_pipeline,
                               &ctx->common_data[MESA_SHADER_FRAGMENT]);
+
+      if (!old_path) {
+         struct pvr_fragment_shader_state *fragment_state =
+            &gfx_pipeline->shader_state.fragment;
+
+         /* FIXME: For now we just overwrite it but the compiler shouldn't be
+          * returning the sh count since the driver is in charge of allocating
+          * them.
+          */
+         fragment_state->stage_state.const_shared_reg_count = frag_sh_count;
+      }
    }
 
    result = pvr_gpu_upload_usc(device,
@@ -1605,6 +1754,7 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
       &vert_explicit_const_usage,
       gfx_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
+      &vert_sh_reg_layout,
       &gfx_pipeline->shader_state.vertex.descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_vertex_attrib_program;
@@ -1627,6 +1777,7 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
       &frag_explicit_const_usage,
       gfx_pipeline->base.layout,
       PVR_STAGE_ALLOCATION_FRAGMENT,
+      &frag_sh_reg_layout,
       &gfx_pipeline->shader_state.fragment.descriptor_state);
    if (result != VK_SUCCESS)
       goto err_free_vertex_descriptor_program;