return workgroup_size;
}
+void pvr_compute_update_kernel_private(
+ struct pvr_cmd_buffer *cmd_buffer,
+ struct pvr_sub_cmd_compute *const sub_cmd,
+ struct pvr_private_compute_pipeline *pipeline,
+ const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
+{
+ const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
+ const struct pvr_device_runtime_info *dev_runtime_info =
+ &pdevice->dev_runtime_info;
+ struct pvr_csb *csb = &sub_cmd->control_stream;
+
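+ /* Note (assumption): the PDS and USC size fields below appear to be
+ * expressed in hardware allocation units, hence the DIV_ROUND_UP against
+ * the corresponding *_UNIT_SIZE defines.
+ */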
+ struct pvr_compute_kernel_info info = {
+ .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
+ .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
+ .pds_temp_size =
+ DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
+ PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
+
+ .pds_data_size =
+ DIV_ROUND_UP(pipeline->pds_data_size_dw << 2U,
+ PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
+ .pds_data_offset = pipeline->pds_data_offset,
+ .pds_code_offset = pipeline->pds_code_offset,
+
+ .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
+
+ .usc_unified_size =
+ DIV_ROUND_UP(pipeline->coeff_regs_count << 2U,
+ PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
+
+ /* clang-format off */
+ .global_size = {
+ global_workgroup_size[0],
+ global_workgroup_size[1],
+ global_workgroup_size[2]
+ },
+ /* clang-format on */
+ };
+
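+ /* Total number of instances (invocations) in a single workgroup. */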
+ uint32_t work_size = pipeline->workgroup_size.width *
+ pipeline->workgroup_size.height *
+ pipeline->workgroup_size.depth;
+ uint32_t coeff_regs;
+
+ if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
+ /* The workgroup is too large to fit in a single task, so enforce a
+ * single workgroup per cluster through allocation starvation: claim all
+ * of the local memory registers so that no other workgroup can be
+ * allocated alongside it.
+ */
+ coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
+ } else {
+ coeff_regs = pipeline->coeff_regs_count;
+ }
+
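+ /* The USC common store allocation is driven by the coefficient register
+ * count chosen above.
+ */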
+ info.usc_common_size =
+ DIV_ROUND_UP(coeff_regs << 2U,
+ PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
+
+ /* Use a whole slot per workgroup. */
+ work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
+
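+ /* The constant shared registers also count towards the register budget
+ * used for the padding and slot-size calculations below; when present,
+ * switch sd_type from NONE to USC.
+ */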
+ coeff_regs += pipeline->const_shared_regs_count;
+
+ if (pipeline->const_shared_regs_count > 0)
+ info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
+
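+ /* Pad the flattened work size for the register usage above and launch the
+ * kernel with a 1D local size.
+ */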
+ work_size =
+ pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
+
+ info.local_size[0] = work_size;
+ info.local_size[1] = 1U;
+ info.local_size[2] = 1U;
+
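+ /* Maximum number of instances per slot for this register usage and padded
+ * work size.
+ */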
+ info.max_instances =
+ pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
+
+ pvr_compute_generate_control_stream(csb, sub_cmd, &info);
+}
+
/* TODO: Wire up the base_workgroup variant program when implementing
* VK_KHR_device_group. The values will also need patching into the program.
*/
};
struct pvr_private_compute_pipeline {
+ /* Used by pvr_compute_update_kernel_private(). */
+ uint32_t pds_code_offset;
+ uint32_t pds_data_offset;
+ uint32_t pds_data_size_dw;
+ uint32_t pds_temps_used;
+ uint32_t coeff_regs_count;
+ VkExtent3D workgroup_size;
+
/* Used by pvr_compute_update_shared_private(). */
uint32_t pds_shared_update_code_offset;
uint32_t pds_shared_update_data_offset;
uint32_t pds_shared_update_data_size_dw;
+
+ /* Used by both pvr_compute_update_{kernel,shared}_private(). */
uint32_t const_shared_regs_count;
pvr_dev_addr_t const_buffer_addr;
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
struct pvr_private_compute_pipeline *pipeline);
+void pvr_compute_update_kernel_private(
+ struct pvr_cmd_buffer *cmd_buffer,
+ struct pvr_sub_cmd_compute *const sub_cmd,
+ struct pvr_private_compute_pipeline *pipeline,
+ const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]);
#define PVR_FROM_HANDLE(__pvr_type, __name, __handle) \
VK_FROM_HANDLE(__pvr_type, __name, __handle)