From: Rajnesh Kanwal
Date: Tue, 17 May 2022 16:19:31 +0000 (+0100)
Subject: pvr: Add support to process transfer and blit cmds
X-Git-Tag: upstream/23.3.3~9831
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=480bdff4b526c2aabd86a42ddb02665e9b7c69e2;p=platform%2Fupstream%2Fmesa.git

pvr: Add support to process transfer and blit cmds

Co-authored-by: Karmjit Mahil
Co-authored-by: Matt Coster
Co-authored-by: Sarah Walker
Signed-off-by: Rajnesh Kanwal
Signed-off-by: Karmjit Mahil
Signed-off-by: Matt Coster
Signed-off-by: Sarah Walker
Acked-by: Frank Binns
Part-of:
---

diff --git a/src/imagination/common/pvr_device_info.c b/src/imagination/common/pvr_device_info.c
index d1fd295..c1dd1db 100644
--- a/src/imagination/common/pvr_device_info.c
+++ b/src/imagination/common/pvr_device_info.c
@@ -53,6 +53,8 @@ const struct pvr_device_features pvr_device_features_4_V_2_51 = {
    .has_num_clusters = true,
    .has_num_raster_pipes = true,
    .has_num_user_clip_planes = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_slc_cache_line_size_bits = true,
    .has_slc_mcu_cache_controls = true,
    .has_tf_bicubic_filter = true,
@@ -96,6 +98,7 @@ const struct pvr_device_enhancements pvr_device_enhancements_4_40_2_51 = {
    .has_ern35421 = true,
    .has_ern38020 = true,
    .has_ern38748 = true,
+   .has_ern42064 = true,
    .has_ern42307 = true,
 };
@@ -126,6 +129,7 @@ const struct pvr_device_ident pvr_device_ident_33_V_11_3 = {
 const struct pvr_device_features pvr_device_features_33_V_11_3 = {
    .has_common_store_size_in_dwords = true,
    .has_compute = true,
+   .has_ipf_creq_pf = true,
    .has_isp_max_tiles_in_flight = true,
    .has_isp_samples_per_pixel = true,
    .has_max_instances_per_pds_task = true,
@@ -136,6 +140,8 @@ const struct pvr_device_features pvr_device_features_33_V_11_3 = {
    .has_num_raster_pipes = true,
    .has_num_user_clip_planes = true,
    .has_pbe2_in_xe = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_roguexe = true,
    .has_screen_size8K = true,
    .has_simple_internal_parameter_format = true,
@@ -205,6 +211,7 @@ const struct pvr_device_features pvr_device_features_36_V_104_796 = {
    .has_compute_overlap = true,
    .has_gpu_multicore_support = true,
    .has_gs_rta_support = true,
+   .has_ipf_creq_pf = true,
    .has_isp_max_tiles_in_flight = true,
    .has_isp_samples_per_pixel = true,
    .has_max_instances_per_pds_task = true,
@@ -216,6 +223,8 @@ const struct pvr_device_features pvr_device_features_36_V_104_796 = {
    .has_num_user_clip_planes = true,
    .has_paired_tiles = true,
    .has_pbe2_in_xe = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_pds_ddmadt = true,
    .has_roguexe = true,
    .has_screen_size8K = true,
diff --git a/src/imagination/common/pvr_device_info.h b/src/imagination/common/pvr_device_info.h
index 4ed26df..ed8286d 100644
--- a/src/imagination/common/pvr_device_info.h
+++ b/src/imagination/common/pvr_device_info.h
@@ -257,6 +257,7 @@ struct pvr_device_features {
    bool has_eight_output_registers : 1;
    bool has_gpu_multicore_support : 1;
    bool has_gs_rta_support : 1;
+   bool has_ipf_creq_pf : 1;
    bool has_isp_max_tiles_in_flight : 1;
    bool has_isp_samples_per_pixel : 1;
    bool has_max_instances_per_pds_task : 1;
@@ -268,10 +269,13 @@ struct pvr_device_features {
    bool has_num_user_clip_planes : 1;
    bool has_paired_tiles : 1;
    bool has_pbe2_in_xe : 1;
+   bool has_pbe_filterable_f16 : 1;
+   bool has_pbe_yuv : 1;
    bool has_pds_ddmadt : 1;
    bool has_roguexe : 1;
    bool has_screen_size8K : 1;
    bool has_simple_internal_parameter_format : 1;
+   bool has_simple_internal_parameter_format_v1 : 1;
    bool has_simple_internal_parameter_format_v2 : 1;
    bool has_simple_parameter_format_version : 1;
    bool has_slc_cache_line_size_bits : 1;
@@ -327,6 +331,7 @@ struct pvr_device_enhancements {
    bool has_ern35421 : 1;
    bool has_ern38020 : 1;
    bool has_ern38748 : 1;
+   bool has_ern42064 : 1;
    bool has_ern42307 : 1;
    bool has_ern45493 : 1;
 };
diff --git a/src/imagination/common/pvr_util.h b/src/imagination/common/pvr_util.h
index e534bc0..f94a319 100644
--- a/src/imagination/common/pvr_util.h
+++ b/src/imagination/common/pvr_util.h
@@ -27,9 +27,18 @@
 #include
 #include

+#include "pvr_types.h"
+
 #include "util/bitscan.h"
 #include "util/macros.h"

+static inline bool pvr_dev_addr_is_aligned(pvr_dev_addr_t addr,
+                                           const uint32_t alignment)
+{
+   assert(util_is_power_of_two_nonzero(alignment));
+   return ((uintptr_t)(addr.addr) & (alignment - 1)) == 0;
+}
+
 static inline bool ptr_is_aligned(const void *const ptr,
                                   const uint32_t alignment)
 {
diff --git a/src/imagination/csbgen/rogue_cr.xml b/src/imagination/csbgen/rogue_cr.xml
index d1f4aef..a551f95 100644
--- a/src/imagination/csbgen/rogue_cr.xml
+++ b/src/imagination/csbgen/rogue_cr.xml
@@ -499,7 +499,7 @@ SOFTWARE.
   [one XML line changed; element markup stripped by the plain-text rendering and not recoverable]
diff --git a/src/imagination/csbgen/rogue_ipf.xml b/src/imagination/csbgen/rogue_ipf.xml
index 7b71b6c..90d7d06 100644
--- a/src/imagination/csbgen/rogue_ipf.xml
+++ b/src/imagination/csbgen/rogue_ipf.xml
@@ -26,6 +26,40 @@ SOFTWARE.
   [34 added XML lines; element markup stripped by the plain-text rendering and not recoverable]
@@ -37,4 +71,100 @@ SOFTWARE.
   [96 added XML lines; element markup stripped by the plain-text rendering and not recoverable]
diff --git a/src/imagination/include/hwdef/rogue_hw_defs.h b/src/imagination/include/hwdef/rogue_hw_defs.h
index 146bc60..f238b67 100644
--- a/src/imagination/include/hwdef/rogue_hw_defs.h
+++ b/src/imagination/include/hwdef/rogue_hw_defs.h
@@ -125,6 +125,16 @@
  */
 #define ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES 7U

+/* Size of the image state in 64-bit units. */
+#define ROGUE_MAXIMUM_IMAGE_STATE_SIZE_IN_ULONGLONGS 2U
+
+/* Size of the image state in dwords. The last 64-bit word is optional for
+ * non-YUV textures.
+ */
+#define ROGUE_MAXIMUM_IMAGE_STATE_SIZE \
+   (ROGUE_MAXIMUM_IMAGE_STATE_SIZE_IN_ULONGLONGS * \
+    (sizeof(uint64_t) / sizeof(uint32_t)))
+
 #define PVR_NUM_PBE_EMIT_REGS 8U

 #endif /* ROGUE_HW_DEFS_H */
diff --git a/src/imagination/vulkan/meson.build b/src/imagination/vulkan/meson.build
index 1cb990d..c4ce9d3 100644
--- a/src/imagination/vulkan/meson.build
+++ b/src/imagination/vulkan/meson.build
@@ -57,6 +57,7 @@ pvr_files = files(
   'pvr_pass.c',
   'pvr_pipeline.c',
   'pvr_pipeline_cache.c',
+  'pvr_transfer_frag_store.c',
   'pvr_query.c',
   'pvr_query_compute.c',
   'pvr_queue.c',
diff --git a/src/imagination/vulkan/pds/pvr_pds.c b/src/imagination/vulkan/pds/pvr_pds.c
index 9523ea3..5baad53 100644
--- a/src/imagination/vulkan/pds/pvr_pds.c
+++ b/src/imagination/vulkan/pds/pvr_pds.c
@@ -433,22 +433,18 @@ void pvr_pds_pixel_shader_sa_initialize(
  * \param dest_offset Destination offset in the attribute.
  * \param dma_size The size of the DMA in words.
  * \param src_address Source address for the burst.
+ * \param last Last DMA in program.
  * \param dev_info PVR device info structure.
  * \returns The number of DMA transfers required.
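 *
 * A minimal usage sketch (hypothetical names; the array sizes assume the
 * single-kick encoding this helper currently produces):
 *
 *    uint32_t ctrl[1];
 *    uint64_t addr[1];
 *    uint32_t kicks = pvr_pds_encode_dma_burst(ctrl, addr, 0U, 4U,
 *                                              src_addr, true, dev_info);
 *    assert(kicks == 1U);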
*/ - uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, uint64_t *dma_address, uint32_t dest_offset, uint32_t dma_size, uint64_t src_address, + bool last, const struct pvr_device_info *dev_info) { - /* Simplified for MS2. */ - - /* Force to 1 DMA. */ - const uint32_t num_kicks = 1; - dma_control[0] = dma_size << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_SHIFT; dma_control[0] |= dest_offset @@ -457,12 +453,15 @@ uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, dma_control[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_CMODE_CACHED | PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_DEST_COMMON_STORE; + if (last) + dma_control[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN; + dma_address[0] = src_address; - if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) { + if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) dma_address[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC0_SLCMODE_CACHED; - } - return num_kicks; + /* Force to 1 DMA. */ + return 1; } /* FIXME: use the csbgen interface and pvr_csb_pack. diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h index b000e83..00c424a 100644 --- a/src/imagination/vulkan/pds/pvr_pds.h +++ b/src/imagination/vulkan/pds/pvr_pds.h @@ -624,6 +624,7 @@ uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, uint32_t dest_offset, uint32_t dma_size, uint64_t src_address, + bool last, const struct pvr_device_info *dev_info); void pvr_pds_setup_doutu(struct pvr_pds_usc_task_control *usc_task_control, diff --git a/src/imagination/vulkan/pvr_blit.c b/src/imagination/vulkan/pvr_blit.c index a13785e..e25da8e 100644 --- a/src/imagination/vulkan/pvr_blit.c +++ b/src/imagination/vulkan/pvr_blit.c @@ -28,6 +28,7 @@ #include "pvr_clear.h" #include "pvr_csb.h" #include "pvr_formats.h" +#include "pvr_job_transfer.h" #include "pvr_private.h" #include "pvr_shader_factory.h" #include "pvr_static_shaders.h" @@ -114,35 +115,161 @@ void pvr_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, assert(!"Unimplemented"); } +static struct pvr_transfer_cmd * +pvr_transfer_cmd_alloc(struct pvr_cmd_buffer *cmd_buffer) +{ + struct pvr_transfer_cmd *transfer_cmd; + + transfer_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc, + sizeof(*transfer_cmd), + 8U, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!transfer_cmd) { + cmd_buffer->state.status = + vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + + /* transfer_cmd->mapping_count is already set to zero. */ + transfer_cmd->filter = PVR_FILTER_POINT; + transfer_cmd->resolve_op = PVR_RESOLVE_BLEND; + transfer_cmd->addr_mode = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + transfer_cmd->cmd_buffer = cmd_buffer; + + return transfer_cmd; +} + +static void pvr_setup_buffer_surface(struct pvr_transfer_cmd_surface *surface, + VkRect2D *rect, + pvr_dev_addr_t dev_addr, + VkDeviceSize offset, + VkFormat vk_format, + uint32_t width, + uint32_t height) +{ + surface->dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, offset); + surface->width = width; + surface->height = height; + surface->stride = width; + surface->vk_format = vk_format; + surface->mem_layout = PVR_MEMLAYOUT_LINEAR; + surface->sample_count = 1; + + /* Initialize rectangle extent. Also, rectangle.offset should be set to + * zero, as the offset is already adjusted in the device address above. We + * don't explicitly set offset to zero as transfer_cmd is zero allocated. 
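+ * For example, pvr_cmd_copy_buffer_region() below splits a 20-byte copy
+ * into a 16-byte VK_FORMAT_R32G32B32A32_UINT pass followed by a 4-byte
+ * VK_FORMAT_R32_UINT pass; each pass arrives here with its byte offset
+ * folded into dev_addr and a zero rectangle offset.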
+ */ + rect->extent.width = width; + rect->extent.height = height; +} + +static VkResult pvr_cmd_copy_buffer_region(struct pvr_cmd_buffer *cmd_buffer, + pvr_dev_addr_t src_addr, + VkDeviceSize src_offset, + pvr_dev_addr_t dst_addr, + VkDeviceSize dst_offset, + VkDeviceSize size) +{ + VkDeviceSize offset = 0; + + while (offset < size) { + VkDeviceSize remaining_size = size - offset; + struct pvr_transfer_cmd *transfer_cmd; + uint32_t texel_width; + VkDeviceSize texels; + VkFormat vk_format; + VkResult result; + uint32_t height; + uint32_t width; + + if (remaining_size >= 16U) { + vk_format = VK_FORMAT_R32G32B32A32_UINT; + texel_width = 16U; + } else if (remaining_size >= 4U) { + vk_format = VK_FORMAT_R32_UINT; + texel_width = 4U; + } else { + vk_format = VK_FORMAT_R8_UINT; + texel_width = 1U; + } + + texels = remaining_size / texel_width; + + /* Try to do max-width rects, fall back to a 1-height rect for the + * remainder. + */ + if (texels > PVR_MAX_TRANSFER_SIZE_IN_TEXELS) { + width = PVR_MAX_TRANSFER_SIZE_IN_TEXELS; + height = texels / PVR_MAX_TRANSFER_SIZE_IN_TEXELS; + height = MIN2(height, PVR_MAX_TRANSFER_SIZE_IN_TEXELS); + } else { + width = texels; + height = 1; + } + + transfer_cmd = pvr_transfer_cmd_alloc(cmd_buffer); + if (!transfer_cmd) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + if (!(transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL)) { + pvr_setup_buffer_surface(&transfer_cmd->src, + &transfer_cmd->mappings[0].src_rect, + src_addr, + offset + src_offset, + vk_format, + width, + height); + transfer_cmd->src_present = true; + } + + pvr_setup_buffer_surface(&transfer_cmd->dst, + &transfer_cmd->scissor, + dst_addr, + offset + dst_offset, + vk_format, + width, + height); + + if (transfer_cmd->src_present) + transfer_cmd->mappings[0].dst_rect = transfer_cmd->scissor; + + transfer_cmd->mapping_count++; + transfer_cmd->cmd_buffer = cmd_buffer; + + result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd); + if (result != VK_SUCCESS) { + vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd); + return result; + } + + offset += width * height * texel_width; + } + + return VK_SUCCESS; +} + void pvr_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo) { PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); PVR_FROM_HANDLE(pvr_buffer, src, pCopyBufferInfo->srcBuffer); PVR_FROM_HANDLE(pvr_buffer, dst, pCopyBufferInfo->dstBuffer); - const size_t regions_size = - pCopyBufferInfo->regionCount * sizeof(*pCopyBufferInfo->pRegions); - struct pvr_transfer_cmd *transfer_cmd; PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); - transfer_cmd = vk_alloc(&cmd_buffer->vk.pool->alloc, - sizeof(*transfer_cmd) + regions_size, - 8U, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!transfer_cmd) { - cmd_buffer->state.status = - vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) { + VkResult result; - return; + result = + pvr_cmd_copy_buffer_region(cmd_buffer, + src->dev_addr, + pCopyBufferInfo->pRegions[i].srcOffset, + dst->dev_addr, + pCopyBufferInfo->pRegions[i].dstOffset, + pCopyBufferInfo->pRegions[i].size); + if (result != VK_SUCCESS) + return; } - - transfer_cmd->src = src; - transfer_cmd->dst = dst; - transfer_cmd->region_count = pCopyBufferInfo->regionCount; - memcpy(transfer_cmd->regions, pCopyBufferInfo->pRegions, regions_size); - - pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd); } /** diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c 
index 7a01b54..c7444ad 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -32,6 +32,7 @@ #include "hwdef/rogue_hw_defs.h" #include "hwdef/rogue_hw_utils.h" #include "pvr_bo.h" +#include "pvr_common.h" #include "pvr_csb.h" #include "pvr_csb_enum_helpers.h" #include "pvr_device_info.h" @@ -604,37 +605,6 @@ err_csb_finish: return result; } -struct pvr_combined_image_sampler_descriptor { - /* | TEXSTATE_IMAGE_WORD0 | TEXSTATE_{STRIDE_,}IMAGE_WORD1 | */ - uint64_t image[ROGUE_NUM_TEXSTATE_IMAGE_WORDS]; - union pvr_sampler_descriptor sampler; -}; - -#define CHECK_STRUCT_FIELD_SIZE(_struct_type, _field_name, _size) \ - static_assert(sizeof(((struct _struct_type *)NULL)->_field_name) == \ - (_size), \ - "Size of '" #_field_name "' in '" #_struct_type \ - "' differs from expected") - -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - ROGUE_NUM_TEXSTATE_IMAGE_WORDS * sizeof(uint64_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - PVR_IMAGE_DESCRIPTOR_SIZE * sizeof(uint32_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + - pvr_cmd_length(TEXSTATE_IMAGE_WORD1)) * - sizeof(uint32_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + - pvr_cmd_length(TEXSTATE_STRIDE_IMAGE_WORD1)) * - sizeof(uint32_t)); - -#undef CHECK_STRUCT_FIELD_SIZE - static VkResult pvr_setup_texture_state_words( struct pvr_device *device, struct pvr_combined_image_sampler_descriptor *descriptor, diff --git a/src/imagination/vulkan/pvr_common.h b/src/imagination/vulkan/pvr_common.h index b6d683d..04caadb 100644 --- a/src/imagination/vulkan/pvr_common.h +++ b/src/imagination/vulkan/pvr_common.h @@ -39,9 +39,11 @@ * relevant for the driver/compiler interface (no Vulkan types). */ +#include "hwdef/rogue_hw_defs.h" #include "pvr_limits.h" #include "pvr_types.h" #include "util/list.h" +#include "util/macros.h" #include "vk_object.h" #include "vk_sync.h" @@ -146,6 +148,13 @@ enum pvr_stage_allocation { PVR_STAGE_ALLOCATION_COUNT }; +enum pvr_filter { + PVR_FILTER_DONTCARE, /* Any filtering mode is acceptable. */ + PVR_FILTER_POINT, + PVR_FILTER_LINEAR, + PVR_FILTER_BICUBIC, +}; + enum pvr_resolve_op { PVR_RESOLVE_BLEND, PVR_RESOLVE_MIN, @@ -202,6 +211,42 @@ union pvr_sampler_descriptor { } data; }; +struct pvr_combined_image_sampler_descriptor { + /* | TEXSTATE_IMAGE_WORD0 | TEXSTATE_{STRIDE_,}IMAGE_WORD1 | */ + uint64_t image[ROGUE_NUM_TEXSTATE_IMAGE_WORDS]; + union pvr_sampler_descriptor sampler; +}; + +#define CHECK_STRUCT_FIELD_SIZE(_struct_type, _field_name, _size) \ + static_assert(sizeof(((struct _struct_type *)NULL)->_field_name) == \ + (_size), \ + "Size of '" #_field_name "' in '" #_struct_type \ + "' differs from expected") + +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + ROGUE_NUM_TEXSTATE_IMAGE_WORDS * sizeof(uint64_t)); +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + PVR_IMAGE_DESCRIPTOR_SIZE * sizeof(uint32_t)); +#if 0 +/* TODO: Don't really want to include pvr_csb.h in here since this header is + * shared with the compiler. Figure out a better place for these. 
+ */ +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + + pvr_cmd_length(TEXSTATE_IMAGE_WORD1)) * + sizeof(uint32_t)); +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + + pvr_cmd_length(TEXSTATE_STRIDE_IMAGE_WORD1)) * + sizeof(uint32_t)); +#endif + +#undef CHECK_STRUCT_FIELD_SIZE + struct pvr_sampler { struct vk_object_base base; diff --git a/src/imagination/vulkan/pvr_device.c b/src/imagination/vulkan/pvr_device.c index 9bec439..7a65deb 100644 --- a/src/imagination/vulkan/pvr_device.c +++ b/src/imagination/vulkan/pvr_device.c @@ -1265,6 +1265,7 @@ static VkResult pvr_pds_idfwdf_programs_create_and_upload( 0, shareds, shareds_buffer_addr.addr, + false, dev_info); /* DMA temp regs. */ diff --git a/src/imagination/vulkan/pvr_job_context.c b/src/imagination/vulkan/pvr_job_context.c index 159ab7c..f809651 100644 --- a/src/imagination/vulkan/pvr_job_context.c +++ b/src/imagination/vulkan/pvr_job_context.c @@ -25,21 +25,25 @@ #include #include #include +#include #include #include "hwdef/rogue_hw_utils.h" #include "pvr_bo.h" #include "pvr_cdm_load_sr.h" +#include "pvr_common.h" #include "pvr_csb.h" #include "pvr_job_context.h" #include "pvr_pds.h" #include "pvr_private.h" +#include "pvr_transfer_frag_store.h" #include "pvr_types.h" #include "pvr_uscgen.h" #include "pvr_vdm_load_sr.h" #include "pvr_vdm_store_sr.h" #include "pvr_winsys.h" #include "util/macros.h" +#include "util/os_file.h" #include "util/u_dynarray.h" #include "vk_alloc.h" #include "vk_log.h" @@ -1259,15 +1263,30 @@ static void pvr_transfer_eot_shaders_fini(struct pvr_device *device, static VkResult pvr_transfer_ctx_shaders_init(struct pvr_device *device, struct pvr_transfer_ctx *ctx) { - /* TODO: Setup USC fragments. 
*/ + VkResult result; + + result = pvr_transfer_frag_store_init(device, &ctx->frag_store); + if (result != VK_SUCCESS) + goto err_out; + + result = pvr_transfer_eot_shaders_init(device, ctx); + if (result != VK_SUCCESS) + goto err_frag_store_fini; - return pvr_transfer_eot_shaders_init(device, ctx); + return VK_SUCCESS; + +err_frag_store_fini: + pvr_transfer_frag_store_fini(device, &ctx->frag_store); + +err_out: + return result; } static void pvr_transfer_ctx_shaders_fini(struct pvr_device *device, struct pvr_transfer_ctx *ctx) { pvr_transfer_eot_shaders_fini(device, ctx); + pvr_transfer_frag_store_fini(device, &ctx->frag_store); } VkResult pvr_transfer_ctx_create(struct pvr_device *const device, diff --git a/src/imagination/vulkan/pvr_job_context.h b/src/imagination/vulkan/pvr_job_context.h index 5a09f9f..b07bee7 100644 --- a/src/imagination/vulkan/pvr_job_context.h +++ b/src/imagination/vulkan/pvr_job_context.h @@ -24,8 +24,11 @@ #ifndef PVR_JOB_CONTEXT_H #define PVR_JOB_CONTEXT_H +#include "pvr_common.h" #include "pvr_private.h" +#include "pvr_transfer_frag_store.h" #include "pvr_types.h" +#include "pvr_uscgen.h" #include "pvr_winsys.h" /* Support PDS code/data loading/storing to the 'B' shared register state @@ -143,6 +146,8 @@ struct pvr_transfer_ctx { struct pvr_winsys_transfer_ctx *ws_ctx; + struct pvr_transfer_frag_store frag_store; + struct pvr_bo *usc_eot_bos[PVR_TRANSFER_MAX_RENDER_TARGETS]; struct pvr_pds_upload pds_unitex_code[PVR_TRANSFER_MAX_TEXSTATE_DMA] diff --git a/src/imagination/vulkan/pvr_job_transfer.c b/src/imagination/vulkan/pvr_job_transfer.c index a0c2e79..44a8cf9 100644 --- a/src/imagination/vulkan/pvr_job_transfer.c +++ b/src/imagination/vulkan/pvr_job_transfer.c @@ -21,84 +21,4931 @@ * SOFTWARE. */ +#include #include -#include #include +#include +#include #include +#include "pvr_csb.h" +#include "pvr_csb_enum_helpers.h" +#include "pvr_formats.h" #include "pvr_job_common.h" #include "pvr_job_context.h" #include "pvr_job_transfer.h" #include "pvr_private.h" +#include "pvr_tex_state.h" +#include "pvr_transfer_frag_store.h" +#include "pvr_types.h" +#include "pvr_uscgen.h" +#include "pvr_util.h" #include "pvr_winsys.h" +#include "util/bitscan.h" #include "util/list.h" #include "util/macros.h" +#include "util/u_math.h" +#include "util/xxhash.h" +#include "vk_format.h" +#include "vk_log.h" #include "vk_sync.h" -/* FIXME: Implement gpu based transfer support. */ -VkResult pvr_transfer_job_submit(struct pvr_device *device, - struct pvr_transfer_ctx *ctx, - struct pvr_sub_cmd_transfer *sub_cmd, - struct vk_sync *wait_sync, - struct vk_sync *signal_sync) +#define PVR_TRANSFER_MAX_PASSES 10U +#define PVR_TRANSFER_MAX_CLIP_RECTS 4U +#define PVR_TRANSFER_MAX_PREPARES_PER_SUBMIT 16U + +/* Number of triangles sent to the TSP per raster. */ +#define PVR_TRANSFER_NUM_LAYERS 1U + +#define PVR_MAX_WIDTH 16384 +#define PVR_MAX_HEIGHT 16384 + +#define PVR_MAX_CLIP_SIZE(dev_info) \ + (PVR_HAS_FEATURE(dev_info, screen_size8K) ? 8192U : 16384U) + +enum pvr_paired_tiles { + PVR_PAIRED_TILES_NONE, + PVR_PAIRED_TILES_X, + PVR_PAIRED_TILES_Y +}; + +struct pvr_transfer_pass { + uint32_t dst_offset; + + uint32_t src_offset; + uint32_t mapping_count; + struct pvr_rect_mapping mappings[PVR_TRANSFER_MAX_CUSTOM_MAPPINGS]; + bool extend_height; + bool byte_unwind; + + uint32_t clip_rects_count; + VkRect2D clip_rects[PVR_TRANSFER_MAX_CLIP_RECTS]; +}; + +/* Structure representing a layer iteration. 
*/ +struct pvr_transfer_custom_mapping { + bool double_stride; + uint32_t texel_unwind_src; + uint32_t texel_unwind_dst; + uint32_t texel_extend_src; + uint32_t texel_extend_dst; + uint32_t pass_count; + struct pvr_transfer_pass passes[PVR_TRANSFER_MAX_PASSES]; + uint32_t max_clip_rects; + int32_t max_clip_size; + uint32_t byte_unwind_src; +}; + +struct pvr_transfer_3d_iteration { + uint32_t texture_coords[12]; +}; + +struct pvr_transfer_3d_state { + struct pvr_winsys_transfer_regs regs; + + bool empty_dst; + bool down_scale; + /* Write all channels present in the dst from the USC even if those are + * constants. + */ + bool dont_force_pbe; + + /* The rate of the shader. */ + uint32_t msaa_multiplier; + /* Top left corner of the render in ISP tiles. */ + uint32_t origin_x_in_tiles; + /* Top left corner of the render in ISP tiles. */ + uint32_t origin_y_in_tiles; + /* Width of the render in ISP tiles. */ + uint32_t width_in_tiles; + /* Height of the render in ISP tiles. */ + uint32_t height_in_tiles; + + /* Width of a sample in registers (pixel partition width). */ + uint32_t usc_pixel_width; + + /* Properties of the USC shader. */ + struct pvr_tq_shader_properties shader_props; + + /* TODO: Use pvr_dev_addr_t of an offset type for these. */ + uint32_t pds_shader_task_offset; + uint32_t tex_state_data_offset; + uint32_t uni_tex_code_offset; + + uint32_t uniform_data_size; + uint32_t tex_state_data_size; + uint32_t usc_coeff_regs; + + /* Pointer into the common store. */ + uint32_t common_ptr; + /* Pointer into the dynamic constant reg buffer. */ + uint32_t dynamic_const_reg_ptr; + /* Pointer into the USC constant reg buffer. */ + uint32_t usc_const_reg_ptr; + + uint32_t pds_coeff_task_offset; + uint32_t coeff_data_size; + + /* Number of temporary 32bit registers used by PDS. */ + uint32_t pds_temps; + + struct pvr_transfer_custom_mapping custom_mapping; + uint32_t pass_idx; + + enum pvr_filter filter; + bool custom_filter; + + enum pvr_paired_tiles pair_tiles; +}; + +struct pvr_transfer_prep_data { + uint32_t flags; + struct pvr_transfer_3d_state state; +}; + +struct pvr_transfer_submit { + uint32_t prep_count; + struct pvr_transfer_prep_data + prep_array[PVR_TRANSFER_MAX_PREPARES_PER_SUBMIT]; +}; + +static enum pvr_transfer_pbe_pixel_src pvr_pbe_src_format_raw(VkFormat format) +{ + uint32_t bpp = vk_format_get_blocksizebits(format); + + if (bpp <= 32U) + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW32; + else if (bpp <= 64U) + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW128; +} + +/** + * How the PBE expects the output buffer for an RGBA space conversion. 
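+ *
+ * For example, copying R8G8B8A8_UINT to R8G8B8A8_SINT (unsigned source,
+ * signed destination, 8-bit channels) selects
+ * PVR_TRANSFER_PBE_PIXEL_SRC_US8888 in the switch below.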
+ */ +static VkResult +pvr_pbe_src_format_normal(VkFormat src_format, + VkFormat dst_format, + bool down_scale, + bool dont_force_pbe, + enum pvr_transfer_pbe_pixel_src *src_format_out) +{ + bool dst_signed = vk_format_is_sint(dst_format); + + if (vk_format_is_int(dst_format)) { + uint32_t red_width; + bool src_signed; + uint32_t count; + + if (!vk_format_is_int(src_format)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + src_signed = vk_format_is_sint(src_format); + + red_width = vk_format_get_component_bits(dst_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0); + + switch (red_width) { + case 8: + if (!src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_UU8888; + else if (src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SU8888; + else if (!src_signed && dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_US8888; + else + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SS8888; + + break; + + case 10: + switch (dst_format) { + case VK_FORMAT_A2B10G10R10_UINT_PACK32: + *src_format_out = src_signed ? PVR_TRANSFER_PBE_PIXEL_SRC_SU1010102 + : PVR_TRANSFER_PBE_PIXEL_SRC_UU1010102; + break; + + case VK_FORMAT_A2R10G10B10_UINT_PACK32: + *src_format_out = src_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_SU1010102 + : PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_UU1010102; + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + break; + + case 16: + if (!src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16; + else if (src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SU16U16; + else if (!src_signed && dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_US16S16; + else + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16; + + break; + + case 32: + if (dont_force_pbe) { + count = vk_format_get_blocksizebits(dst_format) / 32U; + } else { + count = + vk_format_get_common_color_channel_count(src_format, dst_format); + } + + if (!src_signed && !dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_RAW128 + : PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + } else if (src_signed && !dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_S4XU32 + : PVR_TRANSFER_PBE_PIXEL_SRC_SU32U32; + } else if (!src_signed && dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_U4XS32 + : PVR_TRANSFER_PBE_PIXEL_SRC_US32S32; + } else { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_RAW128 + : PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + } + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + } else if (vk_format_is_float(dst_format) || + vk_format_is_normalized(dst_format)) { + bool is_float = true; + + if (!vk_format_is_float(src_format) && + !vk_format_is_normalized(src_format)) { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + if (vk_format_is_normalized(dst_format)) { + uint32_t chan_width; + + is_float = false; + + /* Alpha only. */ + switch (dst_format) { + case VK_FORMAT_D16_UNORM: + chan_width = 16; + break; + + default: + chan_width = + vk_format_get_component_bits(dst_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0U); + break; + } + + if (src_format == dst_format) { + switch (chan_width) { + case 16U: + if (down_scale) { + *src_format_out = dst_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM + : PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM; + } else { + *src_format_out = dst_signed + ? 
PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16 + : PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16; + } + break; + + case 32U: + *src_format_out = pvr_pbe_src_format_raw(dst_format); + break; + default: + is_float = true; + break; + } + } else { + switch (chan_width) { + case 16U: + *src_format_out = dst_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM + : PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM; + break; + default: + is_float = true; + break; + } + } + } + + if (is_float) { + if (vk_format_has_32bit_component(dst_format)) { + uint32_t count; + + if (dont_force_pbe) { + count = vk_format_get_blocksizebits(dst_format) / 32U; + } else { + count = vk_format_get_common_color_channel_count(src_format, + dst_format); + } + + switch (count) { + case 1U: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32; + break; + case 2U: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32X2; + break; + default: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32X4; + break; + } + } else { + if (dst_format == VK_FORMAT_B8G8R8A8_UNORM || + dst_format == VK_FORMAT_R8G8B8A8_UNORM) { + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8; + } else { + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F16F16; + } + } + } + } else { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + return VK_SUCCESS; +} + +static inline uint32_t +pvr_get_blit_flags(const struct pvr_transfer_cmd *transfer_cmd) +{ + return transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FAST2D + ? 0 + : transfer_cmd->flags; +} + +static VkResult pvr_pbe_src_format(struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_tq_shader_properties *prop) +{ + struct pvr_tq_layer_properties *layer = &prop->layer_props; + VkFormat dst_format = transfer_cmd->dst.vk_format; + VkFormat src_format; + bool down_scale; + + if (transfer_cmd->src_present) { + src_format = transfer_cmd->src.vk_format; + down_scale = transfer_cmd->resolve_op == PVR_RESOLVE_BLEND && + transfer_cmd->src.sample_count > 1U && + transfer_cmd->dst.sample_count <= 1U; + } else { + src_format = dst_format; + down_scale = false; + } + + /* This has to come before the rest as S8 for instance is integer and + * signedness check fails on D24S8. + */ + if (!down_scale && + (vk_format_is_depth_or_stencil(src_format) || + vk_format_is_depth_or_stencil(dst_format) || + pvr_get_blit_flags(transfer_cmd) & PVR_TRANSFER_CMD_FLAGS_DSMERGE)) { + pvr_finishme("Complete pvr_pbe_src_format()."); + } + + return pvr_pbe_src_format_normal(src_format, + dst_format, + down_scale, + state->dont_force_pbe, + &layer->pbe_format); +} + +static inline void pvr_setup_hwbg_object(const struct pvr_device_info *dev_info, + struct pvr_transfer_3d_state *state) +{ + struct pvr_winsys_transfer_regs *regs = &state->regs; + + pvr_csb_pack (®s->pds_bgnd0_base, CR_PDS_BGRND0_BASE, reg) { + reg.shader_addr = PVR_DEV_ADDR(state->pds_shader_task_offset); + assert(pvr_dev_addr_is_aligned( + reg.shader_addr, + PVRX(CR_PDS_BGRND0_BASE_SHADER_ADDR_ALIGNMENT))); + reg.texunicode_addr = PVR_DEV_ADDR(state->uni_tex_code_offset); + assert(pvr_dev_addr_is_aligned( + reg.texunicode_addr, + PVRX(CR_PDS_BGRND0_BASE_TEXUNICODE_ADDR_ALIGNMENT))); + } + + pvr_csb_pack (®s->pds_bgnd1_base, CR_PDS_BGRND1_BASE, reg) { + reg.texturedata_addr = PVR_DEV_ADDR(state->tex_state_data_offset); + assert(pvr_dev_addr_is_aligned( + reg.texturedata_addr, + PVRX(CR_PDS_BGRND1_BASE_TEXTUREDATA_ADDR_ALIGNMENT))); + } + + /* BGRND 2 not needed, background object PDS doesn't use uniform program. 
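+ * To summarise the words packed here: BGRND0 holds the PDS shader and
+ * texunicode program bases, BGRND1 the texture state data base, and
+ * BGRND3 the shared/uniform/texture-state/temp size fields.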
*/ + + pvr_csb_pack (®s->pds_bgnd3_sizeinfo, CR_PDS_BGRND3_SIZEINFO, reg) { + reg.usc_sharedsize = + DIV_ROUND_UP(state->common_ptr, + PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE)); + + assert(!(state->uniform_data_size & + (PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_UNIFORMSIZE_UNIT_SIZE) - 1))); + reg.pds_uniformsize = + state->uniform_data_size / + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_UNIFORMSIZE_UNIT_SIZE); + + assert( + !(state->tex_state_data_size & + (PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE) - 1))); + reg.pds_texturestatesize = + state->tex_state_data_size / + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE); + + reg.pds_tempsize = + DIV_ROUND_UP(state->pds_temps, + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE)); + } +} + +static inline bool +pvr_is_surface_aligned(pvr_dev_addr_t dev_addr, bool is_input, uint32_t bpp) { + /* 96 bpp is 32 bit granular. */ + if (bpp == 64U || bpp == 128U) { + uint64_t mask = (uint64_t)((bpp >> 3U) - 1U); + + if ((dev_addr.addr & mask) != 0ULL) + return false; + } + + if (is_input) { + if ((dev_addr.addr & + (PVRX(TEXSTATE_STRIDE_IMAGE_WORD1_TEXADDR_ALIGNMENT) - 1U)) != + 0ULL) { + return false; + } + } else { + if ((dev_addr.addr & + (PVRX(PBESTATE_STATE_WORD0_ADDRESS_LOW_ALIGNMENT) - 1U)) != 0ULL) { + return false; + } + } + + return true; +} + +static inline VkResult +pvr_mem_layout_spec(const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + bool is_input, + uint32_t *width_out, + uint32_t *height_out, + uint32_t *stride_out, + enum pvr_memlayout *mem_layout_out, + pvr_dev_addr_t *dev_addr_out) +{ + const uint32_t bpp = vk_format_get_blocksizebits(surface->vk_format); + uint32_t unsigned_stride; + + *mem_layout_out = surface->mem_layout; + *height_out = surface->height; + *width_out = surface->width; + *stride_out = surface->stride; + *dev_addr_out = surface->dev_addr; + + if (surface->mem_layout != PVR_MEMLAYOUT_LINEAR && + !pvr_is_surface_aligned(*dev_addr_out, is_input, bpp)) { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + switch (surface->mem_layout) { + case PVR_MEMLAYOUT_LINEAR: + if (surface->stride == 0U) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + unsigned_stride = *stride_out; + + if (!pvr_is_surface_aligned(*dev_addr_out, is_input, bpp)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (unsigned_stride < *width_out) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (!is_input) { + if (unsigned_stride == 1U) { + /* Change the setup to twiddling as that doesn't hit the stride + * limit and twiddled == strided when 1px stride. 
+ */ + *mem_layout_out = PVR_MEMLAYOUT_TWIDDLED; + } + } + + *stride_out = unsigned_stride; + break; + + case PVR_MEMLAYOUT_TWIDDLED: + case PVR_MEMLAYOUT_3DTWIDDLED: + if (surface->stride != 0U) + mesa_logi("Ignoring stride value for twiddled/tiled surface!"); + + *stride_out = *width_out; + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + return VK_SUCCESS; +} + +static VkResult +pvr_pbe_setup_codegen_defaults(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_pbe_surf_params *surface_params, + struct pvr_pbe_render_params *render_params) +{ + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + const uint8_t *swizzle; + VkFormat format; VkResult result; - result = vk_sync_wait(&device->vk, - wait_sync, - 0U, - VK_SYNC_WAIT_COMPLETE, - UINT64_MAX); + switch (dst->vk_format) { + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + format = VK_FORMAT_R32_UINT; + break; + + default: + format = dst->vk_format; + break; + } + + swizzle = pvr_get_format_swizzle(format); + memcpy(surface_params->swizzle, swizzle, sizeof(surface_params->swizzle)); + + pvr_pbe_get_src_format_and_gamma(format, + PVR_PBE_GAMMA_NONE, + false, + &surface_params->source_format, + &surface_params->gamma); + + surface_params->is_normalized = vk_format_is_normalized(format); + surface_params->pbe_packmode = pvr_get_pbe_packmode(format); + surface_params->nr_components = vk_format_get_nr_components(format); + + result = pvr_mem_layout_spec(dst, + 0U, + false, + &surface_params->width, + &surface_params->height, + &surface_params->stride, + &surface_params->mem_layout, + &surface_params->addr); if (result != VK_SUCCESS) return result; - list_for_each_entry_safe (struct pvr_transfer_cmd, - transfer_cmd, - &sub_cmd->transfer_cmds, - link) { - bool src_mapped = false; - bool dst_mapped = false; - void *src_addr; - void *dst_addr; - void *ret_ptr; + surface_params->z_only_render = false; + surface_params->gamma = PVR_PBE_GAMMA_NONE; + surface_params->depth = dst->depth; + surface_params->down_scale = state->down_scale; + + if (surface_params->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + render_params->slice = (uint32_t)MAX2(dst->z_position, 0.0f); + else + render_params->slice = 0U; + + uint32_t tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 0U); + uint32_t tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 0U); + + /* If the rectangle happens to be empty / off-screen we clip away + * everything. 
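+ * This is done by placing the clip rectangle wholly outside the
+ * 1x1-tile render area programmed below, so no tile emits pixels.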
+ */ + if (state->empty_dst) { + render_params->min_x_clip = 2U * tile_size_x; + render_params->max_x_clip = 3U * tile_size_x; + render_params->min_y_clip = 2U * tile_size_y; + render_params->max_y_clip = 3U * tile_size_y; + state->origin_x_in_tiles = 0U; + state->origin_y_in_tiles = 0U; + state->height_in_tiles = 1U; + state->width_in_tiles = 1U; + } else { + const VkRect2D *scissor = &transfer_cmd->scissor; + + /* Clamp */ + render_params->min_x_clip = + MAX2(MIN2(scissor->offset.x, (int32_t)surface_params->width), 0U); + render_params->max_x_clip = + MAX2(MIN2(scissor->offset.x + scissor->extent.width, + (int32_t)surface_params->width), + 0U) - + 1U; + + render_params->min_y_clip = + MAX2(MIN2(scissor->offset.y, surface_params->height), 0U); + render_params->max_y_clip = + MAX2(MIN2(scissor->offset.y + scissor->extent.height, + surface_params->height), + 0U) - + 1U; + + if (state->custom_mapping.pass_count > 0U) { + struct pvr_transfer_pass *pass = + &state->custom_mapping.passes[state->pass_idx]; + + render_params->min_x_clip = (uint32_t)pass->clip_rects[0U].offset.x; + render_params->max_x_clip = + (uint32_t)(pass->clip_rects[0U].offset.x + + pass->clip_rects[0U].extent.width) - + 1U; + render_params->min_y_clip = (uint32_t)pass->clip_rects[0U].offset.y; + render_params->max_y_clip = + (uint32_t)(pass->clip_rects[0U].offset.y + + pass->clip_rects[0U].extent.height) - + 1U; + } + + state->origin_x_in_tiles = render_params->min_x_clip / tile_size_x; + state->origin_y_in_tiles = render_params->min_y_clip / tile_size_y; + state->width_in_tiles = + (render_params->max_x_clip + tile_size_x) / tile_size_x; + state->height_in_tiles = + (render_params->max_y_clip + tile_size_y) / tile_size_y; + + /* Be careful here as this isn't the same as ((max_x_clip - + * min_x_clip) + tile_size_x) >> tile_size_x. 
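+ * e.g. tile_size_x = 32, min_x_clip = 40, max_x_clip = 100 covers tiles
+ * 1..3, and (100 + 32) / 32 - 40 / 32 = 4 - 1 = 3 tiles, whereas
+ * ((100 - 40) + 32) / 32 = 2 would drop a tile.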
+ */ + state->width_in_tiles -= state->origin_x_in_tiles; + state->height_in_tiles -= state->origin_y_in_tiles; + } + + render_params->source_start = PVR_PBE_STARTPOS_BIT0; + render_params->mrt_index = 0U; + + return VK_SUCCESS; +} + +static VkResult +pvr_pbe_setup_modify_defaults(const struct pvr_transfer_cmd_surface *dst, + struct pvr_transfer_3d_state *state, + uint32_t rt_idx, + struct pvr_pbe_surf_params *surf_params, + struct pvr_pbe_render_params *render_params) +{ + pvr_finishme("Implement pvr_pbe_setup_modify_defaults()."); + return VK_SUCCESS; +} + +static uint32_t +pvr_pbe_get_pixel_size(enum pvr_transfer_pbe_pixel_src pixel_format) +{ + switch (pixel_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D24_D32: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_S8D24_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_SU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_UU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SS8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_SWAP_LMSB: + case PVR_TRANSFER_PBE_PIXEL_SRC_US8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU8888: + return 1U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_MOV_BY45: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW64: + case PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU16U16: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU32U32: + case PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_US16S16: + case PVR_TRANSFER_PBE_PIXEL_SRC_US32S32: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16: + return 2U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW128: + case PVR_TRANSFER_PBE_PIXEL_SRC_S4XU32: + case PVR_TRANSFER_PBE_PIXEL_SRC_U4XS32: + return 4U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_NUM: + default: + break; + } + + return 0U; +} - /* Map if bo is not mapped. 
*/ - if (!transfer_cmd->src->vma->bo->map) { - src_mapped = true; - ret_ptr = device->ws->ops->buffer_map(transfer_cmd->src->vma->bo); - if (!ret_ptr) - return vk_error(device, VK_ERROR_MEMORY_MAP_FAILED); +static void pvr_pbe_setup_swizzle(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_pbe_surf_params *surf_params) +{ + bool color_fill = !!(transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL); + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + + const uint32_t pixel_size = + pvr_pbe_get_pixel_size(state->shader_props.layer_props.pbe_format); + + state->usc_pixel_width = MAX2(pixel_size, 1U); + + switch (dst->vk_format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + unreachable("Handle depth stencil format swizzle."); + break; + + default: { + const uint32_t red_width = + vk_format_get_component_bits(dst->vk_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0U); + + if (transfer_cmd->src_present && vk_format_is_alpha(dst->vk_format)) { + if (vk_format_has_alpha(transfer_cmd->src.vk_format)) { + /* Modify the destination format swizzle to always source from + * src0. + */ + surf_params->swizzle[0U] = PIPE_SWIZZLE_X; + surf_params->swizzle[1U] = PIPE_SWIZZLE_0; + surf_params->swizzle[2U] = PIPE_SWIZZLE_0; + surf_params->swizzle[3U] = PIPE_SWIZZLE_1; + break; + } + + /* Source format having no alpha channel still allocates 4 output + * buffer registers. + */ + } + + if (vk_format_is_normalized(dst->vk_format)) { + if (color_fill && (dst->vk_format == VK_FORMAT_B8G8R8A8_UNORM || + dst->vk_format == VK_FORMAT_R8G8B8A8_UNORM)) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_8_PER_CHANNEL); + } else if (state->shader_props.layer_props.pbe_format == + PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_8_PER_CHANNEL); + } else if (red_width <= 8U) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_F16_PER_CHANNEL); + } + } else if (red_width == 32U && !state->dont_force_pbe) { + uint32_t count = 0U; + + if (transfer_cmd->src_present) { + VkFormat src_format = transfer_cmd->src.vk_format; + count = vk_format_get_common_color_channel_count(src_format, + dst->vk_format); + } + + switch (count) { + case 1U: + surf_params->swizzle[1U] = PIPE_SWIZZLE_0; + FALLTHROUGH; + case 2U: + surf_params->swizzle[2U] = PIPE_SWIZZLE_0; + FALLTHROUGH; + case 3U: + surf_params->swizzle[3U] = PIPE_SWIZZLE_1; + break; + + case 4U: + default: + break; + } + } else { + unreachable("Invalid case in pvr_pbe_setup_swizzle."); + } + break; + } + } +} + +/** + * Calculates the required PBE byte mask based on the incoming transfer command. 
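+ * For a depth/stencil merge the set bits disable byte writes: with
+ * VK_FORMAT_D24_UNORM_S8_UINT, 0x88888888U flags the stencil byte of each
+ * packed pixel, so PICKD leaves stencil intact and updates depth, while
+ * the inverted mask does the opposite.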
+ *
+ * @param transfer_cmd the transfer command
+ * @return the bytemask (active-high disable mask)
+ */
+
+static uint64_t pvr_pbe_byte_mask(const struct pvr_device_info *dev_info,
+                                  const struct pvr_transfer_cmd *transfer_cmd)
+{
+   uint32_t flags = pvr_get_blit_flags(transfer_cmd);
+
+   assert(PVR_HAS_ERN(dev_info, 42064));
+
+   if (flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) {
+      uint32_t mask = 0U;
+
+      switch (transfer_cmd->dst.vk_format) {
+      case VK_FORMAT_D32_SFLOAT_S8_UINT:
+         mask = 0xF0F0F0F0U;
+         break;
+      case VK_FORMAT_D24_UNORM_S8_UINT:
+         mask = 0x88888888U;
+         break;
+      default:
+         break;
+      }
+
+      if ((flags & PVR_TRANSFER_CMD_FLAGS_PICKD) == 0U)
+         mask = ~mask;
+
+      return mask;
+   }
+
+   /* The mask is returned as if it was inactive on cores without the ERN.
+    * This keeps the firmware agnostic to the feature.
+    */
+   return 0U;
+}
+
+static VkResult pvr_pbe_setup_emit(const struct pvr_transfer_cmd *transfer_cmd,
+                                   struct pvr_transfer_ctx *ctx,
+                                   struct pvr_transfer_3d_state *state,
+                                   uint32_t rt_count,
+                                   uint32_t *pbe_setup_words)
+{
+   struct pvr_device *const device = ctx->device;
+   const struct pvr_device_info *const dev_info = &device->pdevice->dev_info;
+
+   struct pvr_winsys_transfer_regs *regs = &state->regs;
+   struct pvr_pds_event_program program = {
+      .emit_words = pbe_setup_words,
+      .num_emit_word_pairs = rt_count,
+   };
+   struct pvr_pds_upload pds_upload;
+   uint32_t staging_buffer_size;
+   uint32_t *staging_buffer;
+   pvr_dev_addr_t addr;
+   VkResult result;
+
+   /* Precondition: make sure to use a valid index for ctx->usc_eot_bos. */
+   assert(rt_count <= ARRAY_SIZE(ctx->usc_eot_bos));
+   assert(rt_count > 0U);
+
+   addr.addr = ctx->usc_eot_bos[rt_count - 1U]->vma->dev_addr.addr -
+               device->heaps.usc_heap->base_addr.addr;
+
+   pvr_pds_setup_doutu(&program.task_control,
+                       addr.addr,
+                       0U,
+                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                       false);
+
+   pvr_pds_set_sizes_pixel_event(&program, dev_info);
+
+   staging_buffer_size =
+      (program.code_size + program.data_size) * sizeof(*staging_buffer);
+
+   staging_buffer = vk_alloc(&device->vk.alloc,
+                             staging_buffer_size,
+                             8U,
+                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!staging_buffer)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   pvr_pds_generate_pixel_event_data_segment(&program,
+                                             staging_buffer,
+                                             dev_info);
+
+   /* TODO: We can save some memory by generating a code segment for each
+    * rt_count, which at the time of writing is a maximum of 3, in
+    * pvr_setup_transfer_eot_shaders() when we set up the corresponding EOT
+    * USC programs.
+ */ + pvr_pds_generate_pixel_event_code_segment(&program, + staging_buffer + program.data_size, + dev_info); + + result = + pvr_cmd_buffer_upload_pds(transfer_cmd->cmd_buffer, + staging_buffer, + program.data_size, + PVRX(CR_EVENT_PIXEL_PDS_DATA_ADDR_ALIGNMENT), + staging_buffer + program.data_size, + program.code_size, + PVRX(CR_EVENT_PIXEL_PDS_CODE_ADDR_ALIGNMENT), + PVRX(CR_EVENT_PIXEL_PDS_DATA_ADDR_ALIGNMENT), + &pds_upload); + vk_free(&device->vk.alloc, staging_buffer); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->event_pixel_pds_info, CR_EVENT_PIXEL_PDS_INFO, reg) { + reg.temp_stride = 0U; + reg.const_size = + DIV_ROUND_UP(program.data_size, + PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE)); + reg.usc_sr_size = + DIV_ROUND_UP(rt_count * PVR_STATE_PBE_DWORDS, + PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE)); + } + + pvr_csb_pack (®s->event_pixel_pds_data, CR_EVENT_PIXEL_PDS_DATA, reg) { + reg.addr = PVR_DEV_ADDR(pds_upload.data_offset); + } + + pvr_csb_pack (®s->event_pixel_pds_code, CR_EVENT_PIXEL_PDS_CODE, reg) { + reg.addr = PVR_DEV_ADDR(pds_upload.code_offset); + } + + return VK_SUCCESS; +} + +static VkResult pvr_pbe_setup(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_3d_state *state) +{ + struct pvr_device *const device = ctx->device; + const struct pvr_device_info *const dev_info = &device->pdevice->dev_info; + + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + uint32_t num_rts = vk_format_get_plane_count(dst->vk_format); + uint32_t pbe_setup_words[PVR_TRANSFER_MAX_RENDER_TARGETS * + ROGUE_NUM_PBESTATE_STATE_WORDS]; + struct pvr_pbe_render_params render_params; + struct pvr_pbe_surf_params surf_params; + VkResult result; + + if (state->custom_mapping.pass_count > 0U) + num_rts = state->custom_mapping.passes[state->pass_idx].clip_rects_count; + + if (PVR_HAS_FEATURE(dev_info, paired_tiles)) + state->pair_tiles = PVR_PAIRED_TILES_NONE; + + for (uint32_t i = 0U; i < num_rts; i++) { + uint64_t *pbe_regs; + uint32_t *pbe_words; + + /* Ensure the access into the pbe_wordx_mrty is made within its bounds. */ + assert(i * ROGUE_NUM_PBESTATE_REG_WORDS < + ARRAY_SIZE(state->regs.pbe_wordx_mrty)); + /* Ensure the access into pbe_setup_words is made within its bounds. 
*/ + assert(i * ROGUE_NUM_PBESTATE_STATE_WORDS < ARRAY_SIZE(pbe_setup_words)); + + pbe_regs = &state->regs.pbe_wordx_mrty[i * ROGUE_NUM_PBESTATE_REG_WORDS]; + pbe_words = &pbe_setup_words[i * ROGUE_NUM_PBESTATE_STATE_WORDS]; + + if (PVR_HAS_ERN(dev_info, 42064)) + pbe_regs[2U] = 0UL; + + if (i == 0U) { + result = pvr_pbe_setup_codegen_defaults(dev_info, + transfer_cmd, + state, + &surf_params, + &render_params); + if (result != VK_SUCCESS) + return result; + } else { + result = pvr_pbe_setup_modify_defaults(dst, + state, + i, + &surf_params, + &render_params); + if (result != VK_SUCCESS) + return result; } - src_addr = - transfer_cmd->src->vma->bo->map + transfer_cmd->src->vma->bo_offset; - dst_addr = - transfer_cmd->dst->vma->bo->map + transfer_cmd->dst->vma->bo_offset; + pvr_pbe_setup_swizzle(transfer_cmd, state, &surf_params); - for (uint32_t i = 0; i < transfer_cmd->region_count; i++) { - VkBufferCopy2 *region = &transfer_cmd->regions[i]; + pvr_pbe_pack_state(dev_info, + &surf_params, + &render_params, + pbe_words, + pbe_regs); - memcpy(dst_addr + region->dstOffset, - src_addr + region->srcOffset, - region->size); + if (PVR_HAS_ERN(dev_info, 42064)) { + uint64_t temp_reg; + + pvr_csb_pack (&temp_reg, PBESTATE_REG_WORD2, reg) { + reg.sw_bytemask = pvr_pbe_byte_mask(dev_info, transfer_cmd); + } + + pbe_regs[2U] |= temp_reg; + } + + if (PVR_HAS_FEATURE(dev_info, paired_tiles)) { + if (pbe_regs[2U] & + (1ULL << PVRX(PBESTATE_REG_WORD2_PAIR_TILES_SHIFT))) { + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_TWIDDLED) + state->pair_tiles = PVR_PAIRED_TILES_Y; + else + state->pair_tiles = PVR_PAIRED_TILES_X; + } } + } + + result = + pvr_pbe_setup_emit(transfer_cmd, ctx, state, num_rts, pbe_setup_words); + if (result != VK_SUCCESS) + return result; - if (src_mapped) - device->ws->ops->buffer_unmap(transfer_cmd->src->vma->bo); + /* Adjust tile origin and width to include all emits. */ + if (state->custom_mapping.pass_count > 0U) { + const uint32_t tile_size_x = + PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 0U); + const uint32_t tile_size_y = + PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 0U); + struct pvr_transfer_pass *pass = + &state->custom_mapping.passes[state->pass_idx]; + VkOffset2D offset = { 0U, 0U }; + VkOffset2D end = { 0U, 0U }; - if (dst_mapped) - device->ws->ops->buffer_unmap(transfer_cmd->dst->vma->bo); + for (uint32_t i = 0U; i < pass->clip_rects_count; i++) { + VkRect2D *rect = &pass->clip_rects[i]; + + offset.x = MIN2(offset.x, rect->offset.x); + offset.y = MIN2(offset.y, rect->offset.y); + end.x = MAX2(end.x, rect->offset.x + rect->extent.width); + end.y = MAX2(end.y, rect->offset.y + rect->extent.height); + } + + state->origin_x_in_tiles = (uint32_t)offset.x / tile_size_x; + state->origin_y_in_tiles = (uint32_t)offset.y / tile_size_y; + state->width_in_tiles = + DIV_ROUND_UP((uint32_t)end.x, tile_size_x) - state->origin_x_in_tiles; + state->height_in_tiles = + DIV_ROUND_UP((uint32_t)end.y, tile_size_y) - state->origin_y_in_tiles; } - /* Given we are doing CPU based copy, completion fence should always be - * signaled. This should be fixed when GPU based copy is implemented. + return VK_SUCCESS; +} + +/** + * Writes the ISP tile registers according to the MSAA state. Sets up the USC + * pixel partition allocations and the number of tiles in flight. 
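+ *
+ * For example, with 2 ISP samples per pixel and 4X MSAA
+ * (msaa_multiplier = 4) only the Y dimension of the tile grid doubles,
+ * matching the "2 SAMPLES / ISP PIXEL" row of the table below.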
+ */
+static VkResult pvr_isp_tiles(const struct pvr_device *device,
+                              struct pvr_transfer_3d_state *state)
+{
+   const struct pvr_device_runtime_info *dev_runtime_info =
+      &device->pdevice->dev_runtime_info;
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   const uint32_t isp_samples =
+      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 1U);
+   uint32_t origin_x = state->origin_x_in_tiles;
+   uint32_t origin_y = state->origin_y_in_tiles;
+   uint32_t width = state->width_in_tiles;
+   uint32_t height = state->height_in_tiles;
+   uint32_t isp_tiles_in_flight;
+
+   /* msaa_multiplier is calculated by sample_count & ~1U. Given the sample
+    * count is always a power of two, we can get the sample count from
+    * msaa_multiplier using the following logic.
+    */
+   const uint32_t samples = MAX2(state->msaa_multiplier, 1U);
+
+   /* The isp_samples_per_pixel feature is also known as "2x/4x for free";
+    * when this is present SAMPLES_PER_PIXEL is 2/4, otherwise 1. The
+    * following logic should end up with these numbers:
+    *
+    * |---------------------------------|
+    * |      4 SAMPLES / ISP PIXEL      |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 1  |
+    * | 4X                    | 1  | 1  |
+    * |---------------------------------|
+    * |      2 SAMPLES / ISP PIXEL      |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 1  |
+    * | 4X                    | 1  | 2  |
+    * | 8X                    | 2  | 2  |
+    * |-----------------------+----+----|
+    * |      1 SAMPLE / ISP PIXEL       |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 2  |
+    * | 4X                    | 2  | 2  |
+    * |-----------------------+----+----|
+    */
+
+   origin_x <<= (state->msaa_multiplier >> (isp_samples + 1U)) & 1U;
+   origin_y <<= ((state->msaa_multiplier >> (isp_samples + 1U)) |
+                 (state->msaa_multiplier >> isp_samples)) &
+                1U;
+   width <<= (state->msaa_multiplier >> (isp_samples + 1U)) & 1U;
+   height <<= ((state->msaa_multiplier >> (isp_samples + 1U)) |
+               (state->msaa_multiplier >> isp_samples)) &
+              1U;
+
+   if (PVR_HAS_FEATURE(dev_info, paired_tiles) &&
+       state->pair_tiles != PVR_PAIRED_TILES_NONE) {
+      width = ALIGN_POT(width, 2U);
+      height = ALIGN_POT(height, 2U);
+   }
+
+   pvr_csb_pack (&state->regs.isp_mtile_size, CR_ISP_MTILE_SIZE, reg) {
+      reg.x = width;
+      reg.y = height;
+   }
+
+   pvr_csb_pack (&state->regs.isp_render_origin, CR_ISP_RENDER_ORIGIN, reg) {
+      reg.x = origin_x;
+      reg.y = origin_y;
+   }
+
+   pvr_setup_tiles_in_flight(dev_info,
+                             dev_runtime_info,
+                             pvr_cr_isp_aa_mode_type(samples),
+                             state->usc_pixel_width,
+                             state->pair_tiles != PVR_PAIRED_TILES_NONE,
+                             0,
+                             &isp_tiles_in_flight,
+                             &state->regs.usc_pixel_output_ctrl);
+
+   pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, reg) {
+      reg.process_empty_tiles = true;
+
+      if (PVR_HAS_FEATURE(dev_info, paired_tiles)) {
+         if (state->pair_tiles == PVR_PAIRED_TILES_X) {
+            reg.pair_tiles = true;
+         } else if (state->pair_tiles == PVR_PAIRED_TILES_Y) {
+            reg.pair_tiles = true;
+            reg.pair_tiles_vert = true;
+         }
+      }
+   }
+
+   state->regs.isp_ctl |= isp_tiles_in_flight;
+
+   return VK_SUCCESS;
+}
+
+static bool
+pvr_int_pbe_pixel_changes_dst_rate(const struct pvr_device_info *dev_info,
+                                   enum pvr_transfer_pbe_pixel_src pbe_format)
+{
+   /* We don't emulate rate change from the USC with the pbe_yuv feature.
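+    * When the rate change has to be emulated from the USC instead,
+    * pvr_uv_space() below promotes such layers to
+    * PVR_INT_COORD_SET_FLOATS_6.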
*/ + if (!PVR_HAS_FEATURE(dev_info, pbe_yuv) && + (pbe_format == PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED || + pbe_format == PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V)) { + return true; + } + + return false; +} + +/** + * Number of DWORDs from the unified store that floating texture coefficients + * take up. + */ +static void pvr_uv_space(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + const VkRect2D *dst_rect = &transfer_cmd->scissor; + + /* This also avoids division by 0 in pvr_dma_texture_floats(). */ + if (state->custom_mapping.pass_count == 0U && + (dst_rect->extent.width == 0U || dst_rect->extent.height == 0U || + MAX2(dst_rect->offset.x, dst_rect->offset.x + dst_rect->extent.width) < + 0U || + MIN2(dst_rect->offset.x, dst_rect->offset.x + dst_rect->extent.width) > + (int32_t)dst->width || + MAX2(dst_rect->offset.y, dst_rect->offset.y + dst_rect->extent.height) < + 0U || + MIN2(dst_rect->offset.y, dst_rect->offset.y + dst_rect->extent.height) > + (int32_t)dst->height)) { + state->empty_dst = true; + } else { + state->empty_dst = false; + + if (transfer_cmd->src_present) { + struct pvr_tq_layer_properties *layer = + &state->shader_props.layer_props; + + const VkRect2D *src_rect = &transfer_cmd->mappings[0U].src_rect; + const VkRect2D *dst_rect = &transfer_cmd->mappings[0U].dst_rect; + int32_t dst_x1 = dst_rect->offset.x + dst_rect->extent.width; + int32_t dst_y1 = dst_rect->offset.y + dst_rect->extent.height; + int32_t src_x1 = src_rect->offset.x + src_rect->extent.width; + int32_t src_y1 = src_rect->offset.y + src_rect->extent.height; + + if (state->filter > PVR_FILTER_POINT) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_4; + } else if (src_rect->extent.width == 0U || + src_rect->extent.height == 0U) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_0; + } else if ((src_rect->offset.x * dst_x1 != + src_x1 * dst_rect->offset.x) || + (src_rect->offset.y * dst_y1 != + src_y1 * dst_rect->offset.y) || + (src_rect->extent.width != dst_rect->extent.width) || + (src_rect->extent.height != dst_rect->extent.height)) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_4; + } else { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_0; + } + + /* We have to adjust the rate. 
*/ + if (layer->layer_floats != PVR_INT_COORD_SET_FLOATS_0 && + pvr_int_pbe_pixel_changes_dst_rate(dev_info, layer->pbe_format)) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_6; + } + } + } +} + +static uint32_t pvr_int_pbe_pixel_num_sampler_and_image_states( + enum pvr_transfer_pbe_pixel_src pbe_format, + uint32_t alpha_type) +{ + switch (pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED: + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V: + return 1U; + default: + return pvr_pbe_pixel_num_loads(pbe_format, alpha_type); + } +} + +static VkResult pvr_sampler_state_for_surface( + const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *surface, + enum pvr_filter filter, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t sampler, + uint32_t *mem_ptr) +{ + uint64_t sampler_state[2U] = { 0UL, 0UL }; + + pvr_csb_pack (&sampler_state[0U], TEXSTATE_SAMPLER, reg) { + reg.anisoctl = PVRX(TEXSTATE_ANISOCTL_DISABLED); + reg.minlod = PVRX(TEXSTATE_CLAMP_MIN); + reg.maxlod = PVRX(TEXSTATE_CLAMP_MIN); + reg.dadjust = PVRX(TEXSTATE_DADJUST_MIN_UINT); + + if (filter == PVR_FILTER_DONTCARE || filter == PVR_FILTER_POINT) { + reg.minfilter = PVRX(TEXSTATE_FILTER_POINT); + reg.magfilter = PVRX(TEXSTATE_FILTER_POINT); + } else if (filter == PVR_FILTER_LINEAR) { + reg.minfilter = PVRX(TEXSTATE_FILTER_LINEAR); + reg.magfilter = PVRX(TEXSTATE_FILTER_LINEAR); + } else { + assert(PVR_HAS_FEATURE(dev_info, tf_bicubic_filter)); + reg.minfilter = PVRX(TEXSTATE_FILTER_BICUBIC); + reg.magfilter = PVRX(TEXSTATE_FILTER_BICUBIC); + } + + reg.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + reg.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + + if (surface->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + reg.addrmode_w = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + } + + assert(sampler < PVR_TRANSFER_MAX_IMAGES); + + assert(sampler <= sh_reg_layout->combined_image_samplers.count); + mem_ptr += sh_reg_layout->combined_image_samplers.offsets[sampler].sampler; + + memcpy(mem_ptr, sampler_state, sizeof(sampler_state)); + + return VK_SUCCESS; +} + +static inline VkResult pvr_image_state_set_codegen_defaults( + struct pvr_device *device, + struct pvr_transfer_3d_state *state, + const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + uint64_t *mem_ptr) +{ + struct pvr_tq_layer_properties *layer = &state->shader_props.layer_props; + struct pvr_texture_state_info info = { 0U }; + VkResult result; + + switch (surface->vk_format) { + /* ERN 46863 */ + case VK_FORMAT_D32_SFLOAT_S8_UINT: + switch (layer->pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW64: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + info.format = VK_FORMAT_R32G32_UINT; + break; + default: + break; + } + break; + + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + info.format = VK_FORMAT_R32_UINT; + break; + + default: + info.format = surface->vk_format; + break; + } + + info.flags = 0U; + info.base_level = 0U; + info.mip_levels = 1U; + info.mipmaps_present = false; + info.sample_count = MAX2(surface->sample_count, 1U); + + if (surface->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + info.extent.depth = surface->depth; + else + info.extent.depth = 0U; + + if (PVR_HAS_FEATURE(&device->pdevice->dev_info, tpu_array_textures)) + info.array_size = 
0U; + + result = pvr_mem_layout_spec(surface, + load, + true, + &info.extent.width, + &info.extent.height, + &info.stride, + &info.mem_layout, + &info.addr); + if (result != VK_SUCCESS) + return result; + + if (state->custom_mapping.texel_extend_dst > 1U) { + info.extent.width /= state->custom_mapping.texel_extend_dst; + info.stride /= state->custom_mapping.texel_extend_dst; + } + + info.tex_state_type = PVR_TEXTURE_STATE_SAMPLE; + memcpy(info.swizzle, + pvr_get_format_swizzle(info.format), + sizeof(info.swizzle)); + + if (info.extent.depth > 0U) + info.type = VK_IMAGE_VIEW_TYPE_3D; + else if (info.extent.height > 1U) + info.type = VK_IMAGE_VIEW_TYPE_2D; + else + info.type = VK_IMAGE_VIEW_TYPE_1D; + + result = pvr_pack_tex_state(device, &info, mem_ptr); + if (result != VK_SUCCESS) + return result; + + return VK_SUCCESS; +} + +static VkResult pvr_image_state_for_surface( + const struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state, + uint32_t uf_image, + uint32_t *mem_ptr) +{ + uint32_t tex_state[ROGUE_MAXIMUM_IMAGE_STATE_SIZE] = { 0U }; + VkResult result; + uint8_t offset; + + result = pvr_image_state_set_codegen_defaults(ctx->device, + state, + surface, + load, + (uint64_t *)tex_state); + if (result != VK_SUCCESS) + return result; + + assert(uf_image < PVR_TRANSFER_MAX_IMAGES); + + /* Offset of the shared registers containing the hardware image state. */ + assert(uf_image < sh_reg_layout->combined_image_samplers.count); + offset = sh_reg_layout->combined_image_samplers.offsets[uf_image].image; + + /* Copy the image state to the buffer which is loaded into the shared + * registers. + */ + memcpy(mem_ptr + offset, tex_state, sizeof(tex_state)); + + return VK_SUCCESS; +} + +/* Writes the texture state/sampler state into DMAed memory. 
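+ * The shader register layout supplies the shared-register slot of each
+ * combined image/sampler, so every state block lands at its own offset.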
*/ +static VkResult +pvr_sampler_image_state(struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state, + uint32_t *mem_ptr) +{ + if (!state->empty_dst) { + struct pvr_tq_layer_properties *layer = + &state->shader_props.layer_props; + uint32_t max_load = + pvr_pbe_pixel_num_loads(layer->pbe_format, + state->shader_props.alpha_type); + uint32_t uf_sampler = 0U; + uint32_t uf_image = 0U; + + for (uint32_t load = 0U; load < max_load; load++) { + const struct pvr_transfer_cmd_surface *surface; + enum pvr_filter filter; + VkResult result; + + switch (layer->pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8: + if (load > 0U) { + surface = &transfer_cmd->dst; + + if (state->shader_props.alpha_type != PVR_ALPHA_NONE) + filter = PVR_FILTER_POINT; + else + filter = transfer_cmd->filter; + } else { + surface = &transfer_cmd->src; + filter = state->filter; + } + break; + + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED: + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V: + surface = &transfer_cmd->src; + filter = state->filter; + break; + + default: + surface = &transfer_cmd->src; + filter = state->filter; + break; + } + + if (load < pvr_int_pbe_pixel_num_sampler_and_image_states( + layer->pbe_format, + state->shader_props.alpha_type)) { + const struct pvr_device_info *dev_info = + &transfer_cmd->cmd_buffer->device->pdevice->dev_info; + + result = pvr_sampler_state_for_surface(dev_info, + surface, + filter, + sh_reg_layout, + uf_sampler, + mem_ptr); + if (result != VK_SUCCESS) + return result; + + uf_sampler++; + + result = pvr_image_state_for_surface(ctx, + transfer_cmd, + surface, + load, + sh_reg_layout, + state, + uf_image, + mem_ptr); + if (result != VK_SUCCESS) + return result; + + uf_image++; + } + } + } + + return VK_SUCCESS; +} + +/* The returned offset is in dwords. */ +static inline uint32_t pvr_dynamic_const_reg_advance( + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state) +{ + const uint32_t offset = sh_reg_layout->dynamic_consts.offset; + + assert(state->dynamic_const_reg_ptr < sh_reg_layout->dynamic_consts.count); + + return offset + state->dynamic_const_reg_ptr++; +} + +static inline void +pvr_dma_global_alpha(const struct pvr_transfer_alpha *alpha, + struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) +{ + float global = (float)alpha->global / 255.0f; + + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = fui(global); +} + +/** Scales coefficients for sampling. (non normalized). 
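+ * For each axis the shader is given a scale and an offset such that
+ * u(x) = x * (src_w / dst_w) + (src_x0 * dst_w - src_w * dst_x0) / dst_w,
+ * which maps dst_x0 onto src_x0.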
*/ +static inline void +pvr_dma_texture_floats(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) + +{ + if (transfer_cmd->src_present) { + struct pvr_tq_layer_properties *layer = &state->shader_props.layer_props; + const struct pvr_rect_mapping *mapping = &transfer_cmd->mappings[0U]; + VkRect2D src_rect = mapping->src_rect; + VkRect2D dst_rect = mapping->dst_rect; + + switch (layer->layer_floats) { + case PVR_INT_COORD_SET_FLOATS_0: + break; + + case PVR_INT_COORD_SET_FLOATS_6: + case PVR_INT_COORD_SET_FLOATS_4: { + int32_t consts[2U] = { 0U, 0U }; + int32_t denom[2U] = { 0U, 0U }; + int32_t nums[2U] = { 0U, 0U }; + int32_t src_x, dst_x; + int32_t src_y, dst_y; + float offset = 0.0f; + float tmp; + + dst_x = dst_rect.extent.width; + dst_y = dst_rect.extent.height; + src_x = src_rect.extent.width; + src_y = src_rect.extent.height; + + nums[0U] = src_x; + denom[0U] = dst_x; + consts[0U] = src_rect.offset.x * dst_x - src_x * dst_rect.offset.x; + nums[1U] = src_y; + denom[1U] = dst_y; + consts[1U] = src_rect.offset.y * dst_y - src_y * dst_rect.offset.y; + + for (uint32_t i = 0U; i < 2U; i++) { + tmp = (float)(nums[i]) / (float)(denom[i]); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + + tmp = ((float)(consts[i]) + (i == 1U ? offset : 0.0f)) / + (float)(denom[i]); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + } + + if (layer->layer_floats == PVR_INT_COORD_SET_FLOATS_6) { + tmp = (float)MIN2(dst_rect.offset.x, dst_rect.offset.x + dst_x); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + + tmp = (float)MIN2(dst_rect.offset.y, dst_rect.offset.y + dst_y); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + } + break; + } + + default: + unreachable("Unknown COORD_SET_FLOATS."); + break; + } + } +} + +static bool pvr_int_pbe_pixel_requires_usc_filter( + const struct pvr_device_info *dev_info, + enum pvr_transfer_pbe_pixel_src pixel_format) +{ + switch (pixel_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + return true; + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + return !PVR_HAS_FEATURE(dev_info, pbe_filterable_f16); + default: + return false; + } +} + +/** + * Sets up the MSAA related bits in the operation + * + * TPU sample count is read directly from transfer_cmd in the TPU code. An MSAA + * src can be read from sample rate or instance rate shaders as long as the + * sample count is set on the TPU. If a layer is single sample we expect the + * same sample replicated in full rate shaders. If the layer is multi sample, + * instance rate shaders are used to emulate the filter or to select the + * specified sample. The sample number is static in the programs. 
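+ * E.g. selecting sample 2 of an 8X source (resolve_op ==
+ * PVR_RESOLVE_SAMPLE0 + 2) runs an instance-rate shader that fetches just
+ * that one sample.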
+ */ +static VkResult pvr_msaa_state(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + struct pvr_tq_shader_properties *shader_props = &state->shader_props; + struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + uint32_t src_sample_count = transfer_cmd->src.sample_count & ~1U; + uint32_t dst_sample_count = transfer_cmd->dst.sample_count & ~1U; + uint32_t bsample_count = 0U; + + shader_props->full_rate = false; + state->msaa_multiplier = 1U; + state->down_scale = false; + + /* clang-format off */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg); + /* clang-format on */ + + layer->sample_count = 1U; + layer->resolve_op = PVR_RESOLVE_BLEND; + + bsample_count |= src_sample_count | dst_sample_count; + + if (bsample_count > PVR_GET_FEATURE_VALUE(dev_info, max_multisample, 0U)) + return vk_error(transfer_cmd->cmd_buffer, VK_ERROR_FORMAT_NOT_SUPPORTED); + + /* Shouldn't get two distinct bits set (implies different sample counts). + * The reason being the rate at which the shader runs has to match. + */ + if ((bsample_count & (bsample_count - 1U)) != 0U) + return vk_error(transfer_cmd->cmd_buffer, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (src_sample_count == 0U && dst_sample_count == 0U) { + /* S -> S (no MSAA involved). */ + layer->msaa = false; + } else if (src_sample_count != 0U && dst_sample_count == 0U) { + /* M -> S (resolve). */ + layer->resolve_op = transfer_cmd->resolve_op; + + if ((uint32_t)layer->resolve_op >= + (src_sample_count + (uint32_t)PVR_RESOLVE_SAMPLE0)) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + layer->msaa = true; + + switch (layer->resolve_op) { + case PVR_RESOLVE_MIN: + case PVR_RESOLVE_MAX: + switch (transfer_cmd->src.vk_format) { + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_S8_UINT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + if (transfer_cmd->src.vk_format != transfer_cmd->dst.vk_format) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + break; + + default: + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + /* Instance rate. */ + layer->sample_count = src_sample_count; + state->shader_props.full_rate = false; + break; + + case PVR_RESOLVE_BLEND: + if (pvr_int_pbe_pixel_requires_usc_filter(dev_info, + layer->pbe_format)) { + /* Instance rate. */ + layer->sample_count = src_sample_count; + state->shader_props.full_rate = false; + } else { + /* Sample rate. */ + state->shader_props.full_rate = true; + state->msaa_multiplier = src_sample_count; + state->down_scale = true; + + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(src_sample_count); + } + } + break; + + default: + /* Shader doesn't have to know the number of samples. It's enough + * if the TPU knows, and the shader sets the right sno (given to the + * shader in resolve_op). 
+ */ + state->shader_props.full_rate = false; + break; + } + } else { + state->msaa_multiplier = dst_sample_count; + + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(dst_sample_count); + } + + if (src_sample_count == 0U && dst_sample_count != 0U) { + /* S -> M (replicate samples) */ + layer->msaa = false; + state->shader_props.full_rate = !state->shader_props.iterated; + } else { + /* M -> M (sample to sample) */ + layer->msaa = true; + state->shader_props.full_rate = true; + } + } + + return VK_SUCCESS; +} + +static bool pvr_requires_usc_linear_filter(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32_SFLOAT: + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + return true; + default: + return false; + } +} + +static inline bool +pvr_int_pbe_usc_linear_filter(enum pvr_transfer_pbe_pixel_src pbe_format, + bool sample, + bool msaa, + bool full_rate) +{ + if (sample || msaa || full_rate) + return false; + + switch (pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_S8D24: + case PVR_TRANSFER_PBE_PIXEL_SRC_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + return true; + default: + return false; + } +} + +static inline bool pvr_pick_component_needed( + const struct pvr_transfer_custom_mapping *custom_mapping) +{ + return custom_mapping->pass_count > 0U && + custom_mapping->texel_extend_dst > 1U && + custom_mapping->texel_extend_src <= 1U; +} + +/** Writes the shader related constants into the DMA space. */ +static void +pvr_write_usc_constants(const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *dma_space) +{ + const uint32_t reg = sh_reg_layout->driver_total; + const uint32_t consts_count = + sh_reg_layout->compiler_out.usc_constants.count; + + /* If not we likely need to write more consts. */ + assert(consts_count == sh_reg_layout->compiler_out_total); + + /* Append the usc consts after the driver allocated regs. */ + for (uint32_t i = 0U; i < consts_count; i++) + dma_space[reg + i] = sh_reg_layout->compiler_out.usc_constants.values[i]; +} + +static inline void +pvr_dma_texel_unwind(struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) + +{ + const uint32_t coord_sample_mask = + state->custom_mapping.texel_extend_dst - 1U; + + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + coord_sample_mask; + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + state->custom_mapping.texel_unwind_dst; +} + +/** Writes the Uniform/Texture state data segments + the UniTex code. */ +static inline VkResult +pvr_pds_unitex(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_pds_pixel_shader_sa_program *program, + struct pvr_transfer_prep_data *prep_data) +{ + struct pvr_pds_upload *unitex_code = + &ctx->pds_unitex_code[program->num_texture_dma_kicks] + [program->num_uniform_dma_kicks]; + struct pvr_transfer_3d_state *state = &prep_data->state; + struct pvr_bo *pvr_bo; + VkResult result; + + /* Uniform program is not used. 
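+    * Only the texture state DMA kick is generated here; the assert below
+    * guards that assumption.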
*/ + assert(program->num_uniform_dma_kicks == 0U); + + if (program->num_texture_dma_kicks == 0U) { + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + state->tex_state_data_offset = 0U; + state->uni_tex_code_offset = 0U; + + return VK_SUCCESS; + } + + pvr_pds_set_sizes_pixel_shader_sa_uniform_data(program, dev_info); + assert(program->data_size == 0U); + state->uniform_data_size = 0U; + + pvr_pds_set_sizes_pixel_shader_sa_texture_data(program, dev_info); + state->tex_state_data_size = + ALIGN_POT(program->data_size, + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE)); + + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.pds_heap, + state->tex_state_data_size << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + state->tex_state_data_offset = + pvr_bo->vma->dev_addr.addr - ctx->device->heaps.pds_heap->base_addr.addr; + + pvr_pds_generate_pixel_shader_sa_texture_state_data(program, + pvr_bo->bo->map, + dev_info); + + pvr_bo_cpu_unmap(transfer_cmd->cmd_buffer->device, pvr_bo); + + /* Save the dev_addr and size in the 3D state. */ + state->uni_tex_code_offset = unitex_code->code_offset; + state->pds_temps = program->temps_used; + + return VK_SUCCESS; +} + +/** Converts a float in range 0 to 1 to an N-bit fixed-point integer. */ +static uint32_t pvr_float_to_ufixed(float value, uint32_t bits) +{ + uint32_t max = (1U << bits) - 1U; + + /* NaN and Inf and overflow. */ + if (util_is_inf_or_nan(value) || value >= 1.0f) + return max; + else if (value < 0.0f) + return 0U; + + /* Normalise. */ + value = value * (float)max; + + /* Cast to double so that we can accurately represent the sum for N > 23. */ + return (uint32_t)floor((double)value + 0.5f); +} + +/** Converts a float in range -1 to 1 to a signed N-bit fixed-point integer. */ +static uint32_t pvr_float_to_sfixed(float value, uint32_t N) +{ + int32_t max = (1 << (N - 1)) - 1; + int32_t min = 0 - (1 << (N - 1)); + union fi x; + + /* NaN and Inf and overflow. */ + if (util_is_inf_or_nan(value) || value >= 1.0f) + return (uint32_t)max; + else if (value == 0.0f) + return 0U; + else if (value <= -1.0f) + return (uint32_t)min; + + /* Normalise. */ + value *= (float)max; + + /* Cast to double so that we can accurately represent the sum for N > 23. */ + if (value > 0.0f) + x.i = (int32_t)floor((double)value + 0.5f); + else + x.i = (int32_t)floor((double)value - 0.5f); + + return x.ui; +} + +/** Convert a value in IEEE single precision format to 16-bit floating point + * format. + */ +/* TODO: See if we can use _mesa_float_to_float16_rtz_slow() instead. */ +static uint16_t pvr_float_to_f16(float value, bool round_to_even) +{ + uint32_t input_value; + uint32_t exponent; + uint32_t mantissa; + uint16_t output; + + /* 0.0f can be exactly expressed in binary using IEEE float format. */ + if (value == 0.0f) + return 0U; + + if (value < 0U) { + output = 0x8000; + value = -value; + } else { + output = 0U; + } + + /* 2^16 * (2 - 1/1024) = highest f16 representable value. */ + value = MIN2(value, 131008); + input_value = fui(value); + + /* Extract the exponent and mantissa. */ + exponent = util_get_float32_exponent(value) + 15; + mantissa = input_value & ((1 << 23) - 1); + + /* If the exponent is outside the supported range then denormalise the + * mantissa. 
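+    * E.g. 2^-20 has biased f32 exponent 107, giving shift = -14 + 127 -
+    * 107 = 6; the implicit-one mantissa 1 << 23 becomes 1 << 17, which
+    * packs to the f16 denormal 16 * 2^-24 == 2^-20.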
+ */ + if ((int32_t)exponent <= 0) { + uint32_t shift; + + mantissa |= (1 << 23); + exponent = input_value >> 23; + shift = -14 + 127 - exponent; + + if (shift < 24) + mantissa >>= shift; + else + mantissa = 0; + } else { + output = (uint16_t)(output | ((exponent << 10) & 0x7C00)); + } + + output = (uint16_t)(output | (((mantissa >> 13) << 0) & 0x03FF)); + + if (round_to_even) { + /* Round to nearest even. */ + if ((((int)value) % 2 != 0) && (((1 << 13) - 1) & mantissa)) + output++; + } else { + /* Round to nearest. */ + if (mantissa & (1 << 12)) + output++; + } + + return output; +} + +static VkResult pvr_pack_clear_color(VkFormat format, + const union fi color[static 4], + uint32_t pkd_color[static 4]) +{ + const uint32_t red_width = + vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0U); + uint32_t pbe_pack_mode = pvr_get_pbe_packmode(format); + const bool pbe_norm = vk_format_is_normalized(format); + + if (pbe_pack_mode == PVRX(PBESTATE_PACKMODE_INVALID)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || + format == VK_FORMAT_A2R10G10B10_UINT_PACK32) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_R10B10G10A2); + } else if (format == VK_FORMAT_B8G8R8A8_UNORM || + format == VK_FORMAT_R8G8B8A8_UNORM) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_U8U8U8U8); + } else if (format != VK_FORMAT_D16_UNORM && + format != VK_FORMAT_X8_D24_UNORM_PACK32 && red_width <= 8U && + vk_format_is_normalized(format)) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_F16F16F16F16); + } else if (vk_format_is_srgb(format)) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_F16F16F16F16); + } + + /* Set packed color based on PBE pack mode and PBE norm. */ + switch (pbe_pack_mode) { + case PVRX(PBESTATE_PACKMODE_U8U8U8U8): + case PVRX(PBESTATE_PACKMODE_A1R5G5B5): + case PVRX(PBESTATE_PACKMODE_R5G5B5A1): + case PVRX(PBESTATE_PACKMODE_A4R4G4B4): + case PVRX(PBESTATE_PACKMODE_A8R3G3B2): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_ufixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_S8S8S8S8): + case PVRX(PBESTATE_PACKMODE_X8U8S8S8): + case PVRX(PBESTATE_PACKMODE_X8S8S8U8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = pvr_float_to_ufixed(color[2].f, 16) & 0xFFFFU; + pkd_color[1] |= (pvr_float_to_ufixed(color[3].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + pkd_color[1] |= 
(color[3].ui & 0xFFFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S16S16S16S16):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU;
+         pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16;
+         pkd_color[1] = (pvr_float_to_sfixed(color[2].f, 16) & 0xFFFFU);
+         pkd_color[1] |= (pvr_float_to_sfixed(color[3].f, 16) & 0xFFFFU) << 16;
+      } else {
+         pkd_color[0] = color[0].ui & 0xFFFFU;
+         pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16;
+         pkd_color[1] = color[2].ui & 0xFFFFU;
+         pkd_color[1] |= (color[3].ui & 0xFFFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_A2_XRBIAS_U10U10U10):
+   case PVRX(PBESTATE_PACKMODE_ARGBV16_XR10):
+   case PVRX(PBESTATE_PACKMODE_F16F16F16F16):
+   case PVRX(PBESTATE_PACKMODE_A2R10B10G10):
+      if (red_width > 0) {
+         pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, false);
+         pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, false) << 16;
+         pkd_color[1] = (uint32_t)pvr_float_to_f16(color[2].f, false);
+         pkd_color[1] |= (uint32_t)pvr_float_to_f16(color[3].f, false) << 16;
+      } else {
+         /* Swizzle only uses first channel for alpha formats. */
+         pkd_color[0] = (uint32_t)pvr_float_to_f16(color[3].f, false);
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_U32U32U32U32):
+      pkd_color[0] = color[0].ui;
+      pkd_color[1] = color[1].ui;
+      pkd_color[2] = color[2].ui;
+      pkd_color[3] = color[3].ui;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S32S32S32S32):
+      pkd_color[0] = (uint32_t)color[0].i;
+      pkd_color[1] = (uint32_t)color[1].i;
+      pkd_color[2] = (uint32_t)color[2].i;
+      pkd_color[3] = (uint32_t)color[3].i;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_F32F32F32F32):
+      memcpy(pkd_color, &color[0].f, 4U * sizeof(float));
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_R10B10G10A2):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_ufixed(color[0].f, 10) & 0x3FFU;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 10) & 0x3FFU) << 10;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 10) & 0x3FFU) << 20;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[3].f, 2) & 0x3U) << 30;
+      } else if (format == VK_FORMAT_A2R10G10B10_UINT_PACK32) {
+         pkd_color[0] = color[2].ui & 0x3FFU;
+         pkd_color[0] |= (color[1].ui & 0x3FFU) << 10;
+         pkd_color[0] |= (color[0].ui & 0x3FFU) << 20;
+         pkd_color[0] |= (color[3].ui & 0x3U) << 30;
+      } else {
+         pkd_color[0] = color[0].ui & 0x3FFU;
+         pkd_color[0] |= (color[1].ui & 0x3FFU) << 10;
+         pkd_color[0] |= (color[2].ui & 0x3FFU) << 20;
+         pkd_color[0] |= (color[3].ui & 0x3U) << 30;
+      }
+
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_A2F10F10F10):
+   case PVRX(PBESTATE_PACKMODE_F10F10F10A2):
+      pkd_color[0] = pvr_float_to_sfixed(color[0].f, 10) & 0x3FFU;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 10) & 0x3FFU) << 10;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 10) & 0x3FFU) << 20;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 2) & 0x3U) << 30;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_U8U8U8):
+   case PVRX(PBESTATE_PACKMODE_R5G6B5):
+   case PVRX(PBESTATE_PACKMODE_R5SG5SB6):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 8) & 0xFFU) << 16;
+      } else {
+         pkd_color[0] = color[0].ui & 0xFFU;
+         pkd_color[0] |= (color[1].ui & 0xFFU) << 8;
+         pkd_color[0] |= (color[2].ui & 0xFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S8S8S8):
+   case PVRX(PBESTATE_PACKMODE_B6G5SR5S):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU;
+         pkd_color[0] |=
(pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = (pvr_float_to_ufixed(color[2].f, 16) & 0xFFFFU); + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + } + break; + + case PVRX(PBESTATE_PACKMODE_S16S16S16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = pvr_float_to_sfixed(color[2].f, 16) & 0xFFFFU; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + } + break; + + case PVRX(PBESTATE_PACKMODE_F16F16F16): + case PVRX(PBESTATE_PACKMODE_F11F11F10): + case PVRX(PBESTATE_PACKMODE_F10F11F11): + case PVRX(PBESTATE_PACKMODE_SE9995): + pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, true); + pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, true) << 16; + pkd_color[1] = (uint32_t)pvr_float_to_f16(color[2].f, true); + break; + + case PVRX(PBESTATE_PACKMODE_U32U32U32): + pkd_color[0] = color[0].ui; + pkd_color[1] = color[1].ui; + pkd_color[2] = color[2].ui; + break; + + case PVRX(PBESTATE_PACKMODE_S32S32S32): + pkd_color[0] = (uint32_t)color[0].i; + pkd_color[1] = (uint32_t)color[1].i; + pkd_color[2] = (uint32_t)color[2].i; + break; + + case PVRX(PBESTATE_PACKMODE_X24G8X32): + case PVRX(PBESTATE_PACKMODE_U8X24): + pkd_color[1] = (color[1].ui & 0xFFU) << 24; + break; + + case PVRX(PBESTATE_PACKMODE_F32F32F32): + memcpy(pkd_color, &color[0].f, 3U * sizeof(float)); + break; + + case PVRX(PBESTATE_PACKMODE_U8U8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + } + break; + + case PVRX(PBESTATE_PACKMODE_S8S8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_S16S16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_F16F16): + pkd_color[0] = 
(uint32_t)pvr_float_to_f16(color[0].f, true); + pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, true) << 16; + break; + + case PVRX(PBESTATE_PACKMODE_U32U32): + pkd_color[0] = color[0].ui; + pkd_color[1] = color[1].ui; + break; + + case PVRX(PBESTATE_PACKMODE_S32S32): + pkd_color[0] = (uint32_t)color[0].i; + pkd_color[1] = (uint32_t)color[1].i; + break; + + case PVRX(PBESTATE_PACKMODE_X24U8F32): + case PVRX(PBESTATE_PACKMODE_X24X8F32): + memcpy(pkd_color, &color[0].f, 1U * sizeof(float)); + pkd_color[1] = color[1].ui & 0xFFU; + break; + + case PVRX(PBESTATE_PACKMODE_F32F32): + memcpy(pkd_color, &color[0].f, 2U * sizeof(float)); + break; + + case PVRX(PBESTATE_PACKMODE_ST8U24): + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + pkd_color[0] |= color[1].ui << 24; + break; + + case PVRX(PBESTATE_PACKMODE_U8): + if (format == VK_FORMAT_S8_UINT) + pkd_color[0] = color[1].ui & 0xFFU; + else if (pbe_norm) + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + else + pkd_color[0] = color[0].ui & 0xFFU; + + break; + + case PVRX(PBESTATE_PACKMODE_S8): + if (pbe_norm) + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + else + pkd_color[0] = color[0].ui & 0xFFU; + break; + + case PVRX(PBESTATE_PACKMODE_U16): + if (pbe_norm) + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + else + pkd_color[0] = color[0].ui & 0xFFFFU; + break; + + case PVRX(PBESTATE_PACKMODE_S16): + if (pbe_norm) + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + else + pkd_color[0] = color[0].ui & 0xFFFFU; + break; + + case PVRX(PBESTATE_PACKMODE_F16): + pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, true); + break; + + /* U32 */ + case PVRX(PBESTATE_PACKMODE_U32): + if (format == VK_FORMAT_X8_D24_UNORM_PACK32) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + } else if (format == VK_FORMAT_D24_UNORM_S8_UINT) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui; + } + break; + + /* U24ST8 */ + case PVRX(PBESTATE_PACKMODE_U24ST8): + pkd_color[1] = (color[1].ui & 0xFFU) << 24; + pkd_color[1] |= pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + break; + + /* S32 */ + case PVRX(PBESTATE_PACKMODE_S32): + pkd_color[0] = (uint32_t)color[0].i; + break; + + /* F32 */ + case PVRX(PBESTATE_PACKMODE_F32): + memcpy(pkd_color, &color[0].f, sizeof(float)); + break; + + /* X8U24 */ + case PVRX(PBESTATE_PACKMODE_X8U24): + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + break; + + default: + break; + } + + return VK_SUCCESS; +} + +static VkResult pvr_3d_copy_blit_core(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + struct pvr_transfer_3d_state *const state = &prep_data->state; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + struct pvr_device *const device = ctx->device; + const struct pvr_device_info *const dev_info = &device->pdevice->dev_info; + + VkResult result; + + *finished_out = true; + + state->common_ptr = 0U; + state->dynamic_const_reg_ptr = 0U; + state->usc_const_reg_ptr = 0U; + + if ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U) { + uint32_t packed_color[4U] = { 0U }; + + if (vk_format_is_compressed(transfer_cmd->dst.vk_format)) + return vk_error(device, VK_ERROR_FORMAT_NOT_SUPPORTED); + + /* No shader. 
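+       * The packed clear colour goes out through the USC clear registers,
+       * so no fragment program or texture state is needed.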
*/ + state->pds_temps = 0U; + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + + /* No background enabled. */ + /* clang-format off */ + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg); + /* clang-format on */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(transfer_cmd->dst.sample_count); + } + + result = pvr_pack_clear_color(transfer_cmd->dst.vk_format, + transfer_cmd->clear_color, + packed_color); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->usc_clear_register0, CR_USC_CLEAR_REGISTER0, reg) { + reg.val = packed_color[0U]; + } + + pvr_csb_pack (®s->usc_clear_register1, CR_USC_CLEAR_REGISTER1, reg) { + reg.val = packed_color[1U]; + } + + pvr_csb_pack (®s->usc_clear_register2, CR_USC_CLEAR_REGISTER2, reg) { + reg.val = packed_color[2U]; + } + + pvr_csb_pack (®s->usc_clear_register3, CR_USC_CLEAR_REGISTER3, reg) { + reg.val = packed_color[3U]; + } + + state->msaa_multiplier = transfer_cmd->dst.sample_count & ~1U; + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + } else if (transfer_cmd->src_present) { + const struct pvr_tq_frag_sh_reg_layout nop_sh_reg_layout = { + /* TODO: Setting this to 1 so that we don't try to pvr_bo_alloc() with + * zero size. The device will ignore the PDS program if USC_SHAREDSIZE + * is zero and in the case of the nop shader we're expecting it to be + * zero. See if we can safely pass PVR_DEV_ADDR_INVALID for the unitex + * program. + */ + .driver_total = 1, + }; + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout; + struct pvr_pds_pixel_shader_sa_program unitex_prog = { 0U }; + uint32_t tex_state_dma_size_dw; + struct pvr_bo *pvr_bo; + uint32_t *dma_space; + + result = pvr_pbe_src_format(transfer_cmd, state, &state->shader_props); + if (result != VK_SUCCESS) + return result; + + pvr_uv_space(dev_info, transfer_cmd, state); + + state->shader_props.iterated = false; + + state->shader_props.layer_props.byte_unwind = 0U; + state->shader_props.layer_props.sample = + transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_3DTWIDDLED; + + result = pvr_msaa_state(dev_info, transfer_cmd, state); + if (result != VK_SUCCESS) + return result; + + state->shader_props.pick_component = + pvr_pick_component_needed(&state->custom_mapping); + state->shader_props.alpha_type = transfer_cmd->blit.alpha.type; + + if (state->shader_props.alpha_type != PVR_ALPHA_NONE && + (state->shader_props.layer_props.pbe_format != + PVR_TRANSFER_PBE_PIXEL_SRC_F16F16 && + state->shader_props.layer_props.pbe_format != + PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8)) { + return vk_error(device, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + if (state->filter == PVR_FILTER_LINEAR && + pvr_requires_usc_linear_filter(transfer_cmd->src.vk_format)) { + if (pvr_int_pbe_usc_linear_filter( + state->shader_props.layer_props.pbe_format, + state->shader_props.layer_props.sample, + state->shader_props.layer_props.msaa, + state->shader_props.full_rate)) { + state->shader_props.layer_props.linear = true; + } else { + mesa_logw("Transfer: F32 linear filter not supported."); + } + } + + if (state->empty_dst) { + sh_reg_layout = &nop_sh_reg_layout; + state->pds_shader_task_offset = device->nop_program.pds.data_offset; + } else { + pvr_dev_addr_t kick_usc_pds_dev_addr; + + result = + pvr_transfer_frag_store_get_shader_info(device, + &ctx->frag_store, + &state->shader_props, + &kick_usc_pds_dev_addr, + &sh_reg_layout); + if (result != VK_SUCCESS) + return result; + + assert(kick_usc_pds_dev_addr.addr 
<= UINT32_MAX); + state->pds_shader_task_offset = (uint32_t)kick_usc_pds_dev_addr.addr; + } + + unitex_prog.kick_usc = false; + unitex_prog.clear = false; + + tex_state_dma_size_dw = + sh_reg_layout->driver_total + sh_reg_layout->compiler_out_total; + + unitex_prog.num_texture_dma_kicks = 1U; + unitex_prog.num_uniform_dma_kicks = 0U; + + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + device->heaps.general_heap, + tex_state_dma_size_dw << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + dma_space = (uint32_t *)pvr_bo->bo->map; + + result = pvr_sampler_image_state(ctx, + transfer_cmd, + sh_reg_layout, + state, + dma_space); + if (result != VK_SUCCESS) + return result; + + if (state->shader_props.alpha_type == PVR_ALPHA_GLOBAL || + state->shader_props.alpha_type == + PVR_ALPHA_PREMUL_SOURCE_WITH_GLOBAL) { + pvr_dma_global_alpha(&transfer_cmd->blit.alpha, + state, + sh_reg_layout, + dma_space); + } + + pvr_dma_texture_floats(transfer_cmd, state, sh_reg_layout, dma_space); + + if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) { + dma_space[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(transfer_cmd->src.z_position); + } + + pvr_write_usc_constants(sh_reg_layout, dma_space); + + if (pvr_pick_component_needed(&state->custom_mapping)) + pvr_dma_texel_unwind(state, sh_reg_layout, dma_space); + + pvr_pds_encode_dma_burst(unitex_prog.texture_dma_control, + unitex_prog.texture_dma_address, + state->common_ptr, + tex_state_dma_size_dw, + pvr_bo->vma->dev_addr.addr, + true, + dev_info); + + state->common_ptr += tex_state_dma_size_dw; + + result = + pvr_pds_unitex(dev_info, ctx, transfer_cmd, &unitex_prog, prep_data); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg) { + reg.enablebgtag = true; + } + + /* clang-format off */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg); + /* clang-format on */ + } else { + /* No shader. */ + state->pds_temps = 0U; + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + + /* No background enabled. */ + /* clang-format off */ + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg); + /* clang-format on */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(transfer_cmd->dst.sample_count); + } + state->msaa_multiplier = transfer_cmd->dst.sample_count & ~1U; + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + + result = pvr_pbe_src_format(transfer_cmd, state, &state->shader_props); + if (result != VK_SUCCESS) + return result; + } + + pvr_setup_hwbg_object(dev_info, state); + + pvr_csb_pack (®s->isp_render, CR_ISP_RENDER, reg) { + reg.mode_type = PVRX(CR_ISP_RENDER_MODE_TYPE_FAST_SCALE); + pvr_finishme("Remove direction hardcoding."); + reg.dir_type = PVRX(CR_DIR_TYPE_TL2BR); + } + + /* Set up pixel event handling. 
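+    * The PBE state packed here is what writes the rendered tiles back to
+    * the destination surface at the end-of-tile event.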
+ */
+   result = pvr_pbe_setup(transfer_cmd, ctx, state);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = pvr_isp_tiles(device, state);
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (PVR_HAS_FEATURE(&device->pdevice->dev_info, gpu_multicore_support)) {
+      pvr_csb_pack (&regs->frag_screen, CR_FRAG_SCREEN, reg) {
+         reg.xmax = transfer_cmd->dst.width - 1;
+         reg.ymax = transfer_cmd->dst.height - 1;
+      }
+   }
+
+   if ((pass_idx + 1U) < state->custom_mapping.pass_count)
+      *finished_out = false;
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_pbe_src_format_f2d(uint32_t merge_flags,
+                       struct pvr_transfer_cmd_surface *src,
+                       VkFormat dst_format,
+                       bool down_scale,
+                       bool dont_force_pbe,
+                       enum pvr_transfer_pbe_pixel_src *pixel_format_out)
+{
+   VkFormat src_format = src->vk_format;
+
+   /* This has to come before the rest: S8, for instance, is integer, and
+    * the signedness check fails on D24S8.
+    */
+   if (vk_format_is_depth_or_stencil(src_format) ||
+       vk_format_is_depth_or_stencil(dst_format) ||
+       merge_flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) {
+      pvr_finishme("Complete pvr_pbe_src_format_f2d().");
+   }
+
+   return pvr_pbe_src_format_normal(src_format,
+                                    dst_format,
+                                    down_scale,
+                                    dont_force_pbe,
+                                    pixel_format_out);
+}
+
+/** Writes the coefficient loading PDS task. */
+static inline VkResult
+pvr_pds_coeff_task(struct pvr_transfer_ctx *ctx,
+                   const struct pvr_transfer_cmd *transfer_cmd,
+                   const bool sample_3d,
+                   struct pvr_transfer_prep_data *prep_data)
+{
+   struct pvr_transfer_3d_state *state = &prep_data->state;
+   struct pvr_pds_coeff_loading_program program = { 0U };
+   struct pvr_bo *pvr_bo;
+   VkResult result;
+
+   program.num_fpu_iterators = 1U;
+
+   pvr_csb_pack (&program.FPU_iterators[0U],
+                 PDSINST_DOUT_FIELDS_DOUTI_SRC,
+                 reg) {
+      if (sample_3d)
+         reg.size = PVRX(PDSINST_DOUTI_SIZE_3D);
+      else
+         reg.size = PVRX(PDSINST_DOUTI_SIZE_2D);
+
+      reg.perspective = false;
+
+      /* Varying wrap on the TSP means that the TSP chooses the shorter path
+       * out of the normal and the wrapping path, i.e. it chooses between
+       * u0->u1 and u1->1.0 == 0.0->u0. We don't need this behavior.
+       */
+      /* If RHW were ever needed, offset SRC_F32 to the first U in 16-bit
+       * units:
+       *    l0 U <= offs 0
+       *    l0 V
+       *    l1 U <= offs 4
+       *    ...
+ */ + reg.shademodel = PVRX(PDSINST_DOUTI_SHADEMODEL_GOURUAD); + reg.f32_offset = 0U; + } + + if (sample_3d) + state->usc_coeff_regs = 12U; + else + state->usc_coeff_regs = 8U; + + pvr_pds_set_sizes_coeff_loading(&program); + + result = + pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.pds_heap, + (program.data_size + program.code_size) << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + state->pds_coeff_task_offset = + pvr_bo->vma->dev_addr.addr - ctx->device->heaps.pds_heap->base_addr.addr; + + pvr_pds_generate_coeff_loading_program(&program, pvr_bo->bo->map); + + state->coeff_data_size = program.data_size; + state->pds_temps = program.temps_used; + + return VK_SUCCESS; +} + +#define X 0U +#define Y 1U +#define Z 2U + +static void pvr_tsp_floats(const struct pvr_device_info *dev_info, + VkRect2D *rect, + const float recips[3U], + bool custom_filter, + bool z_present, + float z_value, + struct pvr_transfer_3d_iteration *layer) +{ +#define U0 0U +#define U1 1U +#define V0 2U +#define V1 3U + + const uint32_t indices[8U] = { U0, V0, U0, V1, U1, V1, U1, V0 }; + float delta[2U] = { 0.0f, 0.0f }; + int32_t non_normalized[4U]; + uint32_t src_flipped[2U]; + uint32_t normalized[4U]; + int32_t src_span[2U]; + + non_normalized[U0] = rect->offset.x; + non_normalized[U1] = rect->offset.x + rect->extent.width; + non_normalized[V0] = rect->offset.y; + non_normalized[V1] = rect->offset.y + rect->extent.height; + + /* Filter adjust. */ + src_span[X] = rect->extent.width; + src_flipped[X] = src_span[X] > 0U ? 0U : 1U; + src_span[Y] = rect->extent.height; + src_flipped[Y] = src_span[Y] > 0U ? 0U : 1U; + /* + * | X | Y | srcFlipX | srcFlipY | + * +----+----+----------+----------| + * | X | Y | 0 | 0 | + * | -X | Y | 1 | 0 | + * | X | -Y | 0 | 1 | + * | -X | -Y | 1 | 1 | + */ + for (uint32_t i = X; i <= Y; i++) { + if (custom_filter) { + if (src_flipped[i] != 0U) + delta[i] += 0.25; + else + delta[i] -= 0.25; + } + } + + /* Normalize. */ + for (uint32_t i = 0U; i < ARRAY_SIZE(normalized); i++) { + uint32_t tmp; + float ftmp; + + ftmp = (float)non_normalized[i] + delta[i >> 1U]; + ftmp *= recips[i >> 1U]; + + tmp = fui(ftmp); + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + tmp = XXH_rotl32(tmp, 1U); + + normalized[i] = tmp; + } + + /* Apply indices. */ + for (uint32_t i = 0U; i < 8U; i++) + layer->texture_coords[i] = normalized[indices[i]]; + + if (z_present) { + uint32_t tmp = fui(z_value * recips[2U]); + + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + tmp = XXH_rotl32(tmp, 1U); + + for (uint32_t i = 8U; i < 12U; i++) + layer->texture_coords[i] = tmp; + } + +#undef U0 +#undef U1 +#undef V0 +#undef V1 +} + +static void +pvr_isp_prim_block_tsp_vertex_block(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *src, + struct pvr_rect_mapping *mappings, + bool custom_filter, + uint32_t num_mappings, + uint32_t mapping_offset, + enum pvr_filter filter, + uint32_t tsp_comp_format_in_dw, + uint32_t **const cs_ptr_out) +{ + struct pvr_transfer_3d_iteration layer; + uint32_t *cs_ptr = *cs_ptr_out; + + /* |<-32b->| + * +-------+----- + * | RHW | | X num_isp_vertices + * +-------+-- | + * | U | | | + * | V | | X PVR_TRANSFER_NUM_LAYERS + * +-------+----- + * + * RHW is not there any more in the Transfer. The comment still explains + * where it should go if ever needed. 
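+    * Each ISP vertex therefore carries one U/V pair per layer, plus an
+    * optional Z for 3D-twiddled sources.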
+ */ + for (uint32_t i = mapping_offset; i < mapping_offset + num_mappings; i++) { + bool z_present = src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED; + const float recips[3U] = { + [X] = 1.0f / (float)src->width, + [Y] = 1.0f / (float)src->height, + [Z] = z_present ? 1.0f / (float)src->depth : 0.0f, + }; + float z_pos = (filter < PVR_FILTER_LINEAR) ? floor(src->z_position + 0.5f) + : src->z_position; + + pvr_tsp_floats(dev_info, + &mappings[i].src_rect, + recips, + custom_filter, + z_present, + z_pos, + &layer); + + /* We request UVs from TSP for ISP triangle: + * 0 u 1 + * +---, + * v| /| + * | / | + * 2'/--'3 + */ + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[0U]; + *cs_ptr++ = layer.texture_coords[1U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[8U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[6U]; + *cs_ptr++ = layer.texture_coords[7U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[11U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[2U]; + *cs_ptr++ = layer.texture_coords[3U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[9U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[4U]; + *cs_ptr++ = layer.texture_coords[5U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[10U]; + *cs_ptr++ = 0U; + } + } + + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + /* Skipped optional primitive id. */ + for (uint32_t i = 0U; i < tsp_comp_format_in_dw; i++) + *cs_ptr++ = 0x88888888U; + } else { + /* Align back to 64 bits. */ + if (((uintptr_t)cs_ptr & 7U) != 0U) + cs_ptr++; + } + + *cs_ptr_out = cs_ptr; +} + +#undef X +#undef Y +#undef Z + +static void pvr_isp_prim_block_pds_state(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_3d_state *state, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SHADERBASE, shader_base) { + shader_base.addr = PVR_DEV_ADDR(state->pds_shader_task_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_TEXUNICODEBASE, tex_base) { + tex_base.addr = PVR_DEV_ADDR(state->uni_tex_code_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SIZEINFO1, info1) { + info1.pds_uniformsize = + state->uniform_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE); + + info1.pds_texturestatesize = + state->tex_state_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE); + + info1.pds_varyingsize = + state->coeff_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE); + + info1.usc_varyingsize = + ALIGN_POT(state->usc_coeff_regs, + PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE); + + info1.pds_tempsize = + ALIGN_POT(state->pds_temps, + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_VARYINGBASE, base) { + base.addr = PVR_DEV_ADDR(state->pds_coeff_task_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_TEXTUREDATABASE, base) { + base.addr = PVR_DEV_ADDR(state->tex_state_data_offset); + } + cs_ptr++; + + /* PDS uniform program not used. 
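+    * UNIFORMDATABASE below is therefore packed with a null device address.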
*/ + pvr_csb_pack (cs_ptr, TA_STATE_PDS_UNIFORMDATABASE, base) { + base.addr = PVR_DEV_ADDR(0U); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SIZEINFO2, info) { + info.usc_sharedsize = + ALIGN_POT(state->common_ptr, + PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE); + info.pds_tri_merge_disable = !PVR_HAS_ERN(dev_info, 42307); + info.pds_batchnum = 0U; + } + cs_ptr++; + + /* Get back to 64 bits boundary. */ + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + cs_ptr++; + + *cs_ptr_out = cs_ptr; +} + +static void pvr_isp_prim_block_isp_state(const struct pvr_device_info *dev_info, + UNUSED uint32_t tsp_comp_format_in_dw, + uint32_t tsp_data_size_in_bytes, + uint32_t num_isp_vertices, + bool read_bgnd, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) { + pvr_finishme("Unimplemented path."); + } else { + /* ISP state words. */ + + /* clang-format off */ + pvr_csb_pack (cs_ptr, TA_STATE_ISPCTL, ispctl); + /* clang-format on */ + cs_ptr += pvr_cmd_length(TA_STATE_ISPCTL); + + pvr_csb_pack (cs_ptr, TA_STATE_ISPA, ispa) { + ispa.objtype = PVRX(TA_OBJTYPE_TRIANGLE); + ispa.passtype = read_bgnd ? PVRX(TA_PASSTYPE_TRANSLUCENT) + : PVRX(TA_PASSTYPE_OPAQUE); + ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS); + ispa.dwritedisable = true; + } + cs_ptr += pvr_cmd_length(TA_STATE_ISPA); + + /* How many bytes the TSP compression format needs? */ + pvr_csb_pack (cs_ptr, IPF_COMPRESSION_SIZE_WORD, word) { + word.cs_isp_comp_table_size = 0U; + word.cs_tsp_comp_format_size = tsp_comp_format_in_dw; + word.cs_tsp_comp_table_size = 0U; + word.cs_tsp_comp_vertex_size = + tsp_data_size_in_bytes / num_isp_vertices; + } + cs_ptr += pvr_cmd_length(IPF_COMPRESSION_SIZE_WORD); + + /* ISP vertex compression. 
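+       * Every X/Y/Z component uses the uncompressed RAW_BYTE format.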
*/ + pvr_csb_pack (cs_ptr, IPF_ISP_COMPRESSION_WORD_0, word0) { + word0.cf_isp_comp_fmt_x0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_x1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_x2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_z0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_z1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + } + cs_ptr += pvr_cmd_length(IPF_ISP_COMPRESSION_WORD_0); + + pvr_csb_pack (cs_ptr, IPF_ISP_COMPRESSION_WORD_1, word1) { + word1.vf_prim_msaa = 0U; + word1.vf_prim_id_pres = 0U; + word1.vf_vertex_clipped = 0U; + word1.vf_vertex_total = num_isp_vertices - 1U; + word1.cf_isp_comp_fmt_z3 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word1.cf_isp_comp_fmt_z2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + } + cs_ptr += pvr_cmd_length(IPF_ISP_COMPRESSION_WORD_1); + } + + *cs_ptr_out = cs_ptr; +} + +static void +pvr_isp_prim_block_index_block(const struct pvr_device_info *dev_info, + uint32_t num_mappings, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + pvr_finishme("Unimplemented path."); + } else { + for (uint32_t i = 0U, j = 0U; i < num_mappings; i++, j += 4U) { + if ((i & 1U) == 0U) { + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j; + word.ix_index0_1 = j + 1U; + word.ix_index0_2 = j + 2U; + word.ix_index1_0 = j + 3U; + } + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + + /* Don't increment cs_ptr here. IPF_INDEX_DATA is patched in the + * else part and then cs_ptr is incremented. + */ + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j + 2U; + word.ix_index0_1 = j + 1U; + } + } else { + uint32_t tmp; + + pvr_csb_pack (&tmp, IPF_INDEX_DATA, word) { + word.ix_index0_2 = j; + word.ix_index1_0 = j + 1U; + } + *cs_ptr |= tmp; + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j + 2U; + word.ix_index0_1 = j + 3U; + word.ix_index0_2 = j + 2U; + word.ix_index1_0 = j + 1U; + } + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + } + } + + /* The last pass didn't ++. */ + if ((num_mappings & 1U) != 0U) + cs_ptr++; + } + + *cs_ptr_out = cs_ptr; +} + +/* Calculates a 24 bit fixed point (biased) representation of a signed integer. 
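+ * The bias keeps on-screen coordinates non-negative; the biased integer
+ * part must fit in 15 bits (see the 0x7fff8000U overflow check).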
+ */
+static inline VkResult
+pvr_int32_to_isp_xy_vtx(const struct pvr_device_info *dev_info,
+                        int32_t val,
+                        UNUSED bool bias,
+                        uint32_t *word_out)
+{
+   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
+      pvr_finishme("Unimplemented path.");
+      return VK_SUCCESS;
+   }
+
+   val += PVRX(IPF_ISP_VERTEX_XY_BIAS_VALUE);
+
+   if (((uint32_t)val & 0x7fff8000U) != 0U)
+      return vk_error(NULL, VK_ERROR_UNKNOWN);
+
+   pvr_csb_pack (word_out, IPF_ISP_VERTEX_XY, word) {
+      word.sign = val < 0;
+      word.integer = val;
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_isp_prim_block_isp_vertices(const struct pvr_device_info *dev_info,
+                                struct pvr_transfer_3d_state *state,
+                                struct pvr_rect_mapping *mappings,
+                                uint32_t num_mappings,
+                                uint32_t mapping_offset,
+                                uint32_t **const cs_ptr_out)
+{
+   uint32_t *cs_ptr = *cs_ptr_out;
+   bool bias = true;
+   uint32_t i;
+
+   if (PVR_HAS_FEATURE(dev_info, screen_size8K))
+      bias = state->width_in_tiles <= 256U && state->height_in_tiles <= 256U;
+
+   for (i = mapping_offset; i < mapping_offset + num_mappings; i++) {
+      uint32_t bottom = 0U;
+      uint32_t right = 0U;
+      uint32_t left = 0U;
+      uint32_t top = 0U;
+      VkResult result;
+
+      /* ISP vertex data (X, Y, Z). */
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.y,
+                                       bias,
+                                       &top);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.y +
+                                          mappings[i].dst_rect.extent.height,
+                                       bias,
+                                       &bottom);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.x,
+                                       bias,
+                                       &left);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.x +
+                                          mappings[i].dst_rect.extent.width,
+                                       bias,
+                                       &right);
+      if (result != VK_SUCCESS)
+         return result;
+
+      if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
+         pvr_finishme("Unimplemented path.");
+      } else {
+         /* ISP vertices 0 and 1. */
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_0, word0) {
+            word0.x0 = left;
+            word0.y0 = top & 0xFF;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_1, word1) {
+            word1.y0 = top >> PVRX(IPF_ISP_VERTEX_WORD_1_Y0_SHIFT);
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_2, word2) {
+            word2.x1 = right & 0xFFFF;
+            word2.z0 = 0U;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_3, word3) {
+            word3.x1 = right >> PVRX(IPF_ISP_VERTEX_WORD_3_X1_SHIFT);
+            word3.y1 = top;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_4, word4) {
+            word4.z1 = 0U;
+         }
+         cs_ptr++;
+
+         /* ISP vertices 2 and 3.
*/ + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_0, word0) { + word0.x0 = left; + word0.y0 = bottom & 0xFF; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_1, word1) { + word1.y0 = bottom >> PVRX(IPF_ISP_VERTEX_WORD_1_Y0_SHIFT); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_2, word2) { + word2.x1 = right & 0xFFFF; + word2.z0 = 0U; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_3, word3) { + word3.x1 = right >> PVRX(IPF_ISP_VERTEX_WORD_3_X1_SHIFT); + word3.y1 = bottom; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_4, word4) { + word4.z1 = 0U; + } + cs_ptr++; + } + } + *cs_ptr_out = cs_ptr; + + return VK_SUCCESS; +} + +static uint32_t +pvr_isp_primitive_block_size(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *src, + uint32_t num_mappings) +{ + uint32_t num_isp_vertices = num_mappings * 4U; + uint32_t num_tsp_vertices_per_isp_vertex; + uint32_t isp_vertex_data_size_dw; + bool color_fill = (src == NULL); + uint32_t tsp_comp_format_dw; + uint32_t isp_state_size_dw; + uint32_t pds_state_size_dw; + uint32_t idx_data_size_dw; + uint32_t tsp_data_size; + uint32_t stream_size; + + if (color_fill) { + num_tsp_vertices_per_isp_vertex = 0U; + } else { + num_tsp_vertices_per_isp_vertex = + src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED ? 4U : 2U; + } + + tsp_data_size = num_isp_vertices * PVR_TRANSFER_NUM_LAYERS * 4U * + num_tsp_vertices_per_isp_vertex; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + /* An XYZ vertex is 16/16/32 bits => 8 bytes. */ + isp_vertex_data_size_dw = num_isp_vertices * 2U; + + /* Round to even for 64 bit boundary. */ + idx_data_size_dw = ALIGN_POT(num_mappings, 2U); + tsp_comp_format_dw = 0U; + isp_state_size_dw = 4U; + pds_state_size_dw = 8U; + } else { + tsp_comp_format_dw = color_fill ? 0U : PVR_TRANSFER_NUM_LAYERS; + + if (!color_fill) { + if (src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + tsp_comp_format_dw *= 2U; + } + + /* An XYZ vertex is 24/24/32 bits => 10 bytes with last padded to 4 byte + * burst align. + */ + isp_vertex_data_size_dw = DIV_ROUND_UP(num_isp_vertices * 10U, 4U); + + /* 4 triangles fit in 3 dw: t0t0t0t1_t1t1t2t2_t2t3t3t3. */ + idx_data_size_dw = num_mappings + DIV_ROUND_UP(num_mappings, 2U); + isp_state_size_dw = 5U; + pds_state_size_dw = 7U; + } + + stream_size = tsp_data_size + (idx_data_size_dw + tsp_comp_format_dw + + isp_vertex_data_size_dw + isp_state_size_dw + + pds_state_size_dw) * + 4U; + + return stream_size; +} + +static VkResult +pvr_isp_primitive_block(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + const struct pvr_transfer_cmd_surface *src, + bool custom_filter, + struct pvr_rect_mapping *mappings, + uint32_t num_mappings, + uint32_t mapping_offset, + bool read_bgnd, + uint32_t *cs_start_offset, + uint32_t **cs_ptr_out) +{ + struct pvr_transfer_3d_state *state = &prep_data->state; + uint32_t num_isp_vertices = num_mappings * 4U; + uint32_t num_tsp_vertices_per_isp_vert; + uint32_t tsp_data_size_in_bytes; + uint32_t tsp_comp_format_in_dw; + bool color_fill = src == NULL; + uint32_t stream_size_in_bytes; + uint32_t *cs_ptr_start; + VkResult result; + + if (color_fill) { + num_tsp_vertices_per_isp_vert = 0U; + } else { + num_tsp_vertices_per_isp_vert = + src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED ? 
4U : 2U; + } + + tsp_data_size_in_bytes = num_isp_vertices * PVR_TRANSFER_NUM_LAYERS * 4U * + num_tsp_vertices_per_isp_vert; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + tsp_comp_format_in_dw = 0U; + } else { + tsp_comp_format_in_dw = color_fill ? 0U : PVR_TRANSFER_NUM_LAYERS; + + if (!color_fill && src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + tsp_comp_format_in_dw *= 2U; + } + + stream_size_in_bytes = + pvr_isp_primitive_block_size(dev_info, src, num_mappings); + + cs_ptr_start = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + pvr_finishme("Unimplemented path."); + } else { + if (!color_fill) { + /* This includes: + * Compressed TSP vertex data & tables. + * Primitive id. + * TSP compression formats. + */ + pvr_isp_prim_block_tsp_vertex_block(dev_info, + src, + mappings, + custom_filter, + num_mappings, + mapping_offset, + transfer_cmd->filter, + tsp_comp_format_in_dw, + cs_ptr_out); + } + + pvr_isp_prim_block_pds_state(dev_info, ctx, state, cs_ptr_out); + + /* Point the CS_PRIM_BASE here. */ + *cs_start_offset = (*cs_ptr_out - cs_ptr_start) * sizeof(uint32_t); + + /* This includes: + * ISP state words. + * Compression size word. + * ISP compression and vertex formats. + */ + pvr_isp_prim_block_isp_state(dev_info, + tsp_comp_format_in_dw, + tsp_data_size_in_bytes, + num_isp_vertices, + read_bgnd, + cs_ptr_out); + + pvr_isp_prim_block_index_block(dev_info, num_mappings, cs_ptr_out); + + result = pvr_isp_prim_block_isp_vertices(dev_info, + state, + mappings, + num_mappings, + mapping_offset, + cs_ptr_out); + if (result != VK_SUCCESS) + return result; + } + + assert((*cs_ptr_out - cs_ptr_start) * sizeof(uint32_t) == + stream_size_in_bytes); + + return VK_SUCCESS; +} + +static inline uint32_t +pvr_transfer_prim_blocks_per_alloc(const struct pvr_device_info *dev_info) +{ + uint32_t ret = PVRX(IPF_CONTROL_STREAM_SIZE_DWORDS) * sizeof(uint32_t); + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + return ret / sizeof(uint64_t) / 2U; + + return ret / sizeof(uint32_t) / 2U - 1U; +} + +static inline uint32_t +pvr_transfer_max_quads_per_pb(const struct pvr_device_info *dev_info) +{ + return PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) ? 4U + : 16U; +} + +/** + * Writes ISP ctrl stream. + * + * We change sampler/texture state when we process a new TQ source. The + * primitive block contains the shader pointers, but we supply the primitive + * blocks with shaders from here. 
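 + *
 + * Rough layout of the stream built below: one or more region arrays at the
 + * start of the allocation, followed by the primitive blocks they point at,
 + * with the control stream terminated by an IPF_CS_TYPE_TERM word.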
+ */ +static VkResult pvr_isp_ctrl_stream(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data) +{ + const uint32_t max_mappings_per_pb = pvr_transfer_max_quads_per_pb(dev_info); + struct pvr_transfer_3d_state *const state = &prep_data->state; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + struct pvr_transfer_pass *pass = NULL; + uint32_t flags = transfer_cmd->flags; + uint32_t num_prim_blks = 0U; + uint32_t prim_blk_size = 0U; + uint32_t region_arrays_size; + uint32_t num_region_arrays; + uint32_t total_stream_size; + struct pvr_bo *pvr_cs_bo; + uint32_t rem_mappings; + uint32_t *blk_cs_ptr; + uint32_t *cs_ptr; + VkResult result; + + if (state->custom_mapping.pass_count > 0U) { + uint32_t num_mappings; + + pass = &state->custom_mapping.passes[state->pass_idx]; + num_mappings = pass->mapping_count; + + while (num_mappings > 0U) { + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + NULL, + MIN2(max_mappings_per_pb, num_mappings)); + } + + if (transfer_cmd->src_present) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + &transfer_cmd->src, + MIN2(max_mappings_per_pb, num_mappings)); + } + + num_mappings -= MIN2(max_mappings_per_pb, num_mappings); + num_prim_blks++; + } + } else if ((flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U) { + num_prim_blks = 1U; + prim_blk_size += + pvr_isp_primitive_block_size(dev_info, + NULL, + MIN2(max_mappings_per_pb, 1U)); + + if (transfer_cmd->src_present) { + uint32_t num_mappings = transfer_cmd->mapping_count; + + while (num_mappings > 0U) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + &transfer_cmd->src, + MIN2(max_mappings_per_pb, num_mappings)); + + num_mappings -= MIN2(max_mappings_per_pb, num_mappings); + num_prim_blks++; + } + } + } else { + pvr_finishme("Unimplemented path."); + } + + num_region_arrays = + (num_prim_blks + (pvr_transfer_prim_blocks_per_alloc(dev_info) - 1U)) / + pvr_transfer_prim_blocks_per_alloc(dev_info); + region_arrays_size = PVRX(IPF_CONTROL_STREAM_SIZE_DWORDS) * + sizeof(uint32_t) * num_region_arrays; + total_stream_size = region_arrays_size + prim_blk_size; + + /* Allocate space for IPF control stream. */ + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.transfer_3d_heap, + total_stream_size, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_cs_bo); + if (result != VK_SUCCESS) + return result; + + cs_ptr = pvr_cs_bo->bo->map; + blk_cs_ptr = cs_ptr + region_arrays_size / sizeof(uint32_t); + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) + rem_mappings = pass ? 
pass->mapping_count : 1U; + else + rem_mappings = transfer_cmd->mapping_count; + + if ((transfer_cmd->src_present || flags & PVR_TRANSFER_CMD_FLAGS_FILL) && + rem_mappings != 0U) { + struct pvr_pds_pixel_shader_sa_program unitex_pds_prog = { 0U }; + struct pvr_transfer_cmd_surface *src = &transfer_cmd->src; + struct pvr_rect_mapping fill_mapping; + uint32_t mapping_offset = 0U; + bool read_bgnd = false; + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) { + uint32_t packed_color[4U] = { 0U }; + + if (vk_format_is_compressed(transfer_cmd->dst.vk_format)) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + state->common_ptr = 0U; + + result = pvr_pack_clear_color(transfer_cmd->dst.vk_format, + transfer_cmd->clear_color, + packed_color); + if (result != VK_SUCCESS) + return result; + + fill_mapping.dst_rect = transfer_cmd->scissor; + + pvr_csb_pack (&regs->usc_clear_register0, + CR_USC_CLEAR_REGISTER0, + reg) { + reg.val = packed_color[0U]; + } + + pvr_csb_pack (&regs->usc_clear_register1, + CR_USC_CLEAR_REGISTER1, + reg) { + reg.val = packed_color[1U]; + } + + pvr_csb_pack (&regs->usc_clear_register2, + CR_USC_CLEAR_REGISTER2, + reg) { + reg.val = packed_color[2U]; + } + + pvr_csb_pack (&regs->usc_clear_register3, + CR_USC_CLEAR_REGISTER3, + reg) { + reg.val = packed_color[3U]; + } + + state->pds_shader_task_offset = + transfer_cmd->cmd_buffer->device->nop_program.pds.data_offset; + + unitex_pds_prog.kick_usc = false; + unitex_pds_prog.clear = false; + } else { + const bool down_scale = + transfer_cmd->resolve_op == PVR_RESOLVE_BLEND && + src->sample_count > 1U && transfer_cmd->dst.sample_count <= 1U; + struct pvr_tq_shader_properties *shader_props = + &state->shader_props; + struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout; + enum pvr_transfer_pbe_pixel_src pbe_src_format; + uint32_t tex_state_dma_size; + pvr_dev_addr_t dev_offset; + struct pvr_bo *pvr_bo; + + /* Reset the shared register bank ptrs; each src implies new texture + * state. (Note that we don't change texture state per prim block.) + */ + state->common_ptr = 0U; + state->usc_const_reg_ptr = 0U; + /* We don't use state->dynamic_const_reg_ptr here.
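 + * (The transfer frag store currently reserves no dynamic constant
 + * registers; see pvr_transfer_frag_store_entry_data_compile.)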
*/ + + if (flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) + read_bgnd = true; + + result = pvr_pbe_src_format_f2d(flags, + src, + transfer_cmd->dst.vk_format, + down_scale, + state->dont_force_pbe, + &pbe_src_format); + if (result != VK_SUCCESS) + return result; + + memset(shader_props, 0U, sizeof(*shader_props)); + + if (state->custom_mapping.byte_unwind_src > 0U && + state->custom_mapping.passes[0U].byte_unwind) { + pvr_finishme("Unimplemented path."); + } + + layer->pbe_format = pbe_src_format; + layer->sample = (src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED); + shader_props->iterated = true; + + shader_props->pick_component = + pvr_pick_component_needed(&state->custom_mapping); + + result = pvr_msaa_state(dev_info, transfer_cmd, state); + if (result != VK_SUCCESS) + return result; + + if (state->filter == PVR_FILTER_LINEAR && + pvr_requires_usc_linear_filter(src->vk_format)) { + if (pvr_int_pbe_usc_linear_filter(layer->pbe_format, + layer->sample, + layer->msaa, + shader_props->full_rate)) { + layer->linear = true; + } else { + mesa_logw("Transfer: F32 linear filter not supported."); + } + } + + result = pvr_transfer_frag_store_get_shader_info( + transfer_cmd->cmd_buffer->device, + &ctx->frag_store, + shader_props, + &dev_offset, + &sh_reg_layout); + if (result != VK_SUCCESS) + return result; + + assert(dev_offset.addr <= UINT32_MAX); + prep_data->state.pds_shader_task_offset = (uint32_t)dev_offset.addr; + + result = + pvr_pds_coeff_task(ctx, transfer_cmd, layer->sample, prep_data); + if (result != VK_SUCCESS) + return result; + + unitex_pds_prog.kick_usc = false; + unitex_pds_prog.clear = false; + + tex_state_dma_size = + sh_reg_layout->driver_total + sh_reg_layout->compiler_out_total; + + unitex_pds_prog.num_texture_dma_kicks = 1U; + unitex_pds_prog.num_uniform_dma_kicks = 0U; + + /* Allocate memory for DMA. */ + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.general_heap, + tex_state_dma_size << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + result = pvr_sampler_state_for_surface(dev_info, + &transfer_cmd->src, + state->filter, + sh_reg_layout, + 0U, + pvr_bo->bo->map); + if (result != VK_SUCCESS) + return result; + + result = pvr_image_state_for_surface(ctx, + transfer_cmd, + &transfer_cmd->src, + 0U, + sh_reg_layout, + state, + 0U, + pvr_bo->bo->map); + if (result != VK_SUCCESS) + return result; + + pvr_pds_encode_dma_burst(unitex_pds_prog.texture_dma_control, + unitex_pds_prog.texture_dma_address, + state->common_ptr, + tex_state_dma_size, + pvr_bo->vma->dev_addr.addr, + true, + dev_info); + + state->common_ptr += tex_state_dma_size; + + pvr_write_usc_constants(sh_reg_layout, pvr_bo->bo->map); + + if (pvr_pick_component_needed(&state->custom_mapping)) { + pvr_dma_texel_unwind(state, sh_reg_layout, pvr_bo->bo->map); + } + } + + result = pvr_pds_unitex(dev_info, + ctx, + transfer_cmd, + &unitex_pds_prog, + prep_data); + if (result != VK_SUCCESS) + return result; + + while (rem_mappings > 0U) { + const uint64_t transfer_heap_base = + transfer_cmd->cmd_buffer->device->heaps.transfer_3d_heap + ->base_addr.addr; + const uint32_t num_mappings = MIN2(max_mappings_per_pb, rem_mappings); + struct pvr_rect_mapping *mappings = NULL; + uint32_t stream_start_offset = 0U; + pvr_dev_addr_t prim_blk_addr; + + if (PVR_HAS_FEATURE(dev_info, ipf_creq_pf)) + pvr_finishme("Unimplemented path."); + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) + mappings = pass ? 
pass->mappings : &fill_mapping; + else + mappings = transfer_cmd->mappings; + + prim_blk_addr = + PVR_DEV_ADDR(pvr_cs_bo->vma->dev_addr.addr - transfer_heap_base); + prim_blk_addr.addr += + (uintptr_t)blk_cs_ptr - (uintptr_t)pvr_cs_bo->bo->map; + + result = pvr_isp_primitive_block( + dev_info, + ctx, + transfer_cmd, + prep_data, + flags & PVR_TRANSFER_CMD_FLAGS_FILL ? NULL : src, + state->custom_filter, + mappings, + num_mappings, + mapping_offset, + read_bgnd, + &stream_start_offset, + &blk_cs_ptr); + if (result != VK_SUCCESS) + return result; + + prim_blk_addr.addr += stream_start_offset; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) { + pvr_finishme("Unimplemented path."); + } else { + pvr_csb_pack (cs_ptr, IPF_PRIMITIVE_FORMAT, word) { + word.cs_type = PVRX(IPF_CS_TYPE_PRIM); + word.cs_isp_state_read = true; + word.cs_isp_state_size = 2U; + word.cs_prim_total = 2U * num_mappings - 1U; + word.cs_mask_fmt = PVRX(IPF_CS_MASK_FMT_FULL); + word.cs_prim_base_pres = true; + } + cs_ptr += pvr_cmd_length(IPF_PRIMITIVE_FORMAT); + + pvr_csb_pack (cs_ptr, IPF_PRIMITIVE_BASE, word) { + word.cs_prim_base = prim_blk_addr; + } + cs_ptr += pvr_cmd_length(IPF_PRIMITIVE_BASE); + } + + rem_mappings -= num_mappings; + mapping_offset += num_mappings; + } + } + + if (PVR_HAS_FEATURE(dev_info, ipf_creq_pf)) + pvr_finishme("Unimplemented path."); + + pvr_csb_pack (cs_ptr, IPF_CONTROL_STREAM, word) { + word.cs_type = PVRX(IPF_CS_TYPE_TERM); + } + cs_ptr += pvr_cmd_length(IPF_CONTROL_STREAM); + + pvr_csb_pack (&regs->isp_mtile_base, CR_ISP_MTILE_BASE, reg) { + reg.addr = + PVR_DEV_ADDR(pvr_cs_bo->vma->dev_addr.addr - + ctx->device->heaps.transfer_3d_heap->base_addr.addr); + } + + pvr_csb_pack (&regs->isp_render, CR_ISP_RENDER, reg) { + reg.mode_type = PVRX(CR_ISP_RENDER_MODE_TYPE_FAST_2D); + } + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) + pvr_finishme("Unimplemented path."); + else + regs->isp_rgn = 0UL; + + return VK_SUCCESS; +} + +static void pvr_transfer_set_filter(struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + VkRect2D *src = &transfer_cmd->mappings[0U].src_rect; + VkRect2D *dst = &transfer_cmd->mappings[0U].dst_rect; + + if (!transfer_cmd->src_present) + return; + + /* If no scaling is applied to the copy region, we can use point + * filtering. + */ + if (!state->custom_filter && (src->extent.width == dst->extent.width) && + (src->extent.height == dst->extent.height)) + state->filter = PVR_FILTER_POINT; + else + state->filter = transfer_cmd->filter; +} + +/** Generates hw resources to kick a 3D clip blit. */ +static VkResult pvr_3d_clip_blit(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + struct pvr_transfer_3d_state *state = &prep_data->state; + uint32_t texel_unwind_src = state->custom_mapping.texel_unwind_src; + struct pvr_transfer_cmd bg_cmd = { 0U }; + uint32_t control_reg; + VkResult result; + + state->dont_force_pbe = false; + bg_cmd.scissor = transfer_cmd->scissor; + bg_cmd.cmd_buffer = transfer_cmd->cmd_buffer; + bg_cmd.flags = transfer_cmd->flags; + bg_cmd.flags &= + ~(PVR_TRANSFER_CMD_FLAGS_FAST2D | PVR_TRANSFER_CMD_FLAGS_FILL | + PVR_TRANSFER_CMD_FLAGS_DSMERGE | PVR_TRANSFER_CMD_FLAGS_PICKD); + + bg_cmd.src_present = state->custom_mapping.pass_count > 0U ?
false : true; + if (bg_cmd.src_present) { + bg_cmd.mappings[0U].src_rect = transfer_cmd->scissor; + bg_cmd.mappings[0U].dst_rect = transfer_cmd->scissor; + bg_cmd.resolve_op = PVR_RESOLVE_BLEND; + bg_cmd.src = transfer_cmd->dst; + } + + state->filter = PVR_FILTER_DONTCARE; + bg_cmd.dst = transfer_cmd->dst; + state->custom_mapping.texel_unwind_src = + state->custom_mapping.texel_unwind_dst; + + result = + pvr_3d_copy_blit_core(ctx, &bg_cmd, prep_data, pass_idx, finished_out); + if (result != VK_SUCCESS) + return result; + + /* If the destination has 4 channels and the source has at most 2, we still + * need all 4 channels from the USC into the PBE. + */ + state->dont_force_pbe = true; + state->custom_mapping.texel_unwind_src = texel_unwind_src; + + /* We need the viewport mask, otherwise all pixels would be disabled. */ + pvr_csb_pack (&control_reg, CR_ISP_BGOBJVALS, reg) { + reg.mask = true; + } + state->regs.isp_bgobjvals |= control_reg; + + pvr_transfer_set_filter(transfer_cmd, state); + result = pvr_isp_ctrl_stream(&ctx->device->pdevice->dev_info, + ctx, + transfer_cmd, + prep_data); + if (result != VK_SUCCESS) + return result; + + /* In case of resolve M -> S, the accumulation is read from and written to a + * single sampled surface. Make sure that we are resolving and we have the + * right number of tiles. + */ + if (state->down_scale) { + uint64_t tmp; + + pvr_csb_pack (&tmp, CR_PBE_WORD0_MRT0, reg) { + reg.downscale = true; + } + state->regs.pbe_wordx_mrty[0U] |= tmp; + + result = pvr_isp_tiles(ctx->device, state); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; +} + +static bool pvr_texel_unwind(uint32_t bpp, + pvr_dev_addr_t dev_addr, + bool is_input, + uint32_t texel_extend, + uint32_t *texel_unwind_out) +{ + uint32_t texel_unwind = 0U; + + for (uint32_t i = 0U; i < 16U; i++) { + if (pvr_is_surface_aligned(dev_addr, is_input, bpp)) { + break; + } else { + if (i == 15U) { + return false; + } else { + dev_addr.addr -= (bpp / texel_extend) / 8U; + texel_unwind++; + } + } + } + + *texel_unwind_out = texel_unwind; + + return true; +} + +static bool pvr_byte_unwind(uint32_t bpp, + pvr_dev_addr_t dev_addr, + bool is_input, + uint32_t *byte_unwind_out) +{ + uint32_t byte_unwind = 0U; + + for (uint32_t i = 0U; i < 16U; i++) { + if (pvr_is_surface_aligned(dev_addr, is_input, bpp)) { + break; + } else { + if (i == 15U) { + return false; + } else { + dev_addr.addr -= 1U; + byte_unwind++; + } + } + } + + *byte_unwind_out = byte_unwind; + + return true; +} + +static bool pvr_is_identity_mapping(const struct pvr_rect_mapping *mapping) +{ + return (mapping->src_rect.offset.x == mapping->dst_rect.offset.x && + mapping->src_rect.offset.y == mapping->dst_rect.offset.y && + mapping->src_rect.extent.width == mapping->dst_rect.extent.width && + mapping->src_rect.extent.height == mapping->dst_rect.extent.height); +} + +static inline bool pvr_is_pbe_stride_aligned(const uint32_t stride) +{ + if (stride == 1U) + return true; + + return ((stride & (PVRX(PBESTATE_REG_WORD0_LINESTRIDE_UNIT_SIZE) - 1U)) == + 0x0U); +} + +static struct pvr_transfer_pass * +pvr_create_pass(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t dst_offset, + uint32_t src_offset, + bool extend_height) +{ + struct pvr_transfer_pass *pass; + + assert(custom_mapping->pass_count < PVR_TRANSFER_MAX_PASSES); + + pass = &custom_mapping->passes[custom_mapping->pass_count]; + pass->clip_rects_count = 0U; + pass->dst_offset = dst_offset; + pass->src_offset = src_offset; + pass->mapping_count = 0U; + 
pass->extend_height = extend_height; + + custom_mapping->pass_count++; + + return pass; +} + +/* Acquire a pass with the given offset. If one doesn't exist, create a new one. */ +static struct pvr_transfer_pass * +pvr_acquire_pass(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t dst_offset, + uint32_t src_offset, + bool extend_height) +{ + for (uint32_t i = 0U; i < custom_mapping->pass_count; i++) { + if (custom_mapping->passes[i].dst_offset == dst_offset) + return &custom_mapping->passes[i]; + } + + return pvr_create_pass(custom_mapping, dst_offset, src_offset, extend_height); +} + +static void pvr_remove_mapping(struct pvr_transfer_pass *pass, uint32_t idx) +{ + assert(idx < pass->mapping_count); + + for (uint32_t i = idx; i < (pass->mapping_count - 1U); i++) + pass->mappings[i] = pass->mappings[i + 1U]; + + pass->mapping_count--; +} + +static struct pvr_rect_mapping * +pvr_create_mapping(struct pvr_transfer_pass *pass) +{ + assert(pass->mapping_count < ARRAY_SIZE(pass->mappings)); + + return &pass->mappings[pass->mapping_count++]; +} + +/** + * If PBE can't write to surfaces with odd stride, the stride of the + * destination surface is doubled to make it even. The height of the surface is + * halved. The source surface is not resized. Each half of the modified + * destination surface samples every second row from the source surface. This + * only works with nearest filtering. + */ +static bool pvr_double_stride(struct pvr_transfer_pass *pass, uint32_t stride) +{ + struct pvr_rect_mapping *mappings = pass->mappings; + + if (stride == 1U) + return false; + + if (mappings[0U].dst_rect.extent.height == 1U && pass->mapping_count == 1U) { + /* Only one mapping required if height is 1. */ + if ((mappings[0U].dst_rect.offset.y & 1U) != 0U) { + mappings[0U].dst_rect.offset.x += (int32_t)stride; + mappings[0U].dst_rect.offset.y /= 2U; + mappings[0U].dst_rect.extent.height = + (mappings[0U].dst_rect.extent.height + 1U) / 2U; + } else { + mappings[0U].dst_rect.extent.height = + (mappings[0U].dst_rect.offset.y + + mappings[0U].dst_rect.extent.height + 1U) / + 2U - + mappings[0U].dst_rect.offset.y; + mappings[0U].dst_rect.offset.y /= 2U; + } + + return true; + } + + pvr_finishme("Add support for multiple mappings."); + + return false; +} + +static void pvr_unwind_rects(uint32_t width, + uint32_t height, + uint32_t texel_unwind, + bool input, + struct pvr_transfer_pass *pass) +{ + pvr_finishme("Implement pvr_unwind_rects()."); +} + +/** + * Assign clip rects to rectangle mappings. TDM can only do two PBE clip + * rects per screen. + */ +static void +pvr_map_clip_rects(struct pvr_transfer_custom_mapping *custom_mapping) +{ + for (uint32_t i = 0U; i < custom_mapping->pass_count; i++) { + struct pvr_transfer_pass *pass = &custom_mapping->passes[i]; + + pass->clip_rects_count = 0U; + for (uint32_t j = 0U; j < pass->mapping_count; j++) { + struct pvr_rect_mapping *mappings = pass->mappings; + VkRect2D *clip_rects = pass->clip_rects; + bool merged = false; + + /* Try to merge adjacent clip rects.
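 + * For example, dst_rects { 0, 0, 32x32 } and { 32, 0, 32x32 } share a
 + * vertical edge and collapse into the single clip rect { 0, 0, 64x32 }.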
*/ + for (uint32_t k = 0U; k < pass->clip_rects_count; k++) { + if (clip_rects[k].offset.y == mappings[j].dst_rect.offset.y && + clip_rects[k].extent.height == + mappings[j].dst_rect.extent.height && + clip_rects[k].offset.x + clip_rects[k].extent.width == + mappings[j].dst_rect.offset.x) { + clip_rects[k].extent.width += mappings[j].dst_rect.extent.width; + merged = true; + break; + } + + if (clip_rects[k].offset.y == mappings[j].dst_rect.offset.y && + clip_rects[k].extent.height == + mappings[j].dst_rect.extent.height && + clip_rects[k].offset.x == + mappings[j].dst_rect.offset.x + + mappings[j].dst_rect.extent.width) { + clip_rects[k].offset.x = mappings[j].dst_rect.offset.x; + clip_rects[k].extent.width += mappings[j].dst_rect.extent.width; + merged = true; + break; + } + + if (clip_rects[k].offset.x == mappings[j].dst_rect.offset.x && + clip_rects[k].extent.width == + mappings[j].dst_rect.extent.width && + clip_rects[k].offset.y + clip_rects[k].extent.height == + mappings[j].dst_rect.offset.y) { + clip_rects[k].extent.height += + mappings[j].dst_rect.extent.height; + merged = true; + break; + } + + if (clip_rects[k].offset.x == mappings[j].dst_rect.offset.x && + clip_rects[k].extent.width == + mappings[j].dst_rect.extent.width && + clip_rects[k].offset.y == + mappings[j].dst_rect.offset.y + + mappings[j].dst_rect.extent.height) { + clip_rects[k].extent.height += + mappings[j].dst_rect.extent.height; + clip_rects[k].offset.y = mappings[j].dst_rect.offset.y; + merged = true; + break; + } + } + + if (merged) + continue; + + /* Create new pass if needed, TDM can only have 2 clip rects. */ + if (pass->clip_rects_count >= custom_mapping->max_clip_rects) { + struct pvr_transfer_pass *new_pass = + pvr_create_pass(custom_mapping, + pass->dst_offset, + pass->src_offset, + pass->extend_height); + struct pvr_rect_mapping *new_mapping = pvr_create_mapping(new_pass); + + new_pass->clip_rects_count = 1U; + *new_mapping = pass->mappings[j]; + + pvr_remove_mapping(pass, j); + + /* Redo - mapping was replaced. */ + j--; + } else { + pass->clip_rects[pass->clip_rects_count] = + pass->mappings[j].dst_rect; + + pass->clip_rects_count++; + + assert(pass->clip_rects_count <= ARRAY_SIZE(pass->clip_rects)); + } + } + } +} + +static void +pvr_generate_custom_mapping(uint32_t src_stride, + uint32_t src_width, + uint32_t src_height, + uint32_t dst_stride, + uint32_t dst_width, + uint32_t dst_height, + enum pvr_memlayout dst_mem_layout, + struct pvr_transfer_custom_mapping *custom_mapping) +{ + src_stride *= custom_mapping->texel_extend_src; + src_width *= custom_mapping->texel_extend_src; + dst_stride *= custom_mapping->texel_extend_dst; + dst_width *= custom_mapping->texel_extend_dst; + + if (custom_mapping->texel_unwind_src > 0U) { + pvr_unwind_rects(src_stride, + src_height, + custom_mapping->texel_unwind_src, + true, + &custom_mapping->passes[0U]); + } + + if (custom_mapping->double_stride) { + custom_mapping->double_stride = + pvr_double_stride(&custom_mapping->passes[0U], dst_stride); + + dst_stride *= 2U; + } + + pvr_unwind_rects(dst_stride, + dst_height, + custom_mapping->texel_unwind_dst, + false, + &custom_mapping->passes[0U]); + + pvr_map_clip_rects(custom_mapping); + + /* If the last row of the source mapping is sampled, height of the surface + * can only be increased if the new area contains a valid region. Some blits + * are split to two sources. 
+ */ + if (custom_mapping->texel_unwind_src > 0U) { + pvr_finishme("Implement pvr_generate_custom_mapping()."); + } +} + +static bool +pvr_get_custom_mapping(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + uint32_t max_clip_rects, + struct pvr_transfer_custom_mapping *custom_mapping) +{ + const uint32_t dst_bpp = + vk_format_get_blocksizebits(transfer_cmd->dst.vk_format); + struct pvr_transfer_pass *pass; + bool ret; + + custom_mapping->max_clip_rects = max_clip_rects; + custom_mapping->texel_unwind_src = 0U; + custom_mapping->texel_unwind_dst = 0U; + custom_mapping->texel_extend_src = 1U; + custom_mapping->texel_extend_dst = 1U; + custom_mapping->byte_unwind_src = 0U; + custom_mapping->pass_count = 0U; + + custom_mapping->max_clip_size = PVR_MAX_CLIP_SIZE(dev_info); + + ret = pvr_texel_unwind(dst_bpp, + transfer_cmd->dst.dev_addr, + false, + 1U, + &custom_mapping->texel_unwind_dst); + if (!ret) { + custom_mapping->texel_extend_dst = dst_bpp / 8U; + if (transfer_cmd->src_present) { + if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_LINEAR) { + custom_mapping->texel_extend_src = custom_mapping->texel_extend_dst; + } else if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_TWIDDLED && + transfer_cmd->src.height == 1U) { + custom_mapping->texel_extend_src = custom_mapping->texel_extend_dst; + } + } + + ret = pvr_texel_unwind(dst_bpp, + transfer_cmd->dst.dev_addr, + false, + custom_mapping->texel_extend_dst, + &custom_mapping->texel_unwind_dst); + if (!ret) + return false; + } + + if (transfer_cmd->src_present) { + const uint32_t src_bpp = + vk_format_get_blocksizebits(transfer_cmd->src.vk_format); + + ret = pvr_is_surface_aligned(transfer_cmd->src.dev_addr, true, src_bpp); + + if (!ret && (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_LINEAR || + transfer_cmd->src.height == 1U)) { + ret = pvr_texel_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + custom_mapping->texel_extend_src, + &custom_mapping->texel_unwind_src); + } + + if (!ret && dst_bpp != 24U) { + ret = pvr_byte_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + &custom_mapping->byte_unwind_src); + } + + if (!ret) { + custom_mapping->texel_extend_src = dst_bpp / 8U; + custom_mapping->texel_extend_dst = custom_mapping->texel_extend_src; + + ret = pvr_texel_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + custom_mapping->texel_extend_src, + &custom_mapping->texel_unwind_src); + } + + if (!ret) + return false; + } + + VkRect2D rect = transfer_cmd->scissor; + assert( + (rect.offset.x + rect.extent.width) <= custom_mapping->max_clip_size && + (rect.offset.y + rect.extent.height) <= custom_mapping->max_clip_size); + + /* Texel extend only works with strided memory layout, because pixel width is + * changed. Texel unwind only works with strided memory layout. 1D blits are + * allowed. + */ + if (transfer_cmd->src.height > 1U && transfer_cmd->src_present && + (custom_mapping->texel_extend_src > 1U || + custom_mapping->texel_unwind_src > 0U) && + transfer_cmd->src.mem_layout != PVR_MEMLAYOUT_LINEAR) { + return false; + } + + /* Texel extend only works with strided memory layout, because pixel width is + * changed. Texel unwind only works with strided memory layout. 1D blits are + * allowed. 
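 + * For example, a 24 bpp surface may be re-described as three 8 bpp texels
 + * per pixel (texel_extend = 3), which only addresses correctly when rows
 + * are reached through a stride.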
 + */ + if ((custom_mapping->texel_extend_dst > 1U || + custom_mapping->texel_unwind_dst > 0U) && + transfer_cmd->dst.mem_layout != PVR_MEMLAYOUT_LINEAR && + transfer_cmd->dst.height > 1U) { + return false; + } + + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_LINEAR) { + custom_mapping->double_stride = !pvr_is_pbe_stride_aligned( + transfer_cmd->dst.stride * custom_mapping->texel_extend_dst); + } + + if (custom_mapping->byte_unwind_src > 0U || + custom_mapping->texel_unwind_src > 0U || + custom_mapping->texel_unwind_dst > 0U || custom_mapping->double_stride) { + struct pvr_rect_mapping *mapping; + + pass = pvr_acquire_pass(custom_mapping, 0U, 0U, false); + mapping = pvr_create_mapping(pass); + + if (transfer_cmd->src_present) { + *mapping = transfer_cmd->mappings[0U]; + } else { + mapping->src_rect = transfer_cmd->scissor; + mapping->dst_rect = transfer_cmd->scissor; + } + } else { + return false; + } + + if (custom_mapping->texel_extend_src > 1U || + custom_mapping->texel_extend_dst > 1U) { + pass->mappings[0U].src_rect.offset.x *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].src_rect.extent.width *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].dst_rect.offset.x *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].dst_rect.extent.width *= + (int32_t)custom_mapping->texel_extend_dst; + } + + if (transfer_cmd->src_present) { + pvr_generate_custom_mapping(transfer_cmd->src.stride, + transfer_cmd->src.width, + transfer_cmd->src.height, + transfer_cmd->dst.stride, + transfer_cmd->dst.width, + transfer_cmd->dst.height, + transfer_cmd->dst.mem_layout, + custom_mapping); + } else { + pvr_generate_custom_mapping(0U, + 0U, + 0U, + transfer_cmd->dst.stride, + transfer_cmd->dst.width, + transfer_cmd->dst.height, + transfer_cmd->dst.mem_layout, + custom_mapping); + } + + return true; +} + +static void +pvr_modify_command(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t pass_idx, + struct pvr_transfer_cmd *transfer_cmd) +{ + struct pvr_transfer_pass *pass = &custom_mapping->passes[pass_idx]; + uint32_t bpp; + + if (custom_mapping->texel_extend_src > 1U) { + pvr_finishme("Complete pvr_modify_command()."); + } else if (custom_mapping->texel_extend_dst > 1U) { + pvr_finishme("Complete pvr_modify_command()."); + } + + if (custom_mapping->double_stride) { + transfer_cmd->dst.width *= 2U; + transfer_cmd->dst.stride *= 2U; + } + + if (custom_mapping->texel_unwind_src > 0U) { + if (transfer_cmd->src.height == 1U) { + transfer_cmd->src.width += custom_mapping->texel_unwind_src; + transfer_cmd->src.stride += custom_mapping->texel_unwind_src; + } else if (transfer_cmd->src.stride == 1U) { + transfer_cmd->src.height += custom_mapping->texel_unwind_src; + } else { + /* Increase source width by texel unwind. If texel unwind is less than + * the distance between width and stride, the blit can be done with one + * rectangle mapping, but the width of the surface needs to be + * increased in case we sample from the area between width and stride.
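 + * For example, with width 30, stride 32 and texel_unwind 2, the width
 + * below becomes MIN2(30 + 2, 32) = 32, still within the same row.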
 + */ + transfer_cmd->src.width = + MIN2(transfer_cmd->src.width + custom_mapping->texel_unwind_src, + transfer_cmd->src.stride); + } + } + + transfer_cmd->mapping_count = pass->mapping_count; + for (uint32_t i = 0U; i < transfer_cmd->mapping_count; i++) + transfer_cmd->mappings[i] = pass->mappings[i]; + + if (pass->extend_height) + transfer_cmd->src.height += 1U; + + transfer_cmd->src.width = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.width); + transfer_cmd->src.height = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.height); + transfer_cmd->src.stride = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.stride); + + if (transfer_cmd->dst.height == 1U) { + transfer_cmd->dst.width = + transfer_cmd->dst.stride + custom_mapping->texel_unwind_dst; + transfer_cmd->dst.mem_layout = PVR_MEMLAYOUT_TWIDDLED; + } + + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_TWIDDLED) { + transfer_cmd->dst.width = + MIN2((uint32_t)custom_mapping->max_clip_size, transfer_cmd->dst.width); + transfer_cmd->dst.height = MIN2((uint32_t)custom_mapping->max_clip_size, + transfer_cmd->dst.height); + } else { + transfer_cmd->dst.width = MIN2(PVR_MAX_WIDTH, transfer_cmd->dst.width); + } + + if (transfer_cmd->src_present) { + bpp = vk_format_get_blocksizebits(transfer_cmd->src.vk_format); + + transfer_cmd->src.dev_addr.addr -= + custom_mapping->texel_unwind_src * bpp / 8U; + transfer_cmd->src.dev_addr.addr += + MAX2(transfer_cmd->src.sample_count, 1U) * pass->src_offset * bpp / 8U; + } + + bpp = vk_format_get_blocksizebits(transfer_cmd->dst.vk_format); + transfer_cmd->dst.dev_addr.addr -= + custom_mapping->texel_unwind_dst * bpp / 8U; + transfer_cmd->dst.dev_addr.addr += + MAX2(transfer_cmd->dst.sample_count, 1U) * pass->dst_offset * bpp / 8U; + + transfer_cmd->src_present = + transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL ? false : true; +} + +static VkResult pvr_reroute_to_clip(struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct VkRect2D *dst_rect, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + pvr_finishme("Unimplemented path."); + return VK_SUCCESS; +} + +static VkResult pvr_3d_copy_blit(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + const struct pvr_device_info *const dev_info = + &ctx->device->pdevice->dev_info; + + const struct pvr_transfer_blit *blit = &transfer_cmd->blit; + struct pvr_transfer_3d_state *state = &prep_data->state; + struct pvr_transfer_cmd *active_cmd = transfer_cmd; + struct pvr_transfer_cmd int_cmd; + VkResult result; + + state->dont_force_pbe = false; + state->pass_idx = pass_idx; + + pvr_transfer_set_filter(transfer_cmd, state); + + if (transfer_cmd->src_present) { + /* Try to work out a condition to map pixel formats to RAW. That is only + * possible if we don't perform any kind of 2D operation on the blit, since + * we don't know the actual pixel values: it has to be point sampled + * (scaling doesn't matter as long as the blit stays point sampled).
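 + * For example, a point-sampled copy between two 32 bpp surfaces of the
 + * same format can be performed as an R32_UINT -> R32_UINT raw copy of
 + * the texels.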
+ */ + if (transfer_cmd->src.vk_format == transfer_cmd->dst.vk_format && + state->filter == PVR_FILTER_POINT && + transfer_cmd->src.sample_count <= transfer_cmd->dst.sample_count && + (transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) == 0U && + transfer_cmd->blit.alpha.type == PVR_ALPHA_NONE) { + uint32_t bpp; + + int_cmd = *transfer_cmd; + active_cmd = &int_cmd; + bpp = vk_format_get_blocksizebits(int_cmd.dst.vk_format); + + if (bpp > 0U) { + switch (bpp) { + case 8U: + int_cmd.src.vk_format = VK_FORMAT_R8_UINT; + break; + case 16U: + int_cmd.src.vk_format = VK_FORMAT_R8G8_UINT; + break; + case 24U: + int_cmd.src.vk_format = VK_FORMAT_R8G8B8_UINT; + break; + case 32U: + int_cmd.src.vk_format = VK_FORMAT_R32_UINT; + break; + case 48U: + int_cmd.src.vk_format = VK_FORMAT_R16G16B16_UINT; + break; + case 64U: + int_cmd.src.vk_format = VK_FORMAT_R32G32_UINT; + break; + case 96U: + int_cmd.src.vk_format = VK_FORMAT_R32G32B32_UINT; + break; + case 128U: + int_cmd.src.vk_format = VK_FORMAT_R32G32B32A32_UINT; + break; + default: + active_cmd = transfer_cmd; + break; + } + } + + int_cmd.dst.vk_format = int_cmd.src.vk_format; + } + } + + if (pass_idx == 0U) { + pvr_get_custom_mapping(dev_info, active_cmd, 3U, &state->custom_mapping); + + if (state->custom_mapping.texel_extend_src > 1U) + state->custom_mapping.texel_extend_dst = 1U; + } + + if (state->custom_mapping.pass_count > 0U) { + struct pvr_transfer_pass *pass = &state->custom_mapping.passes[pass_idx]; + + if (blit->alpha.type != PVR_ALPHA_NONE) + return vk_error(ctx->device, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (active_cmd != &int_cmd) { + int_cmd = *active_cmd; + active_cmd = &int_cmd; + } + + state->custom_filter = true; + + pvr_modify_command(&state->custom_mapping, pass_idx, active_cmd); + + if (state->custom_mapping.double_stride || pass->mapping_count > 1U) { + result = + pvr_3d_clip_blit(ctx, active_cmd, prep_data, pass_idx, finished_out); + } else { + struct pvr_rect_mapping *mappings = &pass->mappings[0U]; + + mappings[0U].src_rect.offset.x /= + MAX2(1U, state->custom_mapping.texel_extend_dst); + mappings[0U].src_rect.extent.width /= + MAX2(1U, state->custom_mapping.texel_extend_dst); + + if (int_cmd.src_present) { + for (uint32_t i = 0U; i < pass->mapping_count; i++) + active_cmd->mappings[i] = mappings[i]; + } + + active_cmd->scissor = mappings[0U].dst_rect; + + result = pvr_3d_copy_blit_core(ctx, + active_cmd, + prep_data, + pass_idx, + finished_out); + } + + return result; + } + + /* Route DS merge blits to Clip blit. Background object is used to preserve + * the unmerged channel. + */ + if ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) != 0U) { + /* PBE byte mask could be used for DS merge with FastScale. Clearing the + * other channel on a DS merge requires Clip blit. + */ + if (!PVR_HAS_ERN(dev_info, 42064) || + ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U)) { + return pvr_reroute_to_clip(ctx, + active_cmd, + &active_cmd->scissor, + prep_data, + pass_idx, + finished_out); + } + } + + return pvr_3d_copy_blit_core(ctx, + active_cmd, + prep_data, + pass_idx, + finished_out); +} + +static bool pvr_3d_validate_addr(struct pvr_transfer_cmd *transfer_cmd) +{ + /* TODO: Complete this function, based on TQ_3DValidateVaddr. 
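 + * For now every address is accepted; we only emit a finishme and return
 + * true.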
*/ + pvr_finishme("Complete pvr_3d_validate_addr()."); + return true; +} + +static void +pvr_submit_info_stream_init(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_prep_data *prep_data, + struct pvr_winsys_transfer_cmd *cmd) +{ + const struct pvr_winsys_transfer_regs *const regs = &prep_data->state.regs; + const struct pvr_physical_device *const pdevice = ctx->device->pdevice; + const struct pvr_device_info *const dev_info = &pdevice->dev_info; + + uint32_t *stream_ptr = (uint32_t *)cmd->fw_stream; + + *(uint64_t *)stream_ptr = regs->pds_bgnd0_base; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND0_BASE); + + *(uint64_t *)stream_ptr = regs->pds_bgnd1_base; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND1_BASE); + + *(uint64_t *)stream_ptr = regs->pds_bgnd3_sizeinfo; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND3_SIZEINFO); + + *(uint64_t *)stream_ptr = regs->isp_mtile_base; + stream_ptr += pvr_cmd_length(CR_ISP_MTILE_BASE); + + STATIC_ASSERT(ARRAY_SIZE(regs->pbe_wordx_mrty) == 9U); + STATIC_ASSERT(sizeof(regs->pbe_wordx_mrty[0]) == sizeof(uint64_t)); + memcpy(stream_ptr, regs->pbe_wordx_mrty, sizeof(regs->pbe_wordx_mrty)); + stream_ptr += 9U * 2U; + + *stream_ptr = regs->isp_bgobjvals; + stream_ptr += pvr_cmd_length(CR_ISP_BGOBJVALS); + + *stream_ptr = regs->usc_pixel_output_ctrl; + stream_ptr += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL); + + *stream_ptr = regs->usc_clear_register0; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER0); + + *stream_ptr = regs->usc_clear_register1; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER1); + + *stream_ptr = regs->usc_clear_register2; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER2); + + *stream_ptr = regs->usc_clear_register3; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER3); + + *stream_ptr = regs->isp_mtile_size; + stream_ptr += pvr_cmd_length(CR_ISP_MTILE_SIZE); + + *stream_ptr = regs->isp_render_origin; + stream_ptr += pvr_cmd_length(CR_ISP_RENDER_ORIGIN); + + *stream_ptr = regs->isp_ctl; + stream_ptr += pvr_cmd_length(CR_ISP_CTL); + + *stream_ptr = regs->isp_aa; + stream_ptr += pvr_cmd_length(CR_ISP_AA); + + *stream_ptr = regs->event_pixel_pds_info; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO); + + *stream_ptr = regs->event_pixel_pds_code; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_CODE); + + *stream_ptr = regs->event_pixel_pds_data; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA); + + *stream_ptr = regs->isp_render; + stream_ptr += pvr_cmd_length(CR_ISP_RENDER); + + *stream_ptr = regs->isp_rgn; + stream_ptr++; + + if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) { + *stream_ptr = regs->frag_screen; + stream_ptr++; + } + + cmd->fw_stream_len = (uint8_t *)stream_ptr - cmd->fw_stream; + assert(cmd->fw_stream_len <= ARRAY_SIZE(cmd->fw_stream)); +} + +static void pvr_transfer_job_ws_submit_info_init( + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_submit *submit, + struct vk_sync *wait, + struct pvr_winsys_transfer_submit_info *submit_info) +{ + const struct pvr_device *const device = ctx->device; + + submit_info->frame_num = device->global_queue_present_count; + submit_info->job_num = device->global_cmd_buffer_submit_count; + submit_info->wait = wait; + submit_info->cmd_count = submit->prep_count; + + for (uint32_t i = 0U; i < submit->prep_count; i++) { + struct pvr_winsys_transfer_cmd *const cmd = &submit_info->cmds[i]; + struct pvr_transfer_prep_data *prep_data = &submit->prep_array[i]; + + pvr_submit_info_stream_init(ctx, prep_data, cmd); + + cmd->flags = prep_data->flags; + } +} + +static 
VkResult pvr_submit_transfer(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_submit *submit, + struct vk_sync *wait, + struct vk_sync *signal_sync) +{ + struct pvr_winsys_transfer_submit_info submit_info; + + pvr_transfer_job_ws_submit_info_init(ctx, submit, wait, &submit_info); + + return ctx->device->ws->ops->transfer_submit(ctx->ws_ctx, + &submit_info, + &ctx->device->pdevice->dev_info, + signal_sync); +} + +static VkResult pvr_queue_transfer(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct vk_sync *wait, + struct vk_sync *signal_sync) +{ + struct pvr_transfer_prep_data *prep_data = NULL; + struct pvr_transfer_prep_data *prev_prep_data; + struct pvr_transfer_submit submit = { 0U }; + bool finished = false; + uint32_t pass = 0U; + VkResult result; + + /* The transfer queue might decide to do a blit in multiple passes. When the + * prepare doesn't set the finished flag, this code will keep calling the + * prepare with an increasing pass index. If queued transfers are submitted + * from here, we submit them straight away. That's why we only need a single + * prepare for the blit rather than one for each pass. Otherwise we insert + * each prepare into the prepare array. When the client does blit batching + * and we split the blit into multiple passes, each pass in each queued + * transfer adds one more prepare. Thus the prepare array after 2 + * pvr_queue_transfer calls might look like: + * + * +------+------++-------+-------+-------+ + * |B0/P0 |B0/P1 || B1/P0 | B1/P1 | B1/P2 | + * +------+------++-------+-------+-------+ + * F S/U F S/U + * + * Bn/Pm : nth blit (queue transfer call) / mth prepare + * F : fence point + * S/U : update / server sync update point + */ + + while (!finished) { + prev_prep_data = prep_data; + prep_data = &submit.prep_array[submit.prep_count++]; + + /* Clear down the memory before we write to this prep. */ + memset(prep_data, 0U, sizeof(*prep_data)); + + if (pass == 0U) { + if (!pvr_3d_validate_addr(transfer_cmd)) + return vk_error(ctx->device, VK_ERROR_FEATURE_NOT_PRESENT); + } else { + /* Transfer queue workarounds could use more than one pass with the 3D + * path. + */ + prep_data->state = prev_prep_data->state; + } + + if (transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FAST2D) { + result = + pvr_3d_clip_blit(ctx, transfer_cmd, prep_data, pass, &finished); + } else { + result = + pvr_3d_copy_blit(ctx, transfer_cmd, prep_data, pass, &finished); + } + if (result != VK_SUCCESS) + return result; + + /* Submit if we have finished the blit or if we are out of prepares. */ + if (finished || submit.prep_count == ARRAY_SIZE(submit.prep_array)) { + result = pvr_submit_transfer(ctx, + &submit, + wait, + finished ? signal_sync : NULL); + if (result != VK_SUCCESS) + return result; + + /* Check if we need to reset prep_count. */ + if (submit.prep_count == ARRAY_SIZE(submit.prep_array)) + submit.prep_count = 0U; + } + + pass++; + } + + return VK_SUCCESS; +} + +VkResult pvr_transfer_job_submit(struct pvr_transfer_ctx *ctx, + struct pvr_sub_cmd_transfer *sub_cmd, + struct vk_sync *wait_sync, + struct vk_sync *signal_sync) +{ + list_for_each_entry_safe (struct pvr_transfer_cmd, + transfer_cmd, + &sub_cmd->transfer_cmds, + link) { + /* The fw guarantees that any kick on the same context will be + * synchronized in submission order. This means only the first kick must + * wait, and only the last kick needs to signal.
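 + * Intermediate kicks in the list therefore pass NULL for both the wait
 + * and the signal sync.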
 + */ + struct vk_sync *first_cmd_wait_sync = NULL; + struct vk_sync *last_cmd_signal_sync = NULL; + VkResult result; + + if (list_first_entry(&sub_cmd->transfer_cmds, + struct pvr_transfer_cmd, + link) == transfer_cmd) { + first_cmd_wait_sync = wait_sync; + } + + if (list_last_entry(&sub_cmd->transfer_cmds, + struct pvr_transfer_cmd, + link) == transfer_cmd) { + last_cmd_signal_sync = signal_sync; + } + + result = pvr_queue_transfer(ctx, + transfer_cmd, + first_cmd_wait_sync, + last_cmd_signal_sync); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; } diff --git a/src/imagination/vulkan/pvr_job_transfer.h b/src/imagination/vulkan/pvr_job_transfer.h index 0084969..8c3ea4c 100644 --- a/src/imagination/vulkan/pvr_job_transfer.h +++ b/src/imagination/vulkan/pvr_job_transfer.h @@ -27,13 +27,23 @@ #include #include -struct pvr_device; struct pvr_sub_cmd_transfer; struct pvr_transfer_ctx; struct vk_sync; -VkResult pvr_transfer_job_submit(struct pvr_device *device, - struct pvr_transfer_ctx *ctx, +/** + * Destination pixels not covered by any of the destination rectangles but + * inside the scissor are filled with the clear color. + */ +#define PVR_TRANSFER_CMD_FLAGS_FILL 0x00000800U +/** If using TQ3D, route to fast2d. */ +#define PVR_TRANSFER_CMD_FLAGS_FAST2D 0x00200000U +/** Merge a depth or stencil against a depth + stencil texture. */ +#define PVR_TRANSFER_CMD_FLAGS_DSMERGE 0x00000200U +/** Valid if doing a DS merge with depth + stencil to depth + stencil. */ +#define PVR_TRANSFER_CMD_FLAGS_PICKD 0x00000400U + +VkResult pvr_transfer_job_submit(struct pvr_transfer_ctx *ctx, struct pvr_sub_cmd_transfer *sub_cmd, struct vk_sync *wait, struct vk_sync *signal_sync); diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h index c0aa911..2a40d53 100644 --- a/src/imagination/vulkan/pvr_private.h +++ b/src/imagination/vulkan/pvr_private.h @@ -58,6 +58,7 @@ #include "util/macros.h" #include "util/simple_mtx.h" #include "util/u_dynarray.h" +#include "util/u_math.h" #include "vk_buffer.h" #include "vk_command_buffer.h" #include "vk_device.h" @@ -353,16 +354,118 @@ struct pvr_buffer_view { uint64_t texture_state[2]; }; +#define PVR_TRANSFER_MAX_CUSTOM_MAPPINGS 6U + +/** A surface describes a source or destination for a transfer operation. */ +struct pvr_transfer_cmd_surface { + pvr_dev_addr_t dev_addr; + + /* Memory address for extra U/V planes. */ + pvr_dev_addr_t uv_address[2]; + + /* Surface width in texels. */ + uint32_t width; + + /* Surface height in texels. */ + uint32_t height; + + uint32_t depth; + + /* Z position in a 3D texture. 0.0f <= z_position <= depth. */ + float z_position; + + /* Stride in texels. */ + uint32_t stride; + + VkFormat vk_format; + + enum pvr_memlayout mem_layout; + + uint32_t sample_count; +}; + +struct pvr_rect_mapping { + VkRect2D src_rect; + VkRect2D dst_rect; +}; + +/* Describes an alpha-transparency configuration for transfer queue use. */ +struct pvr_transfer_alpha { + enum pvr_alpha_type type; + /* Global alpha value. */ + uint32_t global; + + /* Custom blend op for rgb. */ + uint32_t custom_rgb; + /* Custom blend op for alpha. */ + uint32_t custom_alpha; + /* Custom global alpha value for alpha output. */ + uint32_t global2; + /* Custom multiplication of global and source alpha. */ + bool glob_src_mul; + /* Custom zero source alpha transparency stage. */ + bool zero_src_a_trans; + + /* Enable argb1555 alpha components. */ + bool alpha_components; + /* Source alpha value when argb1555 alpha bit is 0.
 */ + uint32_t component0; + /* Source alpha value when argb1555 alpha bit is 1. */ + uint32_t component1; +}; + +struct pvr_transfer_blit { + /* 16-bit rop4 (i.e. two 8-bit rop3s). */ + uint32_t rop_code; + + /* Color key mask. */ + uint32_t color_mask; + + /* Alpha blend. */ + struct pvr_transfer_alpha alpha; + + VkOffset2D offset; +}; + struct pvr_transfer_cmd { /* Node to link this cmd into the transfer_cmds list in * pvr_sub_cmd::transfer structure. */ struct list_head link; - struct pvr_buffer *src; - struct pvr_buffer *dst; - uint32_t region_count; - VkBufferCopy2 regions[0]; + uint32_t flags; + + struct pvr_transfer_cmd_surface src; + bool src_present; + + union fi clear_color[4]; + + struct pvr_transfer_cmd_surface dst; + + VkRect2D scissor; + + uint32_t mapping_count; + struct pvr_rect_mapping mappings[PVR_TRANSFER_MAX_CUSTOM_MAPPINGS]; + + /* In the case of a simple 1:1 copy, this setting does not affect the output + * but will affect performance. Use clamp to edge when possible. + */ + /* This is of type enum PVRX(TEXSTATE_ADDRMODE). */ + int addr_mode; + + /* Source filtering method. */ + enum pvr_filter filter; + + /* MSAA resolve operation. */ + enum pvr_resolve_op resolve_op; + + struct pvr_transfer_blit blit; + + /* Pointer to cmd buffer this transfer cmd belongs to. This is mainly used + * to link buffer objects allocated during job submission into + * cmd_buffer::bo_list head. + */ + struct pvr_cmd_buffer *cmd_buffer; }; struct pvr_sub_cmd_gfx { diff --git a/src/imagination/vulkan/pvr_queue.c b/src/imagination/vulkan/pvr_queue.c index 4b8ebc3..e029ba4 100644 --- a/src/imagination/vulkan/pvr_queue.c +++ b/src/imagination/vulkan/pvr_queue.c @@ -361,8 +361,7 @@ static VkResult pvr_process_transfer_cmds(struct pvr_device *device, return result; result = - pvr_transfer_job_submit(device, - queue->transfer_ctx, + pvr_transfer_job_submit(queue->transfer_ctx, sub_cmd, queue->next_job_wait_sync[PVR_JOB_TYPE_TRANSFER], sync); diff --git a/src/imagination/vulkan/pvr_tex_state.c b/src/imagination/vulkan/pvr_tex_state.c index 4ef771a..4de0e99 100644 --- a/src/imagination/vulkan/pvr_tex_state.c +++ b/src/imagination/vulkan/pvr_tex_state.c @@ -218,7 +218,8 @@ pvr_pack_tex_state(struct pvr_device *device, if (iview_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) array_layers /= 6; - word1.depth = array_layers - 1; + if (array_layers > 0) + word1.depth = array_layers - 1; } word1.texaddr = PVR_DEV_ADDR_OFFSET(info->addr, info->offset); diff --git a/src/imagination/vulkan/pvr_transfer_frag_store.c b/src/imagination/vulkan/pvr_transfer_frag_store.c new file mode 100644 index 0000000..d1354d4 --- /dev/null +++ b/src/imagination/vulkan/pvr_transfer_frag_store.c @@ -0,0 +1,392 @@ +/* + * Copyright © 2023 Imagination Technologies Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software.
 + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "hwdef/rogue_hw_utils.h" +#include "pvr_bo.h" +#include "pvr_common.h" +#include "pvr_device_info.h" +#include "pvr_job_transfer.h" +#include "pvr_pds.h" +#include "pvr_private.h" +#include "pvr_transfer_frag_store.h" +#include "pvr_types.h" +#include "pvr_uscgen.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "vk_log.h" + +#define PVR_TRANSFER_BYTE_UNWIND_MAX 16U + +struct pvr_transfer_frag_store_entry_data { + pvr_dev_addr_t kick_usc_pds_offset; + struct pvr_bo *kick_usc_pds_upload; + + struct pvr_bo *usc_upload; + struct pvr_tq_frag_sh_reg_layout sh_reg_layout; +}; + +#define to_pvr_entry_data(_entry) \ + _Generic((_entry), \ + struct hash_entry *: (struct pvr_transfer_frag_store_entry_data *)((_entry)->data), \ + const struct hash_entry *: (const struct pvr_transfer_frag_store_entry_data *)((_entry)->data)) + +VkResult pvr_transfer_frag_store_init(struct pvr_device *device, + struct pvr_transfer_frag_store *store) +{ + const struct pvr_device_info *dev_info = &device->pdevice->dev_info; + + *store = (struct pvr_transfer_frag_store){ + .max_multisample = PVR_GET_FEATURE_VALUE(dev_info, max_multisample, 1U), + .hash_table = _mesa_hash_table_create_u32_keys(NULL), + }; + + if (!store->hash_table) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + return VK_SUCCESS; +} + +/** + * \brief Returns a key based on shader properties. + * + * Returns a key that uniquely identifies a transfer fragment shader based on + * the provided shader properties. + * + * Make sure that the unused parts of shader_props are memset to 0; otherwise + * those bits might carry uninitialized data into the key and fail to match a + * later key for the same shader. + */ +static uint32_t pvr_transfer_frag_shader_key( + uint32_t max_multisample, + const struct pvr_tq_shader_properties *shader_props) +{ + const struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + uint32_t resolve_op_num = max_multisample + PVR_RESOLVE_SAMPLE0; + + uint32_t num_layers_bits = util_logbase2_ceil(PVR_TRANSFER_MAX_LAYERS + 1U); + uint32_t layer_float_bits = util_logbase2_ceil(PVR_INT_COORD_SET_FLOATS_NUM); + uint32_t pixel_src_bits = util_logbase2_ceil(PVR_TRANSFER_PBE_PIXEL_SRC_NUM); + uint32_t byte_unwind_bits = util_logbase2_ceil(PVR_TRANSFER_BYTE_UNWIND_MAX); + uint32_t resolve_op_bits = util_logbase2_ceil(resolve_op_num); + uint32_t sample_cnt_bits = util_last_bit(util_logbase2(max_multisample)); + uint32_t hash = 0U; + +#if defined(DEBUG) + uint32_t max_shift = 0U; +# define shift_hash(hash, num) \ + do { \ + max_shift += (num); \ + assert(max_shift <= 32U); \ + \ + (hash) <<= (num); \ + } while (0U) +#else +# define shift_hash(hash, num) hash <<= (num) +#endif + + /* Hash layer info.
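 + * Each field below is packed by shifting the accumulated key left and
 + * OR-ing the field in, so earlier fields end up in the more significant
 + * bits.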
+    */
+
+   shift_hash(hash, layer_float_bits);
+   hash |= (uint32_t)shader_props->layer_props.layer_floats;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->sample;
+
+   /* Two reserved key bits, currently always zero. */
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)false;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)false;
+
+   shift_hash(hash, pixel_src_bits);
+   hash |= (uint32_t)layer->pbe_format;
+
+   shift_hash(hash, resolve_op_bits);
+   hash |= (uint32_t)layer->resolve_op;
+
+   assert(util_is_power_of_two_nonzero(layer->sample_count));
+   shift_hash(hash, sample_cnt_bits);
+   hash |= (uint32_t)util_logbase2(layer->sample_count);
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->msaa;
+
+   shift_hash(hash, byte_unwind_bits);
+   hash |= layer->byte_unwind;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->linear;
+
+   /* End layer info. */
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->full_rate;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->iterated;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->pick_component;
+
+   shift_hash(hash, num_layers_bits);
+   /* Just 1 layer. */
+   hash |= 1;
+
+   shift_hash(hash, 3U);
+   hash |= shader_props->alpha_type;
+
+#undef shift_hash
+
+   return hash;
+}
+
+#define to_hash_table_key(_key) ((void *)(uintptr_t)(_key))
+
+static VkResult pvr_transfer_frag_store_entry_data_compile(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store_entry_data *const entry_data,
+   const struct pvr_tq_shader_properties *shader_props,
+   uint32_t *const num_usc_temps_out)
+{
+   const uint32_t image_desc_offset =
+      offsetof(struct pvr_combined_image_sampler_descriptor, image) / 4;
+   const uint32_t sampler_desc_offset =
+      offsetof(struct pvr_combined_image_sampler_descriptor, sampler) / 4;
+
+   const uint32_t cache_line_size =
+      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
+
+   struct pvr_tq_frag_sh_reg_layout *sh_reg_layout = &entry_data->sh_reg_layout;
+   uint32_t next_free_sh_reg = 0;
+   struct util_dynarray shader;
+   VkResult result;
+
+   /* TODO: Allocate all combined image samplers if needed? Otherwise change the
+    * array to a single descriptor.
+    */
+   sh_reg_layout->combined_image_samplers.offsets[0].image =
+      next_free_sh_reg + image_desc_offset;
+   sh_reg_layout->combined_image_samplers.offsets[0].sampler =
+      next_free_sh_reg + sampler_desc_offset;
+   sh_reg_layout->combined_image_samplers.count = 1;
+   next_free_sh_reg += sizeof(struct pvr_combined_image_sampler_descriptor) / 4;
+
+   /* TODO: Handle dynamic_const_regs used for PVR_INT_COORD_SET_FLOATS_{4,6}, Z
+    * position, texel unwind, etc. when the compiler adds support for them.
+    */
+   sh_reg_layout->dynamic_consts.offset = next_free_sh_reg;
+   sh_reg_layout->dynamic_consts.count = 0;
+
+   sh_reg_layout->driver_total = next_free_sh_reg;
+
+   pvr_uscgen_tq_frag(shader_props,
+                      &entry_data->sh_reg_layout,
+                      num_usc_temps_out,
+                      &shader);
+
+   result = pvr_gpu_upload_usc(device,
+                               util_dynarray_begin(&shader),
+                               util_dynarray_num_elements(&shader, uint8_t),
+                               cache_line_size,
+                               &entry_data->usc_upload);
+   util_dynarray_fini(&shader);
+   if (result != VK_SUCCESS)
+      return result;
+
+   return VK_SUCCESS;
+}
+
+static VkResult pvr_transfer_frag_store_entry_data_create(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   const struct pvr_transfer_frag_store_entry_data **const entry_data_out)
+{
+   struct pvr_pds_kickusc_program kick_usc_pds_prog = { 0 };
+   struct pvr_transfer_frag_store_entry_data *entry_data;
+   pvr_dev_addr_t dev_addr;
+   uint32_t num_usc_temps;
+   VkResult result;
+
+   entry_data = ralloc(store->hash_table, __typeof__(*entry_data));
+   if (!entry_data)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = pvr_transfer_frag_store_entry_data_compile(device,
+                                                       entry_data,
+                                                       shader_props,
+                                                       &num_usc_temps);
+   if (result != VK_SUCCESS)
+      goto err_free_entry;
+
+   dev_addr = entry_data->usc_upload->vma->dev_addr;
+   dev_addr.addr -= device->heaps.usc_heap->base_addr.addr;
+
+   pvr_pds_setup_doutu(&kick_usc_pds_prog.usc_task_control,
+                       dev_addr.addr,
+                       num_usc_temps,
+                       shader_props->full_rate
+                          ? PVRX(PDSINST_DOUTU_SAMPLE_RATE_FULL)
+                          : PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                       false);
+
+   pvr_pds_kick_usc(&kick_usc_pds_prog, NULL, 0U, false, PDS_GENERATE_SIZES);
+
+   result = pvr_bo_alloc(
+      device,
+      device->heaps.pds_heap,
+      (kick_usc_pds_prog.data_size + kick_usc_pds_prog.code_size) * 4,
+      16,
+      PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+      &entry_data->kick_usc_pds_upload);
+   if (result != VK_SUCCESS)
+      goto err_free_usc_upload;
+
+   pvr_pds_kick_usc(&kick_usc_pds_prog,
+                    entry_data->kick_usc_pds_upload->bo->map,
+                    0U,
+                    false,
+                    PDS_GENERATE_CODEDATA_SEGMENTS);
+
+   dev_addr = entry_data->kick_usc_pds_upload->vma->dev_addr;
+   dev_addr.addr -= device->heaps.pds_heap->base_addr.addr;
+   entry_data->kick_usc_pds_offset = dev_addr;
+
+   *entry_data_out = entry_data;
+
+   return VK_SUCCESS;
+
+err_free_usc_upload:
+   pvr_bo_free(device, entry_data->usc_upload);
+
+err_free_entry:
+   ralloc_free(entry_data);
+
+   return result;
+}
+
+static inline void pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(
+   struct pvr_device *device,
+   const struct pvr_transfer_frag_store_entry_data *entry_data)
+{
+   pvr_bo_free(device, entry_data->kick_usc_pds_upload);
+   pvr_bo_free(device, entry_data->usc_upload);
+}
+
+static inline void pvr_transfer_frag_store_entry_data_destroy(
+   struct pvr_device *device,
+   const struct pvr_transfer_frag_store_entry_data *entry_data)
+{
+   pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(device,
+                                                             entry_data);
+   /* Casting away the const :( */
+   ralloc_free((void *)entry_data);
+}
+
+static VkResult pvr_transfer_frag_store_get_entry(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   const struct pvr_transfer_frag_store_entry_data **const entry_data_out)
+{
+   const uint32_t key =
+      pvr_transfer_frag_shader_key(store->max_multisample, shader_props);
+   const struct hash_entry *entry;
+   VkResult result;
+
+   entry = _mesa_hash_table_search(store->hash_table, to_hash_table_key(key));
+   if (!entry) {
+      /* Init so that gcc stops complaining. */
+      const struct pvr_transfer_frag_store_entry_data *entry_data = NULL;
+
+      result = pvr_transfer_frag_store_entry_data_create(device,
+                                                         store,
+                                                         shader_props,
+                                                         &entry_data);
+      if (result != VK_SUCCESS)
+         return result;
+
+      assert(entry_data);
+
+      entry = _mesa_hash_table_insert(store->hash_table,
+                                      to_hash_table_key(key),
+                                      (void *)entry_data);
+      if (!entry) {
+         pvr_transfer_frag_store_entry_data_destroy(device, entry_data);
+         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+   }
+
+   *entry_data_out = to_pvr_entry_data(entry);
+
+   return VK_SUCCESS;
+}
+
+VkResult pvr_transfer_frag_store_get_shader_info(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   pvr_dev_addr_t *const pds_dev_addr_out,
+   const struct pvr_tq_frag_sh_reg_layout **const reg_layout_out)
+{
+   /* Init so that gcc stops complaining. */
+   const struct pvr_transfer_frag_store_entry_data *entry_data = NULL;
+   VkResult result;
+
+   result = pvr_transfer_frag_store_get_entry(device,
+                                              store,
+                                              shader_props,
+                                              &entry_data);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *pds_dev_addr_out = entry_data->kick_usc_pds_offset;
+   *reg_layout_out = &entry_data->sh_reg_layout;
+
+   return VK_SUCCESS;
+}
+
+void pvr_transfer_frag_store_fini(struct pvr_device *device,
+                                  struct pvr_transfer_frag_store *store)
+{
+   hash_table_foreach_remove(store->hash_table, entry)
+   {
+      /* ralloc_free() in _mesa_hash_table_destroy() will free each entry's
+       * memory, so let's not waste extra time freeing and unlinking them one
+       * by one.
+       */
+      pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(
+         device,
+         to_pvr_entry_data(entry));
+   }
+
+   _mesa_hash_table_destroy(store->hash_table, NULL);
+}
diff --git a/src/imagination/vulkan/pvr_transfer_frag_store.h b/src/imagination/vulkan/pvr_transfer_frag_store.h
new file mode 100644
index 0000000..ba1a7ae
--- /dev/null
+++ b/src/imagination/vulkan/pvr_transfer_frag_store.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2023 Imagination Technologies Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PVR_TRANSFER_FRAG_STORE_H
+#define PVR_TRANSFER_FRAG_STORE_H
+
+#include <stdint.h>
+#include <vulkan/vulkan_core.h>
+
+#include "pvr_device_info.h"
+#include "pvr_uscgen.h"
+#include "pvr_types.h"
+#include "util/hash_table.h"
+
+struct pvr_device;
+
+struct pvr_transfer_frag_store {
+   uint32_t max_multisample;
+   /* Hash table mapping keys, produced by pvr_transfer_frag_shader_key(), to
+    * pvr_transfer_frag_store_entry_data entries.
+    */
+   struct hash_table *hash_table;
+};
+
+VkResult pvr_transfer_frag_store_init(struct pvr_device *device,
+                                      struct pvr_transfer_frag_store *store);
+void pvr_transfer_frag_store_fini(struct pvr_device *device,
+                                  struct pvr_transfer_frag_store *store);
+
+VkResult pvr_transfer_frag_store_get_shader_info(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   pvr_dev_addr_t *const pds_dev_addr_out,
+   const struct pvr_tq_frag_sh_reg_layout **const reg_layout_out);
+
+#endif /* PVR_TRANSFER_FRAG_STORE_H */
diff --git a/src/imagination/vulkan/vk_format.h b/src/imagination/vulkan/vk_format.h
index 2153460..4b846ce 100644
--- a/src/imagination/vulkan/vk_format.h
+++ b/src/imagination/vulkan/vk_format.h
@@ -88,4 +88,45 @@ static inline bool vk_format_is_normalized(VkFormat vk_format)
    return true;
 }
 
+static inline uint32_t
+vk_format_get_common_color_channel_count(VkFormat src_format,
+                                         VkFormat dst_format)
+{
+   const struct util_format_description *dst_desc =
+      vk_format_description(dst_format);
+   const struct util_format_description *src_desc =
+      vk_format_description(src_format);
+   uint32_t count = 0;
+
+   /* If the destination format is alpha-only, the only channel it can have
+    * in common with the source is alpha.
+    */
+   if (util_format_is_alpha(vk_format_to_pipe_format(dst_format))) {
+      count = 1;
+   } else if (dst_desc->nr_channels <= src_desc->nr_channels) {
+      for (uint32_t i = 0; i < dst_desc->nr_channels; i++) {
+         enum pipe_swizzle swizzle = dst_desc->swizzle[i];
+
+         if (swizzle > PIPE_SWIZZLE_W)
+            continue;
+
+         for (uint32_t j = 0; j < src_desc->nr_channels; j++) {
+            if (src_desc->swizzle[j] == swizzle) {
+               count++;
+               break;
+            }
+         }
+      }
+   } else {
+      count = dst_desc->nr_channels;
+   }
+
+   return count;
+}
+
+static inline bool vk_format_is_alpha(VkFormat format)
+{
+   return util_format_is_alpha(vk_format_to_pipe_format(format));
+}
+
 #endif /* VK_FORMAT_H */
diff --git a/src/imagination/vulkan/winsys/pvr_winsys.h b/src/imagination/vulkan/winsys/pvr_winsys.h
index 1fa8650..08d4989 100644
--- a/src/imagination/vulkan/winsys/pvr_winsys.h
+++ b/src/imagination/vulkan/winsys/pvr_winsys.h
@@ -258,6 +258,7 @@ struct pvr_winsys_transfer_regs {
    uint32_t event_pixel_pds_code;
    uint32_t event_pixel_pds_data;
    uint32_t event_pixel_pds_info;
+   uint32_t frag_screen;
    uint32_t isp_aa;
    uint32_t isp_bgobjvals;
    uint32_t isp_ctl;
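
For reference, the lifetime of the new store introduced by this patch is: pvr_transfer_frag_store_init() once, any number of pvr_transfer_frag_store_get_shader_info() lookups (each lookup compiles, uploads and caches the shader on first use), then pvr_transfer_frag_store_fini(). The following minimal sketch is not part of the commit; the function name demo_transfer_frag_lookup and the variables `device` and `shader_props` are illustrative, and a real caller (the transfer job submission path) would keep a single store alive for the device's lifetime rather than creating and destroying one per lookup.

/* Hypothetical caller sketch for the transfer fragment shader store.
 * Assumes a valid pvr_device and an already-populated
 * pvr_tq_shader_properties; error handling follows the driver's
 * VkResult conventions.
 */
static VkResult
demo_transfer_frag_lookup(struct pvr_device *device,
                          const struct pvr_tq_shader_properties *shader_props)
{
   struct pvr_transfer_frag_store store;
   const struct pvr_tq_frag_sh_reg_layout *reg_layout;
   pvr_dev_addr_t pds_dev_addr;
   VkResult result;

   result = pvr_transfer_frag_store_init(device, &store);
   if (result != VK_SUCCESS)
      return result;

   /* The first lookup for a given shader key compiles and uploads the USC
    * and PDS programs; later lookups with the same key hit the store's
    * hash table instead.
    */
   result = pvr_transfer_frag_store_get_shader_info(device,
                                                    &store,
                                                    shader_props,
                                                    &pds_dev_addr,
                                                    &reg_layout);
   if (result == VK_SUCCESS) {
      /* pds_dev_addr now holds the heap-relative offset of the PDS kick
       * program, and reg_layout describes where the combined image/sampler
       * descriptors and dynamic constants live in shared registers; both
       * feed into transfer job setup.
       */
   }

   pvr_transfer_frag_store_fini(device, &store);

   return result;
}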