From: Rajnesh Kanwal
Date: Tue, 17 May 2022 16:19:31 +0000 (+0100)
Subject: pvr: Add support to process transfer and blit cmds
X-Git-Tag: upstream/23.3.3~9831
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=480bdff4b526c2aabd86a42ddb02665e9b7c69e2;p=platform%2Fupstream%2Fmesa.git

pvr: Add support to process transfer and blit cmds

Co-authored-by: Karmjit Mahil
Co-authored-by: Matt Coster
Co-authored-by: Sarah Walker
Signed-off-by: Rajnesh Kanwal
Signed-off-by: Karmjit Mahil
Signed-off-by: Matt Coster
Signed-off-by: Sarah Walker
Acked-by: Frank Binns
Part-of:
---

diff --git a/src/imagination/common/pvr_device_info.c b/src/imagination/common/pvr_device_info.c
index d1fd295..c1dd1db 100644
--- a/src/imagination/common/pvr_device_info.c
+++ b/src/imagination/common/pvr_device_info.c
@@ -53,6 +53,8 @@ const struct pvr_device_features pvr_device_features_4_V_2_51 = {
    .has_num_clusters = true,
    .has_num_raster_pipes = true,
    .has_num_user_clip_planes = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_slc_cache_line_size_bits = true,
    .has_slc_mcu_cache_controls = true,
    .has_tf_bicubic_filter = true,
@@ -96,6 +98,7 @@ const struct pvr_device_enhancements pvr_device_enhancements_4_40_2_51 = {
    .has_ern35421 = true,
    .has_ern38020 = true,
    .has_ern38748 = true,
+   .has_ern42064 = true,
    .has_ern42307 = true,
 };
@@ -126,6 +129,7 @@ const struct pvr_device_ident pvr_device_ident_33_V_11_3 = {
 const struct pvr_device_features pvr_device_features_33_V_11_3 = {
    .has_common_store_size_in_dwords = true,
    .has_compute = true,
+   .has_ipf_creq_pf = true,
    .has_isp_max_tiles_in_flight = true,
    .has_isp_samples_per_pixel = true,
    .has_max_instances_per_pds_task = true,
@@ -136,6 +140,8 @@ const struct pvr_device_features pvr_device_features_33_V_11_3 = {
    .has_num_raster_pipes = true,
    .has_num_user_clip_planes = true,
    .has_pbe2_in_xe = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_roguexe = true,
    .has_screen_size8K = true,
    .has_simple_internal_parameter_format = true,
@@ -205,6 +211,7 @@ const struct pvr_device_features pvr_device_features_36_V_104_796 = {
    .has_compute_overlap = true,
    .has_gpu_multicore_support = true,
    .has_gs_rta_support = true,
+   .has_ipf_creq_pf = true,
    .has_isp_max_tiles_in_flight = true,
    .has_isp_samples_per_pixel = true,
    .has_max_instances_per_pds_task = true,
@@ -216,6 +223,8 @@ const struct pvr_device_features pvr_device_features_36_V_104_796 = {
    .has_num_user_clip_planes = true,
    .has_paired_tiles = true,
    .has_pbe2_in_xe = true,
+   .has_pbe_filterable_f16 = true,
+   .has_pbe_yuv = true,
    .has_pds_ddmadt = true,
    .has_roguexe = true,
    .has_screen_size8K = true,
diff --git a/src/imagination/common/pvr_device_info.h b/src/imagination/common/pvr_device_info.h
index 4ed26df..ed8286d 100644
--- a/src/imagination/common/pvr_device_info.h
+++ b/src/imagination/common/pvr_device_info.h
@@ -257,6 +257,7 @@ struct pvr_device_features {
    bool has_eight_output_registers : 1;
    bool has_gpu_multicore_support : 1;
    bool has_gs_rta_support : 1;
+   bool has_ipf_creq_pf : 1;
    bool has_isp_max_tiles_in_flight : 1;
    bool has_isp_samples_per_pixel : 1;
    bool has_max_instances_per_pds_task : 1;
@@ -268,10 +269,13 @@ struct pvr_device_features {
    bool has_num_user_clip_planes : 1;
    bool has_paired_tiles : 1;
    bool has_pbe2_in_xe : 1;
+   bool has_pbe_filterable_f16 : 1;
+   bool has_pbe_yuv : 1;
    bool has_pds_ddmadt : 1;
    bool has_roguexe : 1;
    bool has_screen_size8K : 1;
    bool has_simple_internal_parameter_format : 1;
+   bool has_simple_internal_parameter_format_v1 : 1;
    bool has_simple_internal_parameter_format_v2 : 1;
    bool has_simple_parameter_format_version : 1;
    bool has_slc_cache_line_size_bits : 1;
@@ -327,6 +331,7 @@ struct pvr_device_enhancements {
    bool has_ern35421 : 1;
    bool has_ern38020 : 1;
    bool has_ern38748 : 1;
+   bool has_ern42064 : 1;
    bool has_ern42307 : 1;
    bool has_ern45493 : 1;
 };
diff --git a/src/imagination/common/pvr_util.h b/src/imagination/common/pvr_util.h
index e534bc0..f94a319 100644
--- a/src/imagination/common/pvr_util.h
+++ b/src/imagination/common/pvr_util.h
@@ -27,9 +27,18 @@
 #include
 #include

+#include "pvr_types.h"
+
 #include "util/bitscan.h"
 #include "util/macros.h"

+static inline bool pvr_dev_addr_is_aligned(pvr_dev_addr_t addr,
+                                           const uint32_t alignment)
+{
+   assert(util_is_power_of_two_nonzero(alignment));
+   return ((uintptr_t)(addr.addr) & (alignment - 1)) == 0;
+}
+
 static inline bool ptr_is_aligned(const void *const ptr,
                                   const uint32_t alignment)
 {
diff --git a/src/imagination/csbgen/rogue_cr.xml b/src/imagination/csbgen/rogue_cr.xml
index d1f4aef..a551f95 100644
--- a/src/imagination/csbgen/rogue_cr.xml
+++ b/src/imagination/csbgen/rogue_cr.xml
@@ -499,7 +499,7 @@ SOFTWARE.
   [one XML line changed; element markup stripped by the plain-text rendering and not recoverable]
diff --git a/src/imagination/csbgen/rogue_ipf.xml b/src/imagination/csbgen/rogue_ipf.xml
index 7b71b6c..90d7d06 100644
--- a/src/imagination/csbgen/rogue_ipf.xml
+++ b/src/imagination/csbgen/rogue_ipf.xml
@@ -26,6 +26,40 @@ SOFTWARE.
   [34 added XML lines; element markup stripped by the plain-text rendering and not recoverable]
@@ -37,4 +71,100 @@ SOFTWARE.
   [96 added XML lines; element markup stripped by the plain-text rendering and not recoverable]
diff --git a/src/imagination/include/hwdef/rogue_hw_defs.h b/src/imagination/include/hwdef/rogue_hw_defs.h
index 146bc60..f238b67 100644
--- a/src/imagination/include/hwdef/rogue_hw_defs.h
+++ b/src/imagination/include/hwdef/rogue_hw_defs.h
@@ -125,6 +125,16 @@
  */
 #define ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES 7U

+/* Size of the image state in 64-bit units. */
+#define ROGUE_MAXIMUM_IMAGE_STATE_SIZE_IN_ULONGLONGS 2U
+
+/* Size of the image state in dwords. The last 64-bit word is optional for
+ * non-YUV textures.
+ */
+#define ROGUE_MAXIMUM_IMAGE_STATE_SIZE \
+   (ROGUE_MAXIMUM_IMAGE_STATE_SIZE_IN_ULONGLONGS * \
+    (sizeof(uint64_t) / sizeof(uint32_t)))
+
 #define PVR_NUM_PBE_EMIT_REGS 8U

 #endif /* ROGUE_HW_DEFS_H */
diff --git a/src/imagination/vulkan/meson.build b/src/imagination/vulkan/meson.build
index 1cb990d..c4ce9d3 100644
--- a/src/imagination/vulkan/meson.build
+++ b/src/imagination/vulkan/meson.build
@@ -57,6 +57,7 @@ pvr_files = files(
   'pvr_pass.c',
   'pvr_pipeline.c',
   'pvr_pipeline_cache.c',
+  'pvr_transfer_frag_store.c',
   'pvr_query.c',
   'pvr_query_compute.c',
   'pvr_queue.c',
diff --git a/src/imagination/vulkan/pds/pvr_pds.c b/src/imagination/vulkan/pds/pvr_pds.c
index 9523ea3..5baad53 100644
--- a/src/imagination/vulkan/pds/pvr_pds.c
+++ b/src/imagination/vulkan/pds/pvr_pds.c
@@ -433,22 +433,18 @@ void pvr_pds_pixel_shader_sa_initialize(
  * \param dest_offset Destination offset in the attribute.
  * \param dma_size The size of the DMA in words.
  * \param src_address Source address for the burst.
+ * \param last Last DMA in program.
  * \param dev_info PVR device info structure.
  * \returns The number of DMA transfers required.
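 *
 * A minimal usage sketch (hypothetical names; the array sizes assume the
 * single-kick encoding this helper currently produces):
 *
 *    uint32_t ctrl[1];
 *    uint64_t addr[1];
 *    uint32_t kicks = pvr_pds_encode_dma_burst(ctrl, addr, 0U, 4U,
 *                                              src_addr, true, dev_info);
 *    assert(kicks == 1U);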
*/ - uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, uint64_t *dma_address, uint32_t dest_offset, uint32_t dma_size, uint64_t src_address, + bool last, const struct pvr_device_info *dev_info) { - /* Simplified for MS2. */ - - /* Force to 1 DMA. */ - const uint32_t num_kicks = 1; - dma_control[0] = dma_size << PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_SHIFT; dma_control[0] |= dest_offset @@ -457,12 +453,15 @@ uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, dma_control[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_CMODE_CACHED | PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_DEST_COMMON_STORE; + if (last) + dma_control[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_LAST_EN; + dma_address[0] = src_address; - if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) { + if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls)) dma_address[0] |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC0_SLCMODE_CACHED; - } - return num_kicks; + /* Force to 1 DMA. */ + return 1; } /* FIXME: use the csbgen interface and pvr_csb_pack. diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h index b000e83..00c424a 100644 --- a/src/imagination/vulkan/pds/pvr_pds.h +++ b/src/imagination/vulkan/pds/pvr_pds.h @@ -624,6 +624,7 @@ uint32_t pvr_pds_encode_dma_burst(uint32_t *dma_control, uint32_t dest_offset, uint32_t dma_size, uint64_t src_address, + bool last, const struct pvr_device_info *dev_info); void pvr_pds_setup_doutu(struct pvr_pds_usc_task_control *usc_task_control, diff --git a/src/imagination/vulkan/pvr_blit.c b/src/imagination/vulkan/pvr_blit.c index a13785e..e25da8e 100644 --- a/src/imagination/vulkan/pvr_blit.c +++ b/src/imagination/vulkan/pvr_blit.c @@ -28,6 +28,7 @@ #include "pvr_clear.h" #include "pvr_csb.h" #include "pvr_formats.h" +#include "pvr_job_transfer.h" #include "pvr_private.h" #include "pvr_shader_factory.h" #include "pvr_static_shaders.h" @@ -114,35 +115,161 @@ void pvr_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, assert(!"Unimplemented"); } +static struct pvr_transfer_cmd * +pvr_transfer_cmd_alloc(struct pvr_cmd_buffer *cmd_buffer) +{ + struct pvr_transfer_cmd *transfer_cmd; + + transfer_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc, + sizeof(*transfer_cmd), + 8U, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!transfer_cmd) { + cmd_buffer->state.status = + vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + + /* transfer_cmd->mapping_count is already set to zero. */ + transfer_cmd->filter = PVR_FILTER_POINT; + transfer_cmd->resolve_op = PVR_RESOLVE_BLEND; + transfer_cmd->addr_mode = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + transfer_cmd->cmd_buffer = cmd_buffer; + + return transfer_cmd; +} + +static void pvr_setup_buffer_surface(struct pvr_transfer_cmd_surface *surface, + VkRect2D *rect, + pvr_dev_addr_t dev_addr, + VkDeviceSize offset, + VkFormat vk_format, + uint32_t width, + uint32_t height) +{ + surface->dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, offset); + surface->width = width; + surface->height = height; + surface->stride = width; + surface->vk_format = vk_format; + surface->mem_layout = PVR_MEMLAYOUT_LINEAR; + surface->sample_count = 1; + + /* Initialize rectangle extent. Also, rectangle.offset should be set to + * zero, as the offset is already adjusted in the device address above. We + * don't explicitly set offset to zero as transfer_cmd is zero allocated. 
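+ * For example, pvr_cmd_copy_buffer_region() below splits a 20-byte copy
+ * into a 16-byte VK_FORMAT_R32G32B32A32_UINT pass followed by a 4-byte
+ * VK_FORMAT_R32_UINT pass; each pass arrives here with its byte offset
+ * folded into dev_addr and a zero rectangle offset.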
+ */ + rect->extent.width = width; + rect->extent.height = height; +} + +static VkResult pvr_cmd_copy_buffer_region(struct pvr_cmd_buffer *cmd_buffer, + pvr_dev_addr_t src_addr, + VkDeviceSize src_offset, + pvr_dev_addr_t dst_addr, + VkDeviceSize dst_offset, + VkDeviceSize size) +{ + VkDeviceSize offset = 0; + + while (offset < size) { + VkDeviceSize remaining_size = size - offset; + struct pvr_transfer_cmd *transfer_cmd; + uint32_t texel_width; + VkDeviceSize texels; + VkFormat vk_format; + VkResult result; + uint32_t height; + uint32_t width; + + if (remaining_size >= 16U) { + vk_format = VK_FORMAT_R32G32B32A32_UINT; + texel_width = 16U; + } else if (remaining_size >= 4U) { + vk_format = VK_FORMAT_R32_UINT; + texel_width = 4U; + } else { + vk_format = VK_FORMAT_R8_UINT; + texel_width = 1U; + } + + texels = remaining_size / texel_width; + + /* Try to do max-width rects, fall back to a 1-height rect for the + * remainder. + */ + if (texels > PVR_MAX_TRANSFER_SIZE_IN_TEXELS) { + width = PVR_MAX_TRANSFER_SIZE_IN_TEXELS; + height = texels / PVR_MAX_TRANSFER_SIZE_IN_TEXELS; + height = MIN2(height, PVR_MAX_TRANSFER_SIZE_IN_TEXELS); + } else { + width = texels; + height = 1; + } + + transfer_cmd = pvr_transfer_cmd_alloc(cmd_buffer); + if (!transfer_cmd) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + if (!(transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL)) { + pvr_setup_buffer_surface(&transfer_cmd->src, + &transfer_cmd->mappings[0].src_rect, + src_addr, + offset + src_offset, + vk_format, + width, + height); + transfer_cmd->src_present = true; + } + + pvr_setup_buffer_surface(&transfer_cmd->dst, + &transfer_cmd->scissor, + dst_addr, + offset + dst_offset, + vk_format, + width, + height); + + if (transfer_cmd->src_present) + transfer_cmd->mappings[0].dst_rect = transfer_cmd->scissor; + + transfer_cmd->mapping_count++; + transfer_cmd->cmd_buffer = cmd_buffer; + + result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd); + if (result != VK_SUCCESS) { + vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd); + return result; + } + + offset += width * height * texel_width; + } + + return VK_SUCCESS; +} + void pvr_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2 *pCopyBufferInfo) { PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); PVR_FROM_HANDLE(pvr_buffer, src, pCopyBufferInfo->srcBuffer); PVR_FROM_HANDLE(pvr_buffer, dst, pCopyBufferInfo->dstBuffer); - const size_t regions_size = - pCopyBufferInfo->regionCount * sizeof(*pCopyBufferInfo->pRegions); - struct pvr_transfer_cmd *transfer_cmd; PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); - transfer_cmd = vk_alloc(&cmd_buffer->vk.pool->alloc, - sizeof(*transfer_cmd) + regions_size, - 8U, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!transfer_cmd) { - cmd_buffer->state.status = - vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) { + VkResult result; - return; + result = + pvr_cmd_copy_buffer_region(cmd_buffer, + src->dev_addr, + pCopyBufferInfo->pRegions[i].srcOffset, + dst->dev_addr, + pCopyBufferInfo->pRegions[i].dstOffset, + pCopyBufferInfo->pRegions[i].size); + if (result != VK_SUCCESS) + return; } - - transfer_cmd->src = src; - transfer_cmd->dst = dst; - transfer_cmd->region_count = pCopyBufferInfo->regionCount; - memcpy(transfer_cmd->regions, pCopyBufferInfo->pRegions, regions_size); - - pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd); } /** diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c 
index 7a01b54..c7444ad 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -32,6 +32,7 @@ #include "hwdef/rogue_hw_defs.h" #include "hwdef/rogue_hw_utils.h" #include "pvr_bo.h" +#include "pvr_common.h" #include "pvr_csb.h" #include "pvr_csb_enum_helpers.h" #include "pvr_device_info.h" @@ -604,37 +605,6 @@ err_csb_finish: return result; } -struct pvr_combined_image_sampler_descriptor { - /* | TEXSTATE_IMAGE_WORD0 | TEXSTATE_{STRIDE_,}IMAGE_WORD1 | */ - uint64_t image[ROGUE_NUM_TEXSTATE_IMAGE_WORDS]; - union pvr_sampler_descriptor sampler; -}; - -#define CHECK_STRUCT_FIELD_SIZE(_struct_type, _field_name, _size) \ - static_assert(sizeof(((struct _struct_type *)NULL)->_field_name) == \ - (_size), \ - "Size of '" #_field_name "' in '" #_struct_type \ - "' differs from expected") - -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - ROGUE_NUM_TEXSTATE_IMAGE_WORDS * sizeof(uint64_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - PVR_IMAGE_DESCRIPTOR_SIZE * sizeof(uint32_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + - pvr_cmd_length(TEXSTATE_IMAGE_WORD1)) * - sizeof(uint32_t)); -CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, - image, - (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + - pvr_cmd_length(TEXSTATE_STRIDE_IMAGE_WORD1)) * - sizeof(uint32_t)); - -#undef CHECK_STRUCT_FIELD_SIZE - static VkResult pvr_setup_texture_state_words( struct pvr_device *device, struct pvr_combined_image_sampler_descriptor *descriptor, diff --git a/src/imagination/vulkan/pvr_common.h b/src/imagination/vulkan/pvr_common.h index b6d683d..04caadb 100644 --- a/src/imagination/vulkan/pvr_common.h +++ b/src/imagination/vulkan/pvr_common.h @@ -39,9 +39,11 @@ * relevant for the driver/compiler interface (no Vulkan types). */ +#include "hwdef/rogue_hw_defs.h" #include "pvr_limits.h" #include "pvr_types.h" #include "util/list.h" +#include "util/macros.h" #include "vk_object.h" #include "vk_sync.h" @@ -146,6 +148,13 @@ enum pvr_stage_allocation { PVR_STAGE_ALLOCATION_COUNT }; +enum pvr_filter { + PVR_FILTER_DONTCARE, /* Any filtering mode is acceptable. */ + PVR_FILTER_POINT, + PVR_FILTER_LINEAR, + PVR_FILTER_BICUBIC, +}; + enum pvr_resolve_op { PVR_RESOLVE_BLEND, PVR_RESOLVE_MIN, @@ -202,6 +211,42 @@ union pvr_sampler_descriptor { } data; }; +struct pvr_combined_image_sampler_descriptor { + /* | TEXSTATE_IMAGE_WORD0 | TEXSTATE_{STRIDE_,}IMAGE_WORD1 | */ + uint64_t image[ROGUE_NUM_TEXSTATE_IMAGE_WORDS]; + union pvr_sampler_descriptor sampler; +}; + +#define CHECK_STRUCT_FIELD_SIZE(_struct_type, _field_name, _size) \ + static_assert(sizeof(((struct _struct_type *)NULL)->_field_name) == \ + (_size), \ + "Size of '" #_field_name "' in '" #_struct_type \ + "' differs from expected") + +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + ROGUE_NUM_TEXSTATE_IMAGE_WORDS * sizeof(uint64_t)); +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + PVR_IMAGE_DESCRIPTOR_SIZE * sizeof(uint32_t)); +#if 0 +/* TODO: Don't really want to include pvr_csb.h in here since this header is + * shared with the compiler. Figure out a better place for these. 
+ */ +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + + pvr_cmd_length(TEXSTATE_IMAGE_WORD1)) * + sizeof(uint32_t)); +CHECK_STRUCT_FIELD_SIZE(pvr_combined_image_sampler_descriptor, + image, + (pvr_cmd_length(TEXSTATE_IMAGE_WORD0) + + pvr_cmd_length(TEXSTATE_STRIDE_IMAGE_WORD1)) * + sizeof(uint32_t)); +#endif + +#undef CHECK_STRUCT_FIELD_SIZE + struct pvr_sampler { struct vk_object_base base; diff --git a/src/imagination/vulkan/pvr_device.c b/src/imagination/vulkan/pvr_device.c index 9bec439..7a65deb 100644 --- a/src/imagination/vulkan/pvr_device.c +++ b/src/imagination/vulkan/pvr_device.c @@ -1265,6 +1265,7 @@ static VkResult pvr_pds_idfwdf_programs_create_and_upload( 0, shareds, shareds_buffer_addr.addr, + false, dev_info); /* DMA temp regs. */ diff --git a/src/imagination/vulkan/pvr_job_context.c b/src/imagination/vulkan/pvr_job_context.c index 159ab7c..f809651 100644 --- a/src/imagination/vulkan/pvr_job_context.c +++ b/src/imagination/vulkan/pvr_job_context.c @@ -25,21 +25,25 @@ #include #include #include +#include #include #include "hwdef/rogue_hw_utils.h" #include "pvr_bo.h" #include "pvr_cdm_load_sr.h" +#include "pvr_common.h" #include "pvr_csb.h" #include "pvr_job_context.h" #include "pvr_pds.h" #include "pvr_private.h" +#include "pvr_transfer_frag_store.h" #include "pvr_types.h" #include "pvr_uscgen.h" #include "pvr_vdm_load_sr.h" #include "pvr_vdm_store_sr.h" #include "pvr_winsys.h" #include "util/macros.h" +#include "util/os_file.h" #include "util/u_dynarray.h" #include "vk_alloc.h" #include "vk_log.h" @@ -1259,15 +1263,30 @@ static void pvr_transfer_eot_shaders_fini(struct pvr_device *device, static VkResult pvr_transfer_ctx_shaders_init(struct pvr_device *device, struct pvr_transfer_ctx *ctx) { - /* TODO: Setup USC fragments. 
*/ + VkResult result; + + result = pvr_transfer_frag_store_init(device, &ctx->frag_store); + if (result != VK_SUCCESS) + goto err_out; + + result = pvr_transfer_eot_shaders_init(device, ctx); + if (result != VK_SUCCESS) + goto err_frag_store_fini; - return pvr_transfer_eot_shaders_init(device, ctx); + return VK_SUCCESS; + +err_frag_store_fini: + pvr_transfer_frag_store_fini(device, &ctx->frag_store); + +err_out: + return result; } static void pvr_transfer_ctx_shaders_fini(struct pvr_device *device, struct pvr_transfer_ctx *ctx) { pvr_transfer_eot_shaders_fini(device, ctx); + pvr_transfer_frag_store_fini(device, &ctx->frag_store); } VkResult pvr_transfer_ctx_create(struct pvr_device *const device, diff --git a/src/imagination/vulkan/pvr_job_context.h b/src/imagination/vulkan/pvr_job_context.h index 5a09f9f..b07bee7 100644 --- a/src/imagination/vulkan/pvr_job_context.h +++ b/src/imagination/vulkan/pvr_job_context.h @@ -24,8 +24,11 @@ #ifndef PVR_JOB_CONTEXT_H #define PVR_JOB_CONTEXT_H +#include "pvr_common.h" #include "pvr_private.h" +#include "pvr_transfer_frag_store.h" #include "pvr_types.h" +#include "pvr_uscgen.h" #include "pvr_winsys.h" /* Support PDS code/data loading/storing to the 'B' shared register state @@ -143,6 +146,8 @@ struct pvr_transfer_ctx { struct pvr_winsys_transfer_ctx *ws_ctx; + struct pvr_transfer_frag_store frag_store; + struct pvr_bo *usc_eot_bos[PVR_TRANSFER_MAX_RENDER_TARGETS]; struct pvr_pds_upload pds_unitex_code[PVR_TRANSFER_MAX_TEXSTATE_DMA] diff --git a/src/imagination/vulkan/pvr_job_transfer.c b/src/imagination/vulkan/pvr_job_transfer.c index a0c2e79..44a8cf9 100644 --- a/src/imagination/vulkan/pvr_job_transfer.c +++ b/src/imagination/vulkan/pvr_job_transfer.c @@ -21,84 +21,4931 @@ * SOFTWARE. */ +#include #include -#include #include +#include +#include #include +#include "pvr_csb.h" +#include "pvr_csb_enum_helpers.h" +#include "pvr_formats.h" #include "pvr_job_common.h" #include "pvr_job_context.h" #include "pvr_job_transfer.h" #include "pvr_private.h" +#include "pvr_tex_state.h" +#include "pvr_transfer_frag_store.h" +#include "pvr_types.h" +#include "pvr_uscgen.h" +#include "pvr_util.h" #include "pvr_winsys.h" +#include "util/bitscan.h" #include "util/list.h" #include "util/macros.h" +#include "util/u_math.h" +#include "util/xxhash.h" +#include "vk_format.h" +#include "vk_log.h" #include "vk_sync.h" -/* FIXME: Implement gpu based transfer support. */ -VkResult pvr_transfer_job_submit(struct pvr_device *device, - struct pvr_transfer_ctx *ctx, - struct pvr_sub_cmd_transfer *sub_cmd, - struct vk_sync *wait_sync, - struct vk_sync *signal_sync) +#define PVR_TRANSFER_MAX_PASSES 10U +#define PVR_TRANSFER_MAX_CLIP_RECTS 4U +#define PVR_TRANSFER_MAX_PREPARES_PER_SUBMIT 16U + +/* Number of triangles sent to the TSP per raster. */ +#define PVR_TRANSFER_NUM_LAYERS 1U + +#define PVR_MAX_WIDTH 16384 +#define PVR_MAX_HEIGHT 16384 + +#define PVR_MAX_CLIP_SIZE(dev_info) \ + (PVR_HAS_FEATURE(dev_info, screen_size8K) ? 8192U : 16384U) + +enum pvr_paired_tiles { + PVR_PAIRED_TILES_NONE, + PVR_PAIRED_TILES_X, + PVR_PAIRED_TILES_Y +}; + +struct pvr_transfer_pass { + uint32_t dst_offset; + + uint32_t src_offset; + uint32_t mapping_count; + struct pvr_rect_mapping mappings[PVR_TRANSFER_MAX_CUSTOM_MAPPINGS]; + bool extend_height; + bool byte_unwind; + + uint32_t clip_rects_count; + VkRect2D clip_rects[PVR_TRANSFER_MAX_CLIP_RECTS]; +}; + +/* Structure representing a layer iteration. 
*/ +struct pvr_transfer_custom_mapping { + bool double_stride; + uint32_t texel_unwind_src; + uint32_t texel_unwind_dst; + uint32_t texel_extend_src; + uint32_t texel_extend_dst; + uint32_t pass_count; + struct pvr_transfer_pass passes[PVR_TRANSFER_MAX_PASSES]; + uint32_t max_clip_rects; + int32_t max_clip_size; + uint32_t byte_unwind_src; +}; + +struct pvr_transfer_3d_iteration { + uint32_t texture_coords[12]; +}; + +struct pvr_transfer_3d_state { + struct pvr_winsys_transfer_regs regs; + + bool empty_dst; + bool down_scale; + /* Write all channels present in the dst from the USC even if those are + * constants. + */ + bool dont_force_pbe; + + /* The rate of the shader. */ + uint32_t msaa_multiplier; + /* Top left corner of the render in ISP tiles. */ + uint32_t origin_x_in_tiles; + /* Top left corner of the render in ISP tiles. */ + uint32_t origin_y_in_tiles; + /* Width of the render in ISP tiles. */ + uint32_t width_in_tiles; + /* Height of the render in ISP tiles. */ + uint32_t height_in_tiles; + + /* Width of a sample in registers (pixel partition width). */ + uint32_t usc_pixel_width; + + /* Properties of the USC shader. */ + struct pvr_tq_shader_properties shader_props; + + /* TODO: Use pvr_dev_addr_t of an offset type for these. */ + uint32_t pds_shader_task_offset; + uint32_t tex_state_data_offset; + uint32_t uni_tex_code_offset; + + uint32_t uniform_data_size; + uint32_t tex_state_data_size; + uint32_t usc_coeff_regs; + + /* Pointer into the common store. */ + uint32_t common_ptr; + /* Pointer into the dynamic constant reg buffer. */ + uint32_t dynamic_const_reg_ptr; + /* Pointer into the USC constant reg buffer. */ + uint32_t usc_const_reg_ptr; + + uint32_t pds_coeff_task_offset; + uint32_t coeff_data_size; + + /* Number of temporary 32bit registers used by PDS. */ + uint32_t pds_temps; + + struct pvr_transfer_custom_mapping custom_mapping; + uint32_t pass_idx; + + enum pvr_filter filter; + bool custom_filter; + + enum pvr_paired_tiles pair_tiles; +}; + +struct pvr_transfer_prep_data { + uint32_t flags; + struct pvr_transfer_3d_state state; +}; + +struct pvr_transfer_submit { + uint32_t prep_count; + struct pvr_transfer_prep_data + prep_array[PVR_TRANSFER_MAX_PREPARES_PER_SUBMIT]; +}; + +static enum pvr_transfer_pbe_pixel_src pvr_pbe_src_format_raw(VkFormat format) +{ + uint32_t bpp = vk_format_get_blocksizebits(format); + + if (bpp <= 32U) + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW32; + else if (bpp <= 64U) + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + + return PVR_TRANSFER_PBE_PIXEL_SRC_RAW128; +} + +/** + * How the PBE expects the output buffer for an RGBA space conversion. 
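+ *
+ * For example, copying R8G8B8A8_UINT to R8G8B8A8_SINT (unsigned source,
+ * signed destination, 8-bit channels) selects
+ * PVR_TRANSFER_PBE_PIXEL_SRC_US8888 in the switch below.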
+ */ +static VkResult +pvr_pbe_src_format_normal(VkFormat src_format, + VkFormat dst_format, + bool down_scale, + bool dont_force_pbe, + enum pvr_transfer_pbe_pixel_src *src_format_out) +{ + bool dst_signed = vk_format_is_sint(dst_format); + + if (vk_format_is_int(dst_format)) { + uint32_t red_width; + bool src_signed; + uint32_t count; + + if (!vk_format_is_int(src_format)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + src_signed = vk_format_is_sint(src_format); + + red_width = vk_format_get_component_bits(dst_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0); + + switch (red_width) { + case 8: + if (!src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_UU8888; + else if (src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SU8888; + else if (!src_signed && dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_US8888; + else + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SS8888; + + break; + + case 10: + switch (dst_format) { + case VK_FORMAT_A2B10G10R10_UINT_PACK32: + *src_format_out = src_signed ? PVR_TRANSFER_PBE_PIXEL_SRC_SU1010102 + : PVR_TRANSFER_PBE_PIXEL_SRC_UU1010102; + break; + + case VK_FORMAT_A2R10G10B10_UINT_PACK32: + *src_format_out = src_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_SU1010102 + : PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_UU1010102; + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + break; + + case 16: + if (!src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16; + else if (src_signed && !dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SU16U16; + else if (!src_signed && dst_signed) + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_US16S16; + else + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16; + + break; + + case 32: + if (dont_force_pbe) { + count = vk_format_get_blocksizebits(dst_format) / 32U; + } else { + count = + vk_format_get_common_color_channel_count(src_format, dst_format); + } + + if (!src_signed && !dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_RAW128 + : PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + } else if (src_signed && !dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_S4XU32 + : PVR_TRANSFER_PBE_PIXEL_SRC_SU32U32; + } else if (!src_signed && dst_signed) { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_U4XS32 + : PVR_TRANSFER_PBE_PIXEL_SRC_US32S32; + } else { + *src_format_out = (count > 2U) ? PVR_TRANSFER_PBE_PIXEL_SRC_RAW128 + : PVR_TRANSFER_PBE_PIXEL_SRC_RAW64; + } + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + } else if (vk_format_is_float(dst_format) || + vk_format_is_normalized(dst_format)) { + bool is_float = true; + + if (!vk_format_is_float(src_format) && + !vk_format_is_normalized(src_format)) { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + if (vk_format_is_normalized(dst_format)) { + uint32_t chan_width; + + is_float = false; + + /* Alpha only. */ + switch (dst_format) { + case VK_FORMAT_D16_UNORM: + chan_width = 16; + break; + + default: + chan_width = + vk_format_get_component_bits(dst_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0U); + break; + } + + if (src_format == dst_format) { + switch (chan_width) { + case 16U: + if (down_scale) { + *src_format_out = dst_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM + : PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM; + } else { + *src_format_out = dst_signed + ? 
PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16 + : PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16; + } + break; + + case 32U: + *src_format_out = pvr_pbe_src_format_raw(dst_format); + break; + default: + is_float = true; + break; + } + } else { + switch (chan_width) { + case 16U: + *src_format_out = dst_signed + ? PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM + : PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM; + break; + default: + is_float = true; + break; + } + } + } + + if (is_float) { + if (vk_format_has_32bit_component(dst_format)) { + uint32_t count; + + if (dont_force_pbe) { + count = vk_format_get_blocksizebits(dst_format) / 32U; + } else { + count = vk_format_get_common_color_channel_count(src_format, + dst_format); + } + + switch (count) { + case 1U: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32; + break; + case 2U: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32X2; + break; + default: + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F32X4; + break; + } + } else { + if (dst_format == VK_FORMAT_B8G8R8A8_UNORM || + dst_format == VK_FORMAT_R8G8B8A8_UNORM) { + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8; + } else { + *src_format_out = PVR_TRANSFER_PBE_PIXEL_SRC_F16F16; + } + } + } + } else { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + return VK_SUCCESS; +} + +static inline uint32_t +pvr_get_blit_flags(const struct pvr_transfer_cmd *transfer_cmd) +{ + return transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FAST2D + ? 0 + : transfer_cmd->flags; +} + +static VkResult pvr_pbe_src_format(struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_tq_shader_properties *prop) +{ + struct pvr_tq_layer_properties *layer = &prop->layer_props; + VkFormat dst_format = transfer_cmd->dst.vk_format; + VkFormat src_format; + bool down_scale; + + if (transfer_cmd->src_present) { + src_format = transfer_cmd->src.vk_format; + down_scale = transfer_cmd->resolve_op == PVR_RESOLVE_BLEND && + transfer_cmd->src.sample_count > 1U && + transfer_cmd->dst.sample_count <= 1U; + } else { + src_format = dst_format; + down_scale = false; + } + + /* This has to come before the rest as S8 for instance is integer and + * signedness check fails on D24S8. + */ + if (!down_scale && + (vk_format_is_depth_or_stencil(src_format) || + vk_format_is_depth_or_stencil(dst_format) || + pvr_get_blit_flags(transfer_cmd) & PVR_TRANSFER_CMD_FLAGS_DSMERGE)) { + pvr_finishme("Complete pvr_pbe_src_format()."); + } + + return pvr_pbe_src_format_normal(src_format, + dst_format, + down_scale, + state->dont_force_pbe, + &layer->pbe_format); +} + +static inline void pvr_setup_hwbg_object(const struct pvr_device_info *dev_info, + struct pvr_transfer_3d_state *state) +{ + struct pvr_winsys_transfer_regs *regs = &state->regs; + + pvr_csb_pack (®s->pds_bgnd0_base, CR_PDS_BGRND0_BASE, reg) { + reg.shader_addr = PVR_DEV_ADDR(state->pds_shader_task_offset); + assert(pvr_dev_addr_is_aligned( + reg.shader_addr, + PVRX(CR_PDS_BGRND0_BASE_SHADER_ADDR_ALIGNMENT))); + reg.texunicode_addr = PVR_DEV_ADDR(state->uni_tex_code_offset); + assert(pvr_dev_addr_is_aligned( + reg.texunicode_addr, + PVRX(CR_PDS_BGRND0_BASE_TEXUNICODE_ADDR_ALIGNMENT))); + } + + pvr_csb_pack (®s->pds_bgnd1_base, CR_PDS_BGRND1_BASE, reg) { + reg.texturedata_addr = PVR_DEV_ADDR(state->tex_state_data_offset); + assert(pvr_dev_addr_is_aligned( + reg.texturedata_addr, + PVRX(CR_PDS_BGRND1_BASE_TEXTUREDATA_ADDR_ALIGNMENT))); + } + + /* BGRND 2 not needed, background object PDS doesn't use uniform program. 
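+ * To summarise the words packed here: BGRND0 holds the PDS shader and
+ * texunicode program bases, BGRND1 the texture state data base, and
+ * BGRND3 the shared/uniform/texture-state/temp size fields.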
*/ + + pvr_csb_pack (®s->pds_bgnd3_sizeinfo, CR_PDS_BGRND3_SIZEINFO, reg) { + reg.usc_sharedsize = + DIV_ROUND_UP(state->common_ptr, + PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE)); + + assert(!(state->uniform_data_size & + (PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_UNIFORMSIZE_UNIT_SIZE) - 1))); + reg.pds_uniformsize = + state->uniform_data_size / + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_UNIFORMSIZE_UNIT_SIZE); + + assert( + !(state->tex_state_data_size & + (PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE) - 1))); + reg.pds_texturestatesize = + state->tex_state_data_size / + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE); + + reg.pds_tempsize = + DIV_ROUND_UP(state->pds_temps, + PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE)); + } +} + +static inline bool +pvr_is_surface_aligned(pvr_dev_addr_t dev_addr, bool is_input, uint32_t bpp) { + /* 96 bpp is 32 bit granular. */ + if (bpp == 64U || bpp == 128U) { + uint64_t mask = (uint64_t)((bpp >> 3U) - 1U); + + if ((dev_addr.addr & mask) != 0ULL) + return false; + } + + if (is_input) { + if ((dev_addr.addr & + (PVRX(TEXSTATE_STRIDE_IMAGE_WORD1_TEXADDR_ALIGNMENT) - 1U)) != + 0ULL) { + return false; + } + } else { + if ((dev_addr.addr & + (PVRX(PBESTATE_STATE_WORD0_ADDRESS_LOW_ALIGNMENT) - 1U)) != 0ULL) { + return false; + } + } + + return true; +} + +static inline VkResult +pvr_mem_layout_spec(const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + bool is_input, + uint32_t *width_out, + uint32_t *height_out, + uint32_t *stride_out, + enum pvr_memlayout *mem_layout_out, + pvr_dev_addr_t *dev_addr_out) +{ + const uint32_t bpp = vk_format_get_blocksizebits(surface->vk_format); + uint32_t unsigned_stride; + + *mem_layout_out = surface->mem_layout; + *height_out = surface->height; + *width_out = surface->width; + *stride_out = surface->stride; + *dev_addr_out = surface->dev_addr; + + if (surface->mem_layout != PVR_MEMLAYOUT_LINEAR && + !pvr_is_surface_aligned(*dev_addr_out, is_input, bpp)) { + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + switch (surface->mem_layout) { + case PVR_MEMLAYOUT_LINEAR: + if (surface->stride == 0U) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + unsigned_stride = *stride_out; + + if (!pvr_is_surface_aligned(*dev_addr_out, is_input, bpp)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (unsigned_stride < *width_out) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (!is_input) { + if (unsigned_stride == 1U) { + /* Change the setup to twiddling as that doesn't hit the stride + * limit and twiddled == strided when 1px stride. 
+ */ + *mem_layout_out = PVR_MEMLAYOUT_TWIDDLED; + } + } + + *stride_out = unsigned_stride; + break; + + case PVR_MEMLAYOUT_TWIDDLED: + case PVR_MEMLAYOUT_3DTWIDDLED: + if (surface->stride != 0U) + mesa_logi("Ignoring stride value for twiddled/tiled surface!"); + + *stride_out = *width_out; + break; + + default: + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + return VK_SUCCESS; +} + +static VkResult +pvr_pbe_setup_codegen_defaults(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_pbe_surf_params *surface_params, + struct pvr_pbe_render_params *render_params) +{ + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + const uint8_t *swizzle; + VkFormat format; VkResult result; - result = vk_sync_wait(&device->vk, - wait_sync, - 0U, - VK_SYNC_WAIT_COMPLETE, - UINT64_MAX); + switch (dst->vk_format) { + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + format = VK_FORMAT_R32_UINT; + break; + + default: + format = dst->vk_format; + break; + } + + swizzle = pvr_get_format_swizzle(format); + memcpy(surface_params->swizzle, swizzle, sizeof(surface_params->swizzle)); + + pvr_pbe_get_src_format_and_gamma(format, + PVR_PBE_GAMMA_NONE, + false, + &surface_params->source_format, + &surface_params->gamma); + + surface_params->is_normalized = vk_format_is_normalized(format); + surface_params->pbe_packmode = pvr_get_pbe_packmode(format); + surface_params->nr_components = vk_format_get_nr_components(format); + + result = pvr_mem_layout_spec(dst, + 0U, + false, + &surface_params->width, + &surface_params->height, + &surface_params->stride, + &surface_params->mem_layout, + &surface_params->addr); if (result != VK_SUCCESS) return result; - list_for_each_entry_safe (struct pvr_transfer_cmd, - transfer_cmd, - &sub_cmd->transfer_cmds, - link) { - bool src_mapped = false; - bool dst_mapped = false; - void *src_addr; - void *dst_addr; - void *ret_ptr; + surface_params->z_only_render = false; + surface_params->gamma = PVR_PBE_GAMMA_NONE; + surface_params->depth = dst->depth; + surface_params->down_scale = state->down_scale; + + if (surface_params->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + render_params->slice = (uint32_t)MAX2(dst->z_position, 0.0f); + else + render_params->slice = 0U; + + uint32_t tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 0U); + uint32_t tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 0U); + + /* If the rectangle happens to be empty / off-screen we clip away + * everything. 
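+ * This is done by placing the clip rectangle wholly outside the
+ * 1x1-tile render area programmed below, so no tile emits pixels.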
+ */ + if (state->empty_dst) { + render_params->min_x_clip = 2U * tile_size_x; + render_params->max_x_clip = 3U * tile_size_x; + render_params->min_y_clip = 2U * tile_size_y; + render_params->max_y_clip = 3U * tile_size_y; + state->origin_x_in_tiles = 0U; + state->origin_y_in_tiles = 0U; + state->height_in_tiles = 1U; + state->width_in_tiles = 1U; + } else { + const VkRect2D *scissor = &transfer_cmd->scissor; + + /* Clamp */ + render_params->min_x_clip = + MAX2(MIN2(scissor->offset.x, (int32_t)surface_params->width), 0U); + render_params->max_x_clip = + MAX2(MIN2(scissor->offset.x + scissor->extent.width, + (int32_t)surface_params->width), + 0U) - + 1U; + + render_params->min_y_clip = + MAX2(MIN2(scissor->offset.y, surface_params->height), 0U); + render_params->max_y_clip = + MAX2(MIN2(scissor->offset.y + scissor->extent.height, + surface_params->height), + 0U) - + 1U; + + if (state->custom_mapping.pass_count > 0U) { + struct pvr_transfer_pass *pass = + &state->custom_mapping.passes[state->pass_idx]; + + render_params->min_x_clip = (uint32_t)pass->clip_rects[0U].offset.x; + render_params->max_x_clip = + (uint32_t)(pass->clip_rects[0U].offset.x + + pass->clip_rects[0U].extent.width) - + 1U; + render_params->min_y_clip = (uint32_t)pass->clip_rects[0U].offset.y; + render_params->max_y_clip = + (uint32_t)(pass->clip_rects[0U].offset.y + + pass->clip_rects[0U].extent.height) - + 1U; + } + + state->origin_x_in_tiles = render_params->min_x_clip / tile_size_x; + state->origin_y_in_tiles = render_params->min_y_clip / tile_size_y; + state->width_in_tiles = + (render_params->max_x_clip + tile_size_x) / tile_size_x; + state->height_in_tiles = + (render_params->max_y_clip + tile_size_y) / tile_size_y; + + /* Be careful here as this isn't the same as ((max_x_clip - + * min_x_clip) + tile_size_x) >> tile_size_x. 
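+ * e.g. tile_size_x = 32, min_x_clip = 40, max_x_clip = 100 covers tiles
+ * 1..3, and (100 + 32) / 32 - 40 / 32 = 4 - 1 = 3 tiles, whereas
+ * ((100 - 40) + 32) / 32 = 2 would drop a tile.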
+ */ + state->width_in_tiles -= state->origin_x_in_tiles; + state->height_in_tiles -= state->origin_y_in_tiles; + } + + render_params->source_start = PVR_PBE_STARTPOS_BIT0; + render_params->mrt_index = 0U; + + return VK_SUCCESS; +} + +static VkResult +pvr_pbe_setup_modify_defaults(const struct pvr_transfer_cmd_surface *dst, + struct pvr_transfer_3d_state *state, + uint32_t rt_idx, + struct pvr_pbe_surf_params *surf_params, + struct pvr_pbe_render_params *render_params) +{ + pvr_finishme("Implement pvr_pbe_setup_modify_defaults()."); + return VK_SUCCESS; +} + +static uint32_t +pvr_pbe_get_pixel_size(enum pvr_transfer_pbe_pixel_src pixel_format) +{ + switch (pixel_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D24_D32: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_S8D24_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_SU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_RBSWAP_UU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SS8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_SWAP_LMSB: + case PVR_TRANSFER_PBE_PIXEL_SRC_US8888: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU1010102: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU8888: + return 1U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_MOV_BY45: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW64: + case PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SS16S16: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU16U16: + case PVR_TRANSFER_PBE_PIXEL_SRC_SU32U32: + case PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_US16S16: + case PVR_TRANSFER_PBE_PIXEL_SRC_US32S32: + case PVR_TRANSFER_PBE_PIXEL_SRC_UU16U16: + return 2U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW128: + case PVR_TRANSFER_PBE_PIXEL_SRC_S4XU32: + case PVR_TRANSFER_PBE_PIXEL_SRC_U4XS32: + return 4U; + + case PVR_TRANSFER_PBE_PIXEL_SRC_NUM: + default: + break; + } + + return 0U; +} - /* Map if bo is not mapped. 
*/ - if (!transfer_cmd->src->vma->bo->map) { - src_mapped = true; - ret_ptr = device->ws->ops->buffer_map(transfer_cmd->src->vma->bo); - if (!ret_ptr) - return vk_error(device, VK_ERROR_MEMORY_MAP_FAILED); +static void pvr_pbe_setup_swizzle(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + struct pvr_pbe_surf_params *surf_params) +{ + bool color_fill = !!(transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL); + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + + const uint32_t pixel_size = + pvr_pbe_get_pixel_size(state->shader_props.layer_props.pbe_format); + + state->usc_pixel_width = MAX2(pixel_size, 1U); + + switch (dst->vk_format) { + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + unreachable("Handle depth stencil format swizzle."); + break; + + default: { + const uint32_t red_width = + vk_format_get_component_bits(dst->vk_format, + UTIL_FORMAT_COLORSPACE_RGB, + 0U); + + if (transfer_cmd->src_present && vk_format_is_alpha(dst->vk_format)) { + if (vk_format_has_alpha(transfer_cmd->src.vk_format)) { + /* Modify the destination format swizzle to always source from + * src0. + */ + surf_params->swizzle[0U] = PIPE_SWIZZLE_X; + surf_params->swizzle[1U] = PIPE_SWIZZLE_0; + surf_params->swizzle[2U] = PIPE_SWIZZLE_0; + surf_params->swizzle[3U] = PIPE_SWIZZLE_1; + break; + } + + /* Source format having no alpha channel still allocates 4 output + * buffer registers. + */ + } + + if (vk_format_is_normalized(dst->vk_format)) { + if (color_fill && (dst->vk_format == VK_FORMAT_B8G8R8A8_UNORM || + dst->vk_format == VK_FORMAT_R8G8B8A8_UNORM)) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_8_PER_CHANNEL); + } else if (state->shader_props.layer_props.pbe_format == + PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_8_PER_CHANNEL); + } else if (red_width <= 8U) { + surf_params->source_format = + PVRX(PBESTATE_SOURCE_FORMAT_F16_PER_CHANNEL); + } + } else if (red_width == 32U && !state->dont_force_pbe) { + uint32_t count = 0U; + + if (transfer_cmd->src_present) { + VkFormat src_format = transfer_cmd->src.vk_format; + count = vk_format_get_common_color_channel_count(src_format, + dst->vk_format); + } + + switch (count) { + case 1U: + surf_params->swizzle[1U] = PIPE_SWIZZLE_0; + FALLTHROUGH; + case 2U: + surf_params->swizzle[2U] = PIPE_SWIZZLE_0; + FALLTHROUGH; + case 3U: + surf_params->swizzle[3U] = PIPE_SWIZZLE_1; + break; + + case 4U: + default: + break; + } + } else { + unreachable("Invalid case in pvr_pbe_setup_swizzle."); + } + break; + } + } +} + +/** + * Calculates the required PBE byte mask based on the incoming transfer command. 
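+ * For a depth/stencil merge the set bits disable byte writes: with
+ * VK_FORMAT_D24_UNORM_S8_UINT, 0x88888888U flags the stencil byte of each
+ * packed pixel, so PICKD leaves stencil intact and updates depth, while
+ * the inverted mask does the opposite.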
+ *
+ * @param transfer_cmd the transfer command
+ * @return the bytemask (active-high disable mask)
+ */
+
+static uint64_t pvr_pbe_byte_mask(const struct pvr_device_info *dev_info,
+                                  const struct pvr_transfer_cmd *transfer_cmd)
+{
+   uint32_t flags = pvr_get_blit_flags(transfer_cmd);
+
+   assert(PVR_HAS_ERN(dev_info, 42064));
+
+   if (flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) {
+      uint32_t mask = 0U;
+
+      switch (transfer_cmd->dst.vk_format) {
+      case VK_FORMAT_D32_SFLOAT_S8_UINT:
+         mask = 0xF0F0F0F0U;
+         break;
+      case VK_FORMAT_D24_UNORM_S8_UINT:
+         mask = 0x88888888U;
+         break;
+      default:
+         break;
+      }
+
+      if ((flags & PVR_TRANSFER_CMD_FLAGS_PICKD) == 0U)
+         mask = ~mask;
+
+      return mask;
+   }
+
+   /* The mask is returned as if it was inactive on cores without the ERN.
+    * This keeps the firmware agnostic to the feature.
+    */
+   return 0U;
+}
+
+static VkResult pvr_pbe_setup_emit(const struct pvr_transfer_cmd *transfer_cmd,
+                                   struct pvr_transfer_ctx *ctx,
+                                   struct pvr_transfer_3d_state *state,
+                                   uint32_t rt_count,
+                                   uint32_t *pbe_setup_words)
+{
+   struct pvr_device *const device = ctx->device;
+   const struct pvr_device_info *const dev_info = &device->pdevice->dev_info;
+
+   struct pvr_winsys_transfer_regs *regs = &state->regs;
+   struct pvr_pds_event_program program = {
+      .emit_words = pbe_setup_words,
+      .num_emit_word_pairs = rt_count,
+   };
+   struct pvr_pds_upload pds_upload;
+   uint32_t staging_buffer_size;
+   uint32_t *staging_buffer;
+   pvr_dev_addr_t addr;
+   VkResult result;
+
+   /* Precondition: make sure to use a valid index for ctx->usc_eot_bos. */
+   assert(rt_count <= ARRAY_SIZE(ctx->usc_eot_bos));
+   assert(rt_count > 0U);
+
+   addr.addr = ctx->usc_eot_bos[rt_count - 1U]->vma->dev_addr.addr -
+               device->heaps.usc_heap->base_addr.addr;
+
+   pvr_pds_setup_doutu(&program.task_control,
+                       addr.addr,
+                       0U,
+                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                       false);
+
+   pvr_pds_set_sizes_pixel_event(&program, dev_info);
+
+   staging_buffer_size =
+      (program.code_size + program.data_size) * sizeof(*staging_buffer);
+
+   staging_buffer = vk_alloc(&device->vk.alloc,
+                             staging_buffer_size,
+                             8U,
+                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!staging_buffer)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   pvr_pds_generate_pixel_event_data_segment(&program,
+                                             staging_buffer,
+                                             dev_info);
+
+   /* TODO: We can save some memory by generating a code segment for each
+    * rt_count, which at the time of writing is a maximum of 3, in
+    * pvr_setup_transfer_eot_shaders() when we set up the corresponding EOT
+    * USC programs.
+ */ + pvr_pds_generate_pixel_event_code_segment(&program, + staging_buffer + program.data_size, + dev_info); + + result = + pvr_cmd_buffer_upload_pds(transfer_cmd->cmd_buffer, + staging_buffer, + program.data_size, + PVRX(CR_EVENT_PIXEL_PDS_DATA_ADDR_ALIGNMENT), + staging_buffer + program.data_size, + program.code_size, + PVRX(CR_EVENT_PIXEL_PDS_CODE_ADDR_ALIGNMENT), + PVRX(CR_EVENT_PIXEL_PDS_DATA_ADDR_ALIGNMENT), + &pds_upload); + vk_free(&device->vk.alloc, staging_buffer); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->event_pixel_pds_info, CR_EVENT_PIXEL_PDS_INFO, reg) { + reg.temp_stride = 0U; + reg.const_size = + DIV_ROUND_UP(program.data_size, + PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE)); + reg.usc_sr_size = + DIV_ROUND_UP(rt_count * PVR_STATE_PBE_DWORDS, + PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE)); + } + + pvr_csb_pack (®s->event_pixel_pds_data, CR_EVENT_PIXEL_PDS_DATA, reg) { + reg.addr = PVR_DEV_ADDR(pds_upload.data_offset); + } + + pvr_csb_pack (®s->event_pixel_pds_code, CR_EVENT_PIXEL_PDS_CODE, reg) { + reg.addr = PVR_DEV_ADDR(pds_upload.code_offset); + } + + return VK_SUCCESS; +} + +static VkResult pvr_pbe_setup(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_3d_state *state) +{ + struct pvr_device *const device = ctx->device; + const struct pvr_device_info *const dev_info = &device->pdevice->dev_info; + + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + uint32_t num_rts = vk_format_get_plane_count(dst->vk_format); + uint32_t pbe_setup_words[PVR_TRANSFER_MAX_RENDER_TARGETS * + ROGUE_NUM_PBESTATE_STATE_WORDS]; + struct pvr_pbe_render_params render_params; + struct pvr_pbe_surf_params surf_params; + VkResult result; + + if (state->custom_mapping.pass_count > 0U) + num_rts = state->custom_mapping.passes[state->pass_idx].clip_rects_count; + + if (PVR_HAS_FEATURE(dev_info, paired_tiles)) + state->pair_tiles = PVR_PAIRED_TILES_NONE; + + for (uint32_t i = 0U; i < num_rts; i++) { + uint64_t *pbe_regs; + uint32_t *pbe_words; + + /* Ensure the access into the pbe_wordx_mrty is made within its bounds. */ + assert(i * ROGUE_NUM_PBESTATE_REG_WORDS < + ARRAY_SIZE(state->regs.pbe_wordx_mrty)); + /* Ensure the access into pbe_setup_words is made within its bounds. 
*/ + assert(i * ROGUE_NUM_PBESTATE_STATE_WORDS < ARRAY_SIZE(pbe_setup_words)); + + pbe_regs = &state->regs.pbe_wordx_mrty[i * ROGUE_NUM_PBESTATE_REG_WORDS]; + pbe_words = &pbe_setup_words[i * ROGUE_NUM_PBESTATE_STATE_WORDS]; + + if (PVR_HAS_ERN(dev_info, 42064)) + pbe_regs[2U] = 0UL; + + if (i == 0U) { + result = pvr_pbe_setup_codegen_defaults(dev_info, + transfer_cmd, + state, + &surf_params, + &render_params); + if (result != VK_SUCCESS) + return result; + } else { + result = pvr_pbe_setup_modify_defaults(dst, + state, + i, + &surf_params, + &render_params); + if (result != VK_SUCCESS) + return result; } - src_addr = - transfer_cmd->src->vma->bo->map + transfer_cmd->src->vma->bo_offset; - dst_addr = - transfer_cmd->dst->vma->bo->map + transfer_cmd->dst->vma->bo_offset; + pvr_pbe_setup_swizzle(transfer_cmd, state, &surf_params); - for (uint32_t i = 0; i < transfer_cmd->region_count; i++) { - VkBufferCopy2 *region = &transfer_cmd->regions[i]; + pvr_pbe_pack_state(dev_info, + &surf_params, + &render_params, + pbe_words, + pbe_regs); - memcpy(dst_addr + region->dstOffset, - src_addr + region->srcOffset, - region->size); + if (PVR_HAS_ERN(dev_info, 42064)) { + uint64_t temp_reg; + + pvr_csb_pack (&temp_reg, PBESTATE_REG_WORD2, reg) { + reg.sw_bytemask = pvr_pbe_byte_mask(dev_info, transfer_cmd); + } + + pbe_regs[2U] |= temp_reg; + } + + if (PVR_HAS_FEATURE(dev_info, paired_tiles)) { + if (pbe_regs[2U] & + (1ULL << PVRX(PBESTATE_REG_WORD2_PAIR_TILES_SHIFT))) { + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_TWIDDLED) + state->pair_tiles = PVR_PAIRED_TILES_Y; + else + state->pair_tiles = PVR_PAIRED_TILES_X; + } } + } + + result = + pvr_pbe_setup_emit(transfer_cmd, ctx, state, num_rts, pbe_setup_words); + if (result != VK_SUCCESS) + return result; - if (src_mapped) - device->ws->ops->buffer_unmap(transfer_cmd->src->vma->bo); + /* Adjust tile origin and width to include all emits. */ + if (state->custom_mapping.pass_count > 0U) { + const uint32_t tile_size_x = + PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 0U); + const uint32_t tile_size_y = + PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 0U); + struct pvr_transfer_pass *pass = + &state->custom_mapping.passes[state->pass_idx]; + VkOffset2D offset = { 0U, 0U }; + VkOffset2D end = { 0U, 0U }; - if (dst_mapped) - device->ws->ops->buffer_unmap(transfer_cmd->dst->vma->bo); + for (uint32_t i = 0U; i < pass->clip_rects_count; i++) { + VkRect2D *rect = &pass->clip_rects[i]; + + offset.x = MIN2(offset.x, rect->offset.x); + offset.y = MIN2(offset.y, rect->offset.y); + end.x = MAX2(end.x, rect->offset.x + rect->extent.width); + end.y = MAX2(end.y, rect->offset.y + rect->extent.height); + } + + state->origin_x_in_tiles = (uint32_t)offset.x / tile_size_x; + state->origin_y_in_tiles = (uint32_t)offset.y / tile_size_y; + state->width_in_tiles = + DIV_ROUND_UP((uint32_t)end.x, tile_size_x) - state->origin_x_in_tiles; + state->height_in_tiles = + DIV_ROUND_UP((uint32_t)end.y, tile_size_y) - state->origin_y_in_tiles; } - /* Given we are doing CPU based copy, completion fence should always be - * signaled. This should be fixed when GPU based copy is implemented. + return VK_SUCCESS; +} + +/** + * Writes the ISP tile registers according to the MSAA state. Sets up the USC + * pixel partition allocations and the number of tiles in flight. 
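+ *
+ * For example, with 2 ISP samples per pixel and 4X MSAA
+ * (msaa_multiplier = 4) only the Y dimension of the tile grid doubles,
+ * matching the "2 SAMPLES / ISP PIXEL" row of the table below.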
+ */
+static VkResult pvr_isp_tiles(const struct pvr_device *device,
+                              struct pvr_transfer_3d_state *state)
+{
+   const struct pvr_device_runtime_info *dev_runtime_info =
+      &device->pdevice->dev_runtime_info;
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   const uint32_t isp_samples =
+      PVR_GET_FEATURE_VALUE(dev_info, isp_samples_per_pixel, 1U);
+   uint32_t origin_x = state->origin_x_in_tiles;
+   uint32_t origin_y = state->origin_y_in_tiles;
+   uint32_t width = state->width_in_tiles;
+   uint32_t height = state->height_in_tiles;
+   uint32_t isp_tiles_in_flight;
+
+   /* msaa_multiplier is calculated by sample_count & ~1U. Given the sample
+    * count is always a power of two, we can get the sample count from
+    * msaa_multiplier using the following logic.
+    */
+   const uint32_t samples = MAX2(state->msaa_multiplier, 1U);
+
+   /* The isp_samples_per_pixel feature is also known as "2x/4x for free";
+    * when this is present SAMPLES_PER_PIXEL is 2/4, otherwise 1. The
+    * following logic should end up with these numbers:
+    *
+    * |---------------------------------|
+    * |      4 SAMPLES / ISP PIXEL      |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 1  |
+    * | 4X                    | 1  | 1  |
+    * |---------------------------------|
+    * |      2 SAMPLES / ISP PIXEL      |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 1  |
+    * | 4X                    | 1  | 2  |
+    * | 8X                    | 2  | 2  |
+    * |-----------------------+----+----|
+    * |      1 SAMPLE / ISP PIXEL       |
+    * |-----------------------+----+----|
+    * | MSAA                  | X* | Y* |
+    * | 2X                    | 1  | 2  |
+    * | 4X                    | 2  | 2  |
+    * |-----------------------+----+----|
+    */
+
+   origin_x <<= (state->msaa_multiplier >> (isp_samples + 1U)) & 1U;
+   origin_y <<= ((state->msaa_multiplier >> (isp_samples + 1U)) |
+                 (state->msaa_multiplier >> isp_samples)) &
+                1U;
+   width <<= (state->msaa_multiplier >> (isp_samples + 1U)) & 1U;
+   height <<= ((state->msaa_multiplier >> (isp_samples + 1U)) |
+               (state->msaa_multiplier >> isp_samples)) &
+              1U;
+
+   if (PVR_HAS_FEATURE(dev_info, paired_tiles) &&
+       state->pair_tiles != PVR_PAIRED_TILES_NONE) {
+      width = ALIGN_POT(width, 2U);
+      height = ALIGN_POT(height, 2U);
+   }
+
+   pvr_csb_pack (&state->regs.isp_mtile_size, CR_ISP_MTILE_SIZE, reg) {
+      reg.x = width;
+      reg.y = height;
+   }
+
+   pvr_csb_pack (&state->regs.isp_render_origin, CR_ISP_RENDER_ORIGIN, reg) {
+      reg.x = origin_x;
+      reg.y = origin_y;
+   }
+
+   pvr_setup_tiles_in_flight(dev_info,
+                             dev_runtime_info,
+                             pvr_cr_isp_aa_mode_type(samples),
+                             state->usc_pixel_width,
+                             state->pair_tiles != PVR_PAIRED_TILES_NONE,
+                             0,
+                             &isp_tiles_in_flight,
+                             &state->regs.usc_pixel_output_ctrl);
+
+   pvr_csb_pack (&state->regs.isp_ctl, CR_ISP_CTL, reg) {
+      reg.process_empty_tiles = true;
+
+      if (PVR_HAS_FEATURE(dev_info, paired_tiles)) {
+         if (state->pair_tiles == PVR_PAIRED_TILES_X) {
+            reg.pair_tiles = true;
+         } else if (state->pair_tiles == PVR_PAIRED_TILES_Y) {
+            reg.pair_tiles = true;
+            reg.pair_tiles_vert = true;
+         }
+      }
+   }
+
+   state->regs.isp_ctl |= isp_tiles_in_flight;
+
+   return VK_SUCCESS;
+}
+
+static bool
+pvr_int_pbe_pixel_changes_dst_rate(const struct pvr_device_info *dev_info,
+                                   enum pvr_transfer_pbe_pixel_src pbe_format)
+{
+   /* We don't emulate rate change from the USC with the pbe_yuv feature.
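+    * When the rate change has to be emulated from the USC instead,
+    * pvr_uv_space() below promotes such layers to
+    * PVR_INT_COORD_SET_FLOATS_6.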
*/ + if (!PVR_HAS_FEATURE(dev_info, pbe_yuv) && + (pbe_format == PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED || + pbe_format == PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V)) { + return true; + } + + return false; +} + +/** + * Number of DWORDs from the unified store that floating texture coefficients + * take up. + */ +static void pvr_uv_space(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + const struct pvr_transfer_cmd_surface *dst = &transfer_cmd->dst; + const VkRect2D *dst_rect = &transfer_cmd->scissor; + + /* This also avoids division by 0 in pvr_dma_texture_floats(). */ + if (state->custom_mapping.pass_count == 0U && + (dst_rect->extent.width == 0U || dst_rect->extent.height == 0U || + MAX2(dst_rect->offset.x, dst_rect->offset.x + dst_rect->extent.width) < + 0U || + MIN2(dst_rect->offset.x, dst_rect->offset.x + dst_rect->extent.width) > + (int32_t)dst->width || + MAX2(dst_rect->offset.y, dst_rect->offset.y + dst_rect->extent.height) < + 0U || + MIN2(dst_rect->offset.y, dst_rect->offset.y + dst_rect->extent.height) > + (int32_t)dst->height)) { + state->empty_dst = true; + } else { + state->empty_dst = false; + + if (transfer_cmd->src_present) { + struct pvr_tq_layer_properties *layer = + &state->shader_props.layer_props; + + const VkRect2D *src_rect = &transfer_cmd->mappings[0U].src_rect; + const VkRect2D *dst_rect = &transfer_cmd->mappings[0U].dst_rect; + int32_t dst_x1 = dst_rect->offset.x + dst_rect->extent.width; + int32_t dst_y1 = dst_rect->offset.y + dst_rect->extent.height; + int32_t src_x1 = src_rect->offset.x + src_rect->extent.width; + int32_t src_y1 = src_rect->offset.y + src_rect->extent.height; + + if (state->filter > PVR_FILTER_POINT) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_4; + } else if (src_rect->extent.width == 0U || + src_rect->extent.height == 0U) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_0; + } else if ((src_rect->offset.x * dst_x1 != + src_x1 * dst_rect->offset.x) || + (src_rect->offset.y * dst_y1 != + src_y1 * dst_rect->offset.y) || + (src_rect->extent.width != dst_rect->extent.width) || + (src_rect->extent.height != dst_rect->extent.height)) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_4; + } else { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_0; + } + + /* We have to adjust the rate. 
*/ + if (layer->layer_floats != PVR_INT_COORD_SET_FLOATS_0 && + pvr_int_pbe_pixel_changes_dst_rate(dev_info, layer->pbe_format)) { + layer->layer_floats = PVR_INT_COORD_SET_FLOATS_6; + } + } + } +} + +static uint32_t pvr_int_pbe_pixel_num_sampler_and_image_states( + enum pvr_transfer_pbe_pixel_src pbe_format, + uint32_t alpha_type) +{ + switch (pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED: + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V: + return 1U; + default: + return pvr_pbe_pixel_num_loads(pbe_format, alpha_type); + } +} + +static VkResult pvr_sampler_state_for_surface( + const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *surface, + enum pvr_filter filter, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t sampler, + uint32_t *mem_ptr) +{ + uint64_t sampler_state[2U] = { 0UL, 0UL }; + + pvr_csb_pack (&sampler_state[0U], TEXSTATE_SAMPLER, reg) { + reg.anisoctl = PVRX(TEXSTATE_ANISOCTL_DISABLED); + reg.minlod = PVRX(TEXSTATE_CLAMP_MIN); + reg.maxlod = PVRX(TEXSTATE_CLAMP_MIN); + reg.dadjust = PVRX(TEXSTATE_DADJUST_MIN_UINT); + + if (filter == PVR_FILTER_DONTCARE || filter == PVR_FILTER_POINT) { + reg.minfilter = PVRX(TEXSTATE_FILTER_POINT); + reg.magfilter = PVRX(TEXSTATE_FILTER_POINT); + } else if (filter == PVR_FILTER_LINEAR) { + reg.minfilter = PVRX(TEXSTATE_FILTER_LINEAR); + reg.magfilter = PVRX(TEXSTATE_FILTER_LINEAR); + } else { + assert(PVR_HAS_FEATURE(dev_info, tf_bicubic_filter)); + reg.minfilter = PVRX(TEXSTATE_FILTER_BICUBIC); + reg.magfilter = PVRX(TEXSTATE_FILTER_BICUBIC); + } + + reg.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + reg.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + + if (surface->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + reg.addrmode_w = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + } + + assert(sampler < PVR_TRANSFER_MAX_IMAGES); + + assert(sampler <= sh_reg_layout->combined_image_samplers.count); + mem_ptr += sh_reg_layout->combined_image_samplers.offsets[sampler].sampler; + + memcpy(mem_ptr, sampler_state, sizeof(sampler_state)); + + return VK_SUCCESS; +} + +static inline VkResult pvr_image_state_set_codegen_defaults( + struct pvr_device *device, + struct pvr_transfer_3d_state *state, + const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + uint64_t *mem_ptr) +{ + struct pvr_tq_layer_properties *layer = &state->shader_props.layer_props; + struct pvr_texture_state_info info = { 0U }; + VkResult result; + + switch (surface->vk_format) { + /* ERN 46863 */ + case VK_FORMAT_D32_SFLOAT_S8_UINT: + switch (layer->pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW32: + case PVR_TRANSFER_PBE_PIXEL_SRC_RAW64: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_CONV_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + info.format = VK_FORMAT_R32G32_UINT; + break; + default: + break; + } + break; + + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + info.format = VK_FORMAT_R32_UINT; + break; + + default: + info.format = surface->vk_format; + break; + } + + info.flags = 0U; + info.base_level = 0U; + info.mip_levels = 1U; + info.mipmaps_present = false; + info.sample_count = MAX2(surface->sample_count, 1U); + + if (surface->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + info.extent.depth = surface->depth; + else + info.extent.depth = 0U; + + if (PVR_HAS_FEATURE(&device->pdevice->dev_info, tpu_array_textures)) + info.array_size = 
0U; + + result = pvr_mem_layout_spec(surface, + load, + true, + &info.extent.width, + &info.extent.height, + &info.stride, + &info.mem_layout, + &info.addr); + if (result != VK_SUCCESS) + return result; + + if (state->custom_mapping.texel_extend_dst > 1U) { + info.extent.width /= state->custom_mapping.texel_extend_dst; + info.stride /= state->custom_mapping.texel_extend_dst; + } + + info.tex_state_type = PVR_TEXTURE_STATE_SAMPLE; + memcpy(info.swizzle, + pvr_get_format_swizzle(info.format), + sizeof(info.swizzle)); + + if (info.extent.depth > 0U) + info.type = VK_IMAGE_VIEW_TYPE_3D; + else if (info.extent.height > 1U) + info.type = VK_IMAGE_VIEW_TYPE_2D; + else + info.type = VK_IMAGE_VIEW_TYPE_1D; + + result = pvr_pack_tex_state(device, &info, mem_ptr); + if (result != VK_SUCCESS) + return result; + + return VK_SUCCESS; +} + +static VkResult pvr_image_state_for_surface( + const struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct pvr_transfer_cmd_surface *surface, + uint32_t load, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state, + uint32_t uf_image, + uint32_t *mem_ptr) +{ + uint32_t tex_state[ROGUE_MAXIMUM_IMAGE_STATE_SIZE] = { 0U }; + VkResult result; + uint8_t offset; + + result = pvr_image_state_set_codegen_defaults(ctx->device, + state, + surface, + load, + (uint64_t *)tex_state); + if (result != VK_SUCCESS) + return result; + + assert(uf_image < PVR_TRANSFER_MAX_IMAGES); + + /* Offset of the shared registers containing the hardware image state. */ + assert(uf_image < sh_reg_layout->combined_image_samplers.count); + offset = sh_reg_layout->combined_image_samplers.offsets[uf_image].image; + + /* Copy the image state to the buffer which is loaded into the shared + * registers. + */ + memcpy(mem_ptr + offset, tex_state, sizeof(tex_state)); + + return VK_SUCCESS; +} + +/* Writes the texture state/sampler state into DMAed memory. 
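+ * The shader register layout supplies the shared-register slot of each
+ * combined image/sampler, so every state block lands at its own offset.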
*/ +static VkResult +pvr_sampler_image_state(struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state, + uint32_t *mem_ptr) +{ + if (!state->empty_dst) { + struct pvr_tq_layer_properties *layer = + &state->shader_props.layer_props; + uint32_t max_load = + pvr_pbe_pixel_num_loads(layer->pbe_format, + state->shader_props.alpha_type); + uint32_t uf_sampler = 0U; + uint32_t uf_image = 0U; + + for (uint32_t load = 0U; load < max_load; load++) { + const struct pvr_transfer_cmd_surface *surface; + enum pvr_filter filter; + VkResult result; + + switch (layer->pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32S8_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D32_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + case PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8: + if (load > 0U) { + surface = &transfer_cmd->dst; + + if (state->shader_props.alpha_type != PVR_ALPHA_NONE) + filter = PVR_FILTER_POINT; + else + filter = transfer_cmd->filter; + } else { + surface = &transfer_cmd->src; + filter = state->filter; + } + break; + + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_UV_INTERLEAVED: + case PVR_TRANSFER_PBE_PIXEL_SRC_Y_U_V: + surface = &transfer_cmd->src; + filter = state->filter; + break; + + default: + surface = &transfer_cmd->src; + filter = state->filter; + break; + } + + if (load < pvr_int_pbe_pixel_num_sampler_and_image_states( + layer->pbe_format, + state->shader_props.alpha_type)) { + const struct pvr_device_info *dev_info = + &transfer_cmd->cmd_buffer->device->pdevice->dev_info; + + result = pvr_sampler_state_for_surface(dev_info, + surface, + filter, + sh_reg_layout, + uf_sampler, + mem_ptr); + if (result != VK_SUCCESS) + return result; + + uf_sampler++; + + result = pvr_image_state_for_surface(ctx, + transfer_cmd, + surface, + load, + sh_reg_layout, + state, + uf_image, + mem_ptr); + if (result != VK_SUCCESS) + return result; + + uf_image++; + } + } + } + + return VK_SUCCESS; +} + +/* The returned offset is in dwords. */ +static inline uint32_t pvr_dynamic_const_reg_advance( + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + struct pvr_transfer_3d_state *state) +{ + const uint32_t offset = sh_reg_layout->dynamic_consts.offset; + + assert(state->dynamic_const_reg_ptr < sh_reg_layout->dynamic_consts.count); + + return offset + state->dynamic_const_reg_ptr++; +} + +static inline void +pvr_dma_global_alpha(const struct pvr_transfer_alpha *alpha, + struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) +{ + float global = (float)alpha->global / 255.0f; + + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = fui(global); +} + +/** Scales coefficients for sampling. (non normalized). 
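+ * For each axis the shader is given a scale and an offset such that
+ * u(x) = x * (src_w / dst_w) + (src_x0 * dst_w - src_w * dst_x0) / dst_w,
+ * which maps dst_x0 onto src_x0.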
*/ +static inline void +pvr_dma_texture_floats(const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) + +{ + if (transfer_cmd->src_present) { + struct pvr_tq_layer_properties *layer = &state->shader_props.layer_props; + const struct pvr_rect_mapping *mapping = &transfer_cmd->mappings[0U]; + VkRect2D src_rect = mapping->src_rect; + VkRect2D dst_rect = mapping->dst_rect; + + switch (layer->layer_floats) { + case PVR_INT_COORD_SET_FLOATS_0: + break; + + case PVR_INT_COORD_SET_FLOATS_6: + case PVR_INT_COORD_SET_FLOATS_4: { + int32_t consts[2U] = { 0U, 0U }; + int32_t denom[2U] = { 0U, 0U }; + int32_t nums[2U] = { 0U, 0U }; + int32_t src_x, dst_x; + int32_t src_y, dst_y; + float offset = 0.0f; + float tmp; + + dst_x = dst_rect.extent.width; + dst_y = dst_rect.extent.height; + src_x = src_rect.extent.width; + src_y = src_rect.extent.height; + + nums[0U] = src_x; + denom[0U] = dst_x; + consts[0U] = src_rect.offset.x * dst_x - src_x * dst_rect.offset.x; + nums[1U] = src_y; + denom[1U] = dst_y; + consts[1U] = src_rect.offset.y * dst_y - src_y * dst_rect.offset.y; + + for (uint32_t i = 0U; i < 2U; i++) { + tmp = (float)(nums[i]) / (float)(denom[i]); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + + tmp = ((float)(consts[i]) + (i == 1U ? offset : 0.0f)) / + (float)(denom[i]); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + } + + if (layer->layer_floats == PVR_INT_COORD_SET_FLOATS_6) { + tmp = (float)MIN2(dst_rect.offset.x, dst_rect.offset.x + dst_x); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + + tmp = (float)MIN2(dst_rect.offset.y, dst_rect.offset.y + dst_y); + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(tmp); + } + break; + } + + default: + unreachable("Unknown COORD_SET_FLOATS."); + break; + } + } +} + +static bool pvr_int_pbe_pixel_requires_usc_filter( + const struct pvr_device_info *dev_info, + enum pvr_transfer_pbe_pixel_src pixel_format) +{ + switch (pixel_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_SMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_DMRG_D24S8_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_U16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_S16NORM: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + return true; + case PVR_TRANSFER_PBE_PIXEL_SRC_F16F16: + return !PVR_HAS_FEATURE(dev_info, pbe_filterable_f16); + default: + return false; + } +} + +/** + * Sets up the MSAA related bits in the operation + * + * TPU sample count is read directly from transfer_cmd in the TPU code. An MSAA + * src can be read from sample rate or instance rate shaders as long as the + * sample count is set on the TPU. If a layer is single sample we expect the + * same sample replicated in full rate shaders. If the layer is multi sample, + * instance rate shaders are used to emulate the filter or to select the + * specified sample. The sample number is static in the programs. 
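+ * E.g. selecting sample 2 of an 8X source (resolve_op ==
+ * PVR_RESOLVE_SAMPLE0 + 2) runs an instance-rate shader that fetches just
+ * that one sample.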
+ */ +static VkResult pvr_msaa_state(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + struct pvr_tq_shader_properties *shader_props = &state->shader_props; + struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + uint32_t src_sample_count = transfer_cmd->src.sample_count & ~1U; + uint32_t dst_sample_count = transfer_cmd->dst.sample_count & ~1U; + uint32_t bsample_count = 0U; + + shader_props->full_rate = false; + state->msaa_multiplier = 1U; + state->down_scale = false; + + /* clang-format off */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg); + /* clang-format on */ + + layer->sample_count = 1U; + layer->resolve_op = PVR_RESOLVE_BLEND; + + bsample_count |= src_sample_count | dst_sample_count; + + if (bsample_count > PVR_GET_FEATURE_VALUE(dev_info, max_multisample, 0U)) + return vk_error(transfer_cmd->cmd_buffer, VK_ERROR_FORMAT_NOT_SUPPORTED); + + /* Shouldn't get two distinct bits set (implies different sample counts). + * The reason being the rate at which the shader runs has to match. + */ + if ((bsample_count & (bsample_count - 1U)) != 0U) + return vk_error(transfer_cmd->cmd_buffer, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (src_sample_count == 0U && dst_sample_count == 0U) { + /* S -> S (no MSAA involved). */ + layer->msaa = false; + } else if (src_sample_count != 0U && dst_sample_count == 0U) { + /* M -> S (resolve). */ + layer->resolve_op = transfer_cmd->resolve_op; + + if ((uint32_t)layer->resolve_op >= + (src_sample_count + (uint32_t)PVR_RESOLVE_SAMPLE0)) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + layer->msaa = true; + + switch (layer->resolve_op) { + case PVR_RESOLVE_MIN: + case PVR_RESOLVE_MAX: + switch (transfer_cmd->src.vk_format) { + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_S8_UINT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + if (transfer_cmd->src.vk_format != transfer_cmd->dst.vk_format) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + break; + + default: + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + /* Instance rate. */ + layer->sample_count = src_sample_count; + state->shader_props.full_rate = false; + break; + + case PVR_RESOLVE_BLEND: + if (pvr_int_pbe_pixel_requires_usc_filter(dev_info, + layer->pbe_format)) { + /* Instance rate. */ + layer->sample_count = src_sample_count; + state->shader_props.full_rate = false; + } else { + /* Sample rate. */ + state->shader_props.full_rate = true; + state->msaa_multiplier = src_sample_count; + state->down_scale = true; + + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(src_sample_count); + } + } + break; + + default: + /* Shader doesn't have to know the number of samples. It's enough + * if the TPU knows, and the shader sets the right sno (given to the + * shader in resolve_op). 
+ */ + state->shader_props.full_rate = false; + break; + } + } else { + state->msaa_multiplier = dst_sample_count; + + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(dst_sample_count); + } + + if (src_sample_count == 0U && dst_sample_count != 0U) { + /* S -> M (replicate samples) */ + layer->msaa = false; + state->shader_props.full_rate = !state->shader_props.iterated; + } else { + /* M -> M (sample to sample) */ + layer->msaa = true; + state->shader_props.full_rate = true; + } + } + + return VK_SUCCESS; +} + +static bool pvr_requires_usc_linear_filter(VkFormat format) +{ + switch (format) { + case VK_FORMAT_R32_SFLOAT: + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_X8_D24_UNORM_PACK32: + return true; + default: + return false; + } +} + +static inline bool +pvr_int_pbe_usc_linear_filter(enum pvr_transfer_pbe_pixel_src pbe_format, + bool sample, + bool msaa, + bool full_rate) +{ + if (sample || msaa || full_rate) + return false; + + switch (pbe_format) { + case PVR_TRANSFER_PBE_PIXEL_SRC_D24S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_S8D24: + case PVR_TRANSFER_PBE_PIXEL_SRC_D32S8: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X2: + case PVR_TRANSFER_PBE_PIXEL_SRC_F32X4: + return true; + default: + return false; + } +} + +static inline bool pvr_pick_component_needed( + const struct pvr_transfer_custom_mapping *custom_mapping) +{ + return custom_mapping->pass_count > 0U && + custom_mapping->texel_extend_dst > 1U && + custom_mapping->texel_extend_src <= 1U; +} + +/** Writes the shader related constants into the DMA space. */ +static void +pvr_write_usc_constants(const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *dma_space) +{ + const uint32_t reg = sh_reg_layout->driver_total; + const uint32_t consts_count = + sh_reg_layout->compiler_out.usc_constants.count; + + /* If not we likely need to write more consts. */ + assert(consts_count == sh_reg_layout->compiler_out_total); + + /* Append the usc consts after the driver allocated regs. */ + for (uint32_t i = 0U; i < consts_count; i++) + dma_space[reg + i] = sh_reg_layout->compiler_out.usc_constants.values[i]; +} + +static inline void +pvr_dma_texel_unwind(struct pvr_transfer_3d_state *state, + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout, + uint32_t *mem_ptr) + +{ + const uint32_t coord_sample_mask = + state->custom_mapping.texel_extend_dst - 1U; + + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + coord_sample_mask; + mem_ptr[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + state->custom_mapping.texel_unwind_dst; +} + +/** Writes the Uniform/Texture state data segments + the UniTex code. */ +static inline VkResult +pvr_pds_unitex(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_pds_pixel_shader_sa_program *program, + struct pvr_transfer_prep_data *prep_data) +{ + struct pvr_pds_upload *unitex_code = + &ctx->pds_unitex_code[program->num_texture_dma_kicks] + [program->num_uniform_dma_kicks]; + struct pvr_transfer_3d_state *state = &prep_data->state; + struct pvr_bo *pvr_bo; + VkResult result; + + /* Uniform program is not used. 
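+    * Only the texture state DMA kick is generated here; the assert below
+    * guards that assumption.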
*/ + assert(program->num_uniform_dma_kicks == 0U); + + if (program->num_texture_dma_kicks == 0U) { + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + state->tex_state_data_offset = 0U; + state->uni_tex_code_offset = 0U; + + return VK_SUCCESS; + } + + pvr_pds_set_sizes_pixel_shader_sa_uniform_data(program, dev_info); + assert(program->data_size == 0U); + state->uniform_data_size = 0U; + + pvr_pds_set_sizes_pixel_shader_sa_texture_data(program, dev_info); + state->tex_state_data_size = + ALIGN_POT(program->data_size, + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE)); + + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.pds_heap, + state->tex_state_data_size << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + state->tex_state_data_offset = + pvr_bo->vma->dev_addr.addr - ctx->device->heaps.pds_heap->base_addr.addr; + + pvr_pds_generate_pixel_shader_sa_texture_state_data(program, + pvr_bo->bo->map, + dev_info); + + pvr_bo_cpu_unmap(transfer_cmd->cmd_buffer->device, pvr_bo); + + /* Save the dev_addr and size in the 3D state. */ + state->uni_tex_code_offset = unitex_code->code_offset; + state->pds_temps = program->temps_used; + + return VK_SUCCESS; +} + +/** Converts a float in range 0 to 1 to an N-bit fixed-point integer. */ +static uint32_t pvr_float_to_ufixed(float value, uint32_t bits) +{ + uint32_t max = (1U << bits) - 1U; + + /* NaN and Inf and overflow. */ + if (util_is_inf_or_nan(value) || value >= 1.0f) + return max; + else if (value < 0.0f) + return 0U; + + /* Normalise. */ + value = value * (float)max; + + /* Cast to double so that we can accurately represent the sum for N > 23. */ + return (uint32_t)floor((double)value + 0.5f); +} + +/** Converts a float in range -1 to 1 to a signed N-bit fixed-point integer. */ +static uint32_t pvr_float_to_sfixed(float value, uint32_t N) +{ + int32_t max = (1 << (N - 1)) - 1; + int32_t min = 0 - (1 << (N - 1)); + union fi x; + + /* NaN and Inf and overflow. */ + if (util_is_inf_or_nan(value) || value >= 1.0f) + return (uint32_t)max; + else if (value == 0.0f) + return 0U; + else if (value <= -1.0f) + return (uint32_t)min; + + /* Normalise. */ + value *= (float)max; + + /* Cast to double so that we can accurately represent the sum for N > 23. */ + if (value > 0.0f) + x.i = (int32_t)floor((double)value + 0.5f); + else + x.i = (int32_t)floor((double)value - 0.5f); + + return x.ui; +} + +/** Convert a value in IEEE single precision format to 16-bit floating point + * format. + */ +/* TODO: See if we can use _mesa_float_to_float16_rtz_slow() instead. */ +static uint16_t pvr_float_to_f16(float value, bool round_to_even) +{ + uint32_t input_value; + uint32_t exponent; + uint32_t mantissa; + uint16_t output; + + /* 0.0f can be exactly expressed in binary using IEEE float format. */ + if (value == 0.0f) + return 0U; + + if (value < 0U) { + output = 0x8000; + value = -value; + } else { + output = 0U; + } + + /* 2^16 * (2 - 1/1024) = highest f16 representable value. */ + value = MIN2(value, 131008); + input_value = fui(value); + + /* Extract the exponent and mantissa. */ + exponent = util_get_float32_exponent(value) + 15; + mantissa = input_value & ((1 << 23) - 1); + + /* If the exponent is outside the supported range then denormalise the + * mantissa. 
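+    * E.g. 2^-20 has biased f32 exponent 107, giving shift = -14 + 127 -
+    * 107 = 6; the implicit-one mantissa 1 << 23 becomes 1 << 17, which
+    * packs to the f16 denormal 16 * 2^-24 == 2^-20.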
+ */ + if ((int32_t)exponent <= 0) { + uint32_t shift; + + mantissa |= (1 << 23); + exponent = input_value >> 23; + shift = -14 + 127 - exponent; + + if (shift < 24) + mantissa >>= shift; + else + mantissa = 0; + } else { + output = (uint16_t)(output | ((exponent << 10) & 0x7C00)); + } + + output = (uint16_t)(output | (((mantissa >> 13) << 0) & 0x03FF)); + + if (round_to_even) { + /* Round to nearest even. */ + if ((((int)value) % 2 != 0) && (((1 << 13) - 1) & mantissa)) + output++; + } else { + /* Round to nearest. */ + if (mantissa & (1 << 12)) + output++; + } + + return output; +} + +static VkResult pvr_pack_clear_color(VkFormat format, + const union fi color[static 4], + uint32_t pkd_color[static 4]) +{ + const uint32_t red_width = + vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0U); + uint32_t pbe_pack_mode = pvr_get_pbe_packmode(format); + const bool pbe_norm = vk_format_is_normalized(format); + + if (pbe_pack_mode == PVRX(PBESTATE_PACKMODE_INVALID)) + return vk_error(NULL, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || + format == VK_FORMAT_A2R10G10B10_UINT_PACK32) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_R10B10G10A2); + } else if (format == VK_FORMAT_B8G8R8A8_UNORM || + format == VK_FORMAT_R8G8B8A8_UNORM) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_U8U8U8U8); + } else if (format != VK_FORMAT_D16_UNORM && + format != VK_FORMAT_X8_D24_UNORM_PACK32 && red_width <= 8U && + vk_format_is_normalized(format)) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_F16F16F16F16); + } else if (vk_format_is_srgb(format)) { + pbe_pack_mode = PVRX(PBESTATE_PACKMODE_F16F16F16F16); + } + + /* Set packed color based on PBE pack mode and PBE norm. */ + switch (pbe_pack_mode) { + case PVRX(PBESTATE_PACKMODE_U8U8U8U8): + case PVRX(PBESTATE_PACKMODE_A1R5G5B5): + case PVRX(PBESTATE_PACKMODE_R5G5B5A1): + case PVRX(PBESTATE_PACKMODE_A4R4G4B4): + case PVRX(PBESTATE_PACKMODE_A8R3G3B2): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_ufixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_S8S8S8S8): + case PVRX(PBESTATE_PACKMODE_X8U8S8S8): + case PVRX(PBESTATE_PACKMODE_X8S8S8U8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = pvr_float_to_ufixed(color[2].f, 16) & 0xFFFFU; + pkd_color[1] |= (pvr_float_to_ufixed(color[3].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + pkd_color[1] |= 
(color[3].ui & 0xFFFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S16S16S16S16):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU;
+         pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16;
+         pkd_color[1] = (pvr_float_to_sfixed(color[2].f, 16) & 0xFFFFU);
+         pkd_color[1] |= (pvr_float_to_sfixed(color[3].f, 16) & 0xFFFFU) << 16;
+      } else {
+         pkd_color[0] = color[0].ui & 0xFFFFU;
+         pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16;
+         pkd_color[1] = color[2].ui & 0xFFFFU;
+         pkd_color[1] |= (color[3].ui & 0xFFFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_A2_XRBIAS_U10U10U10):
+   case PVRX(PBESTATE_PACKMODE_ARGBV16_XR10):
+   case PVRX(PBESTATE_PACKMODE_F16F16F16F16):
+   case PVRX(PBESTATE_PACKMODE_A2R10B10G10):
+      if (red_width > 0) {
+         pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, false);
+         pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, false) << 16;
+         pkd_color[1] = (uint32_t)pvr_float_to_f16(color[2].f, false);
+         pkd_color[1] |= (uint32_t)pvr_float_to_f16(color[3].f, false) << 16;
+      } else {
+         /* Swizzle only uses first channel for alpha formats. */
+         pkd_color[0] = (uint32_t)pvr_float_to_f16(color[3].f, false);
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_U32U32U32U32):
+      pkd_color[0] = color[0].ui;
+      pkd_color[1] = color[1].ui;
+      pkd_color[2] = color[2].ui;
+      pkd_color[3] = color[3].ui;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S32S32S32S32):
+      pkd_color[0] = (uint32_t)color[0].i;
+      pkd_color[1] = (uint32_t)color[1].i;
+      pkd_color[2] = (uint32_t)color[2].i;
+      pkd_color[3] = (uint32_t)color[3].i;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_F32F32F32F32):
+      memcpy(pkd_color, &color[0].f, 4U * sizeof(float));
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_R10B10G10A2):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_ufixed(color[0].f, 10) & 0x3FFU;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 10) & 0x3FFU) << 10;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 10) & 0x3FFU) << 20;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[3].f, 2) & 0x3U) << 30;
+      } else if (format == VK_FORMAT_A2R10G10B10_UINT_PACK32) {
+         pkd_color[0] = color[2].ui & 0x3FFU;
+         pkd_color[0] |= (color[1].ui & 0x3FFU) << 10;
+         pkd_color[0] |= (color[0].ui & 0x3FFU) << 20;
+         pkd_color[0] |= (color[3].ui & 0x3U) << 30;
+      } else {
+         pkd_color[0] = color[0].ui & 0x3FFU;
+         pkd_color[0] |= (color[1].ui & 0x3FFU) << 10;
+         pkd_color[0] |= (color[2].ui & 0x3FFU) << 20;
+         pkd_color[0] |= (color[3].ui & 0x3U) << 30;
+      }
+
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_A2F10F10F10):
+   case PVRX(PBESTATE_PACKMODE_F10F10F10A2):
+      pkd_color[0] = pvr_float_to_sfixed(color[0].f, 10) & 0x3FFU;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 10) & 0x3FFU) << 10;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 10) & 0x3FFU) << 20;
+      pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 2) & 0x3U) << 30;
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_U8U8U8):
+   case PVRX(PBESTATE_PACKMODE_R5G6B5):
+   case PVRX(PBESTATE_PACKMODE_R5SG5SB6):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8;
+         pkd_color[0] |= (pvr_float_to_ufixed(color[2].f, 8) & 0xFFU) << 16;
+      } else {
+         pkd_color[0] = color[0].ui & 0xFFU;
+         pkd_color[0] |= (color[1].ui & 0xFFU) << 8;
+         pkd_color[0] |= (color[2].ui & 0xFFU) << 16;
+      }
+      break;
+
+   case PVRX(PBESTATE_PACKMODE_S8S8S8):
+   case PVRX(PBESTATE_PACKMODE_B6G5SR5S):
+      if (pbe_norm) {
+         pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU;
+         pkd_color[0] |=
(pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = (pvr_float_to_ufixed(color[2].f, 16) & 0xFFFFU); + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + } + break; + + case PVRX(PBESTATE_PACKMODE_S16S16S16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16; + pkd_color[1] = pvr_float_to_sfixed(color[2].f, 16) & 0xFFFFU; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + pkd_color[1] = color[2].ui & 0xFFFFU; + } + break; + + case PVRX(PBESTATE_PACKMODE_F16F16F16): + case PVRX(PBESTATE_PACKMODE_F11F11F10): + case PVRX(PBESTATE_PACKMODE_F10F11F11): + case PVRX(PBESTATE_PACKMODE_SE9995): + pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, true); + pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, true) << 16; + pkd_color[1] = (uint32_t)pvr_float_to_f16(color[2].f, true); + break; + + case PVRX(PBESTATE_PACKMODE_U32U32U32): + pkd_color[0] = color[0].ui; + pkd_color[1] = color[1].ui; + pkd_color[2] = color[2].ui; + break; + + case PVRX(PBESTATE_PACKMODE_S32S32S32): + pkd_color[0] = (uint32_t)color[0].i; + pkd_color[1] = (uint32_t)color[1].i; + pkd_color[2] = (uint32_t)color[2].i; + break; + + case PVRX(PBESTATE_PACKMODE_X24G8X32): + case PVRX(PBESTATE_PACKMODE_U8X24): + pkd_color[1] = (color[1].ui & 0xFFU) << 24; + break; + + case PVRX(PBESTATE_PACKMODE_F32F32F32): + memcpy(pkd_color, &color[0].f, 3U * sizeof(float)); + break; + + case PVRX(PBESTATE_PACKMODE_U8U8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 8) & 0xFFU) << 8; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + } + break; + + case PVRX(PBESTATE_PACKMODE_S8S8): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 8) & 0xFFU) << 8; + pkd_color[0] |= (pvr_float_to_sfixed(color[2].f, 8) & 0xFFU) << 16; + pkd_color[0] |= (pvr_float_to_sfixed(color[3].f, 8) & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui & 0xFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 8; + pkd_color[0] |= (color[2].ui & 0xFFU) << 16; + pkd_color[0] |= (color[3].ui & 0xFFU) << 24; + } + break; + + case PVRX(PBESTATE_PACKMODE_U16U16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_ufixed(color[1].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_S16S16): + if (pbe_norm) { + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + pkd_color[0] |= (pvr_float_to_sfixed(color[1].f, 16) & 0xFFFFU) << 16; + } else { + pkd_color[0] = color[0].ui & 0xFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFFFU) << 16; + } + break; + + case PVRX(PBESTATE_PACKMODE_F16F16): + pkd_color[0] = 
(uint32_t)pvr_float_to_f16(color[0].f, true); + pkd_color[0] |= (uint32_t)pvr_float_to_f16(color[1].f, true) << 16; + break; + + case PVRX(PBESTATE_PACKMODE_U32U32): + pkd_color[0] = color[0].ui; + pkd_color[1] = color[1].ui; + break; + + case PVRX(PBESTATE_PACKMODE_S32S32): + pkd_color[0] = (uint32_t)color[0].i; + pkd_color[1] = (uint32_t)color[1].i; + break; + + case PVRX(PBESTATE_PACKMODE_X24U8F32): + case PVRX(PBESTATE_PACKMODE_X24X8F32): + memcpy(pkd_color, &color[0].f, 1U * sizeof(float)); + pkd_color[1] = color[1].ui & 0xFFU; + break; + + case PVRX(PBESTATE_PACKMODE_F32F32): + memcpy(pkd_color, &color[0].f, 2U * sizeof(float)); + break; + + case PVRX(PBESTATE_PACKMODE_ST8U24): + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + pkd_color[0] |= color[1].ui << 24; + break; + + case PVRX(PBESTATE_PACKMODE_U8): + if (format == VK_FORMAT_S8_UINT) + pkd_color[0] = color[1].ui & 0xFFU; + else if (pbe_norm) + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 8) & 0xFFU; + else + pkd_color[0] = color[0].ui & 0xFFU; + + break; + + case PVRX(PBESTATE_PACKMODE_S8): + if (pbe_norm) + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 8) & 0xFFU; + else + pkd_color[0] = color[0].ui & 0xFFU; + break; + + case PVRX(PBESTATE_PACKMODE_U16): + if (pbe_norm) + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 16) & 0xFFFFU; + else + pkd_color[0] = color[0].ui & 0xFFFFU; + break; + + case PVRX(PBESTATE_PACKMODE_S16): + if (pbe_norm) + pkd_color[0] = pvr_float_to_sfixed(color[0].f, 16) & 0xFFFFU; + else + pkd_color[0] = color[0].ui & 0xFFFFU; + break; + + case PVRX(PBESTATE_PACKMODE_F16): + pkd_color[0] = (uint32_t)pvr_float_to_f16(color[0].f, true); + break; + + /* U32 */ + case PVRX(PBESTATE_PACKMODE_U32): + if (format == VK_FORMAT_X8_D24_UNORM_PACK32) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + } else if (format == VK_FORMAT_D24_UNORM_S8_UINT) { + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + pkd_color[0] |= (color[1].ui & 0xFFU) << 24; + } else { + pkd_color[0] = color[0].ui; + } + break; + + /* U24ST8 */ + case PVRX(PBESTATE_PACKMODE_U24ST8): + pkd_color[1] = (color[1].ui & 0xFFU) << 24; + pkd_color[1] |= pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + break; + + /* S32 */ + case PVRX(PBESTATE_PACKMODE_S32): + pkd_color[0] = (uint32_t)color[0].i; + break; + + /* F32 */ + case PVRX(PBESTATE_PACKMODE_F32): + memcpy(pkd_color, &color[0].f, sizeof(float)); + break; + + /* X8U24 */ + case PVRX(PBESTATE_PACKMODE_X8U24): + pkd_color[0] = pvr_float_to_ufixed(color[0].f, 24) & 0xFFFFFFU; + break; + + default: + break; + } + + return VK_SUCCESS; +} + +static VkResult pvr_3d_copy_blit_core(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + struct pvr_transfer_3d_state *const state = &prep_data->state; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + struct pvr_device *const device = ctx->device; + const struct pvr_device_info *const dev_info = &device->pdevice->dev_info; + + VkResult result; + + *finished_out = true; + + state->common_ptr = 0U; + state->dynamic_const_reg_ptr = 0U; + state->usc_const_reg_ptr = 0U; + + if ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U) { + uint32_t packed_color[4U] = { 0U }; + + if (vk_format_is_compressed(transfer_cmd->dst.vk_format)) + return vk_error(device, VK_ERROR_FORMAT_NOT_SUPPORTED); + + /* No shader. 
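+       * The packed clear colour goes out through the USC clear registers,
+       * so no fragment program or texture state is needed.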
*/ + state->pds_temps = 0U; + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + + /* No background enabled. */ + /* clang-format off */ + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg); + /* clang-format on */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(transfer_cmd->dst.sample_count); + } + + result = pvr_pack_clear_color(transfer_cmd->dst.vk_format, + transfer_cmd->clear_color, + packed_color); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->usc_clear_register0, CR_USC_CLEAR_REGISTER0, reg) { + reg.val = packed_color[0U]; + } + + pvr_csb_pack (®s->usc_clear_register1, CR_USC_CLEAR_REGISTER1, reg) { + reg.val = packed_color[1U]; + } + + pvr_csb_pack (®s->usc_clear_register2, CR_USC_CLEAR_REGISTER2, reg) { + reg.val = packed_color[2U]; + } + + pvr_csb_pack (®s->usc_clear_register3, CR_USC_CLEAR_REGISTER3, reg) { + reg.val = packed_color[3U]; + } + + state->msaa_multiplier = transfer_cmd->dst.sample_count & ~1U; + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + } else if (transfer_cmd->src_present) { + const struct pvr_tq_frag_sh_reg_layout nop_sh_reg_layout = { + /* TODO: Setting this to 1 so that we don't try to pvr_bo_alloc() with + * zero size. The device will ignore the PDS program if USC_SHAREDSIZE + * is zero and in the case of the nop shader we're expecting it to be + * zero. See if we can safely pass PVR_DEV_ADDR_INVALID for the unitex + * program. + */ + .driver_total = 1, + }; + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout; + struct pvr_pds_pixel_shader_sa_program unitex_prog = { 0U }; + uint32_t tex_state_dma_size_dw; + struct pvr_bo *pvr_bo; + uint32_t *dma_space; + + result = pvr_pbe_src_format(transfer_cmd, state, &state->shader_props); + if (result != VK_SUCCESS) + return result; + + pvr_uv_space(dev_info, transfer_cmd, state); + + state->shader_props.iterated = false; + + state->shader_props.layer_props.byte_unwind = 0U; + state->shader_props.layer_props.sample = + transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_3DTWIDDLED; + + result = pvr_msaa_state(dev_info, transfer_cmd, state); + if (result != VK_SUCCESS) + return result; + + state->shader_props.pick_component = + pvr_pick_component_needed(&state->custom_mapping); + state->shader_props.alpha_type = transfer_cmd->blit.alpha.type; + + if (state->shader_props.alpha_type != PVR_ALPHA_NONE && + (state->shader_props.layer_props.pbe_format != + PVR_TRANSFER_PBE_PIXEL_SRC_F16F16 && + state->shader_props.layer_props.pbe_format != + PVR_TRANSFER_PBE_PIXEL_SRC_F16_U8)) { + return vk_error(device, VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + if (state->filter == PVR_FILTER_LINEAR && + pvr_requires_usc_linear_filter(transfer_cmd->src.vk_format)) { + if (pvr_int_pbe_usc_linear_filter( + state->shader_props.layer_props.pbe_format, + state->shader_props.layer_props.sample, + state->shader_props.layer_props.msaa, + state->shader_props.full_rate)) { + state->shader_props.layer_props.linear = true; + } else { + mesa_logw("Transfer: F32 linear filter not supported."); + } + } + + if (state->empty_dst) { + sh_reg_layout = &nop_sh_reg_layout; + state->pds_shader_task_offset = device->nop_program.pds.data_offset; + } else { + pvr_dev_addr_t kick_usc_pds_dev_addr; + + result = + pvr_transfer_frag_store_get_shader_info(device, + &ctx->frag_store, + &state->shader_props, + &kick_usc_pds_dev_addr, + &sh_reg_layout); + if (result != VK_SUCCESS) + return result; + + assert(kick_usc_pds_dev_addr.addr 
<= UINT32_MAX); + state->pds_shader_task_offset = (uint32_t)kick_usc_pds_dev_addr.addr; + } + + unitex_prog.kick_usc = false; + unitex_prog.clear = false; + + tex_state_dma_size_dw = + sh_reg_layout->driver_total + sh_reg_layout->compiler_out_total; + + unitex_prog.num_texture_dma_kicks = 1U; + unitex_prog.num_uniform_dma_kicks = 0U; + + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + device->heaps.general_heap, + tex_state_dma_size_dw << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + dma_space = (uint32_t *)pvr_bo->bo->map; + + result = pvr_sampler_image_state(ctx, + transfer_cmd, + sh_reg_layout, + state, + dma_space); + if (result != VK_SUCCESS) + return result; + + if (state->shader_props.alpha_type == PVR_ALPHA_GLOBAL || + state->shader_props.alpha_type == + PVR_ALPHA_PREMUL_SOURCE_WITH_GLOBAL) { + pvr_dma_global_alpha(&transfer_cmd->blit.alpha, + state, + sh_reg_layout, + dma_space); + } + + pvr_dma_texture_floats(transfer_cmd, state, sh_reg_layout, dma_space); + + if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) { + dma_space[pvr_dynamic_const_reg_advance(sh_reg_layout, state)] = + fui(transfer_cmd->src.z_position); + } + + pvr_write_usc_constants(sh_reg_layout, dma_space); + + if (pvr_pick_component_needed(&state->custom_mapping)) + pvr_dma_texel_unwind(state, sh_reg_layout, dma_space); + + pvr_pds_encode_dma_burst(unitex_prog.texture_dma_control, + unitex_prog.texture_dma_address, + state->common_ptr, + tex_state_dma_size_dw, + pvr_bo->vma->dev_addr.addr, + true, + dev_info); + + state->common_ptr += tex_state_dma_size_dw; + + result = + pvr_pds_unitex(dev_info, ctx, transfer_cmd, &unitex_prog, prep_data); + if (result != VK_SUCCESS) + return result; + + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg) { + reg.enablebgtag = true; + } + + /* clang-format off */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg); + /* clang-format on */ + } else { + /* No shader. */ + state->pds_temps = 0U; + state->uniform_data_size = 0U; + state->tex_state_data_size = 0U; + + /* No background enabled. */ + /* clang-format off */ + pvr_csb_pack (®s->isp_bgobjvals, CR_ISP_BGOBJVALS, reg); + /* clang-format on */ + pvr_csb_pack (®s->isp_aa, CR_ISP_AA, reg) { + reg.mode = pvr_cr_isp_aa_mode_type(transfer_cmd->dst.sample_count); + } + state->msaa_multiplier = transfer_cmd->dst.sample_count & ~1U; + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + + result = pvr_pbe_src_format(transfer_cmd, state, &state->shader_props); + if (result != VK_SUCCESS) + return result; + } + + pvr_setup_hwbg_object(dev_info, state); + + pvr_csb_pack (®s->isp_render, CR_ISP_RENDER, reg) { + reg.mode_type = PVRX(CR_ISP_RENDER_MODE_TYPE_FAST_SCALE); + pvr_finishme("Remove direction hardcoding."); + reg.dir_type = PVRX(CR_DIR_TYPE_TL2BR); + } + + /* Set up pixel event handling. 
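+    * The PBE state packed here is what writes the rendered tiles back to
+    * the destination surface at the end-of-tile event.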
+ */
+   result = pvr_pbe_setup(transfer_cmd, ctx, state);
+   if (result != VK_SUCCESS)
+      return result;
+
+   result = pvr_isp_tiles(device, state);
+   if (result != VK_SUCCESS)
+      return result;
+
+   if (PVR_HAS_FEATURE(&device->pdevice->dev_info, gpu_multicore_support)) {
+      pvr_csb_pack (&regs->frag_screen, CR_FRAG_SCREEN, reg) {
+         reg.xmax = transfer_cmd->dst.width - 1;
+         reg.ymax = transfer_cmd->dst.height - 1;
+      }
+   }
+
+   if ((pass_idx + 1U) < state->custom_mapping.pass_count)
+      *finished_out = false;
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_pbe_src_format_f2d(uint32_t merge_flags,
+                       struct pvr_transfer_cmd_surface *src,
+                       VkFormat dst_format,
+                       bool down_scale,
+                       bool dont_force_pbe,
+                       enum pvr_transfer_pbe_pixel_src *pixel_format_out)
+{
+   VkFormat src_format = src->vk_format;
+
+   /* This has to come before the rest: S8, for instance, is integer, and
+    * the signedness check fails on D24S8.
+    */
+   if (vk_format_is_depth_or_stencil(src_format) ||
+       vk_format_is_depth_or_stencil(dst_format) ||
+       merge_flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) {
+      pvr_finishme("Complete pvr_pbe_src_format_f2d().");
+   }
+
+   return pvr_pbe_src_format_normal(src_format,
+                                    dst_format,
+                                    down_scale,
+                                    dont_force_pbe,
+                                    pixel_format_out);
+}
+
+/** Writes the coefficient loading PDS task. */
+static inline VkResult
+pvr_pds_coeff_task(struct pvr_transfer_ctx *ctx,
+                   const struct pvr_transfer_cmd *transfer_cmd,
+                   const bool sample_3d,
+                   struct pvr_transfer_prep_data *prep_data)
+{
+   struct pvr_transfer_3d_state *state = &prep_data->state;
+   struct pvr_pds_coeff_loading_program program = { 0U };
+   struct pvr_bo *pvr_bo;
+   VkResult result;
+
+   program.num_fpu_iterators = 1U;
+
+   pvr_csb_pack (&program.FPU_iterators[0U],
+                 PDSINST_DOUT_FIELDS_DOUTI_SRC,
+                 reg) {
+      if (sample_3d)
+         reg.size = PVRX(PDSINST_DOUTI_SIZE_3D);
+      else
+         reg.size = PVRX(PDSINST_DOUTI_SIZE_2D);
+
+      reg.perspective = false;
+
+      /* Varying wrap on the TSP means that the TSP chooses the shorter path
+       * out of the normal and the wrapping path, i.e. it chooses between
+       * u0->u1 and u1->1.0 == 0.0->u0. We don't need this behavior.
+       */
+      /* If RHW were ever needed, offset SRC_F32 to the first U in 16-bit
+       * units:
+       *    l0 U <= offs 0
+       *    l0 V
+       *    l1 U <= offs 4
+       *    ...
+ */ + reg.shademodel = PVRX(PDSINST_DOUTI_SHADEMODEL_GOURUAD); + reg.f32_offset = 0U; + } + + if (sample_3d) + state->usc_coeff_regs = 12U; + else + state->usc_coeff_regs = 8U; + + pvr_pds_set_sizes_coeff_loading(&program); + + result = + pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.pds_heap, + (program.data_size + program.code_size) << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + state->pds_coeff_task_offset = + pvr_bo->vma->dev_addr.addr - ctx->device->heaps.pds_heap->base_addr.addr; + + pvr_pds_generate_coeff_loading_program(&program, pvr_bo->bo->map); + + state->coeff_data_size = program.data_size; + state->pds_temps = program.temps_used; + + return VK_SUCCESS; +} + +#define X 0U +#define Y 1U +#define Z 2U + +static void pvr_tsp_floats(const struct pvr_device_info *dev_info, + VkRect2D *rect, + const float recips[3U], + bool custom_filter, + bool z_present, + float z_value, + struct pvr_transfer_3d_iteration *layer) +{ +#define U0 0U +#define U1 1U +#define V0 2U +#define V1 3U + + const uint32_t indices[8U] = { U0, V0, U0, V1, U1, V1, U1, V0 }; + float delta[2U] = { 0.0f, 0.0f }; + int32_t non_normalized[4U]; + uint32_t src_flipped[2U]; + uint32_t normalized[4U]; + int32_t src_span[2U]; + + non_normalized[U0] = rect->offset.x; + non_normalized[U1] = rect->offset.x + rect->extent.width; + non_normalized[V0] = rect->offset.y; + non_normalized[V1] = rect->offset.y + rect->extent.height; + + /* Filter adjust. */ + src_span[X] = rect->extent.width; + src_flipped[X] = src_span[X] > 0U ? 0U : 1U; + src_span[Y] = rect->extent.height; + src_flipped[Y] = src_span[Y] > 0U ? 0U : 1U; + /* + * | X | Y | srcFlipX | srcFlipY | + * +----+----+----------+----------| + * | X | Y | 0 | 0 | + * | -X | Y | 1 | 0 | + * | X | -Y | 0 | 1 | + * | -X | -Y | 1 | 1 | + */ + for (uint32_t i = X; i <= Y; i++) { + if (custom_filter) { + if (src_flipped[i] != 0U) + delta[i] += 0.25; + else + delta[i] -= 0.25; + } + } + + /* Normalize. */ + for (uint32_t i = 0U; i < ARRAY_SIZE(normalized); i++) { + uint32_t tmp; + float ftmp; + + ftmp = (float)non_normalized[i] + delta[i >> 1U]; + ftmp *= recips[i >> 1U]; + + tmp = fui(ftmp); + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + tmp = XXH_rotl32(tmp, 1U); + + normalized[i] = tmp; + } + + /* Apply indices. */ + for (uint32_t i = 0U; i < 8U; i++) + layer->texture_coords[i] = normalized[indices[i]]; + + if (z_present) { + uint32_t tmp = fui(z_value * recips[2U]); + + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + tmp = XXH_rotl32(tmp, 1U); + + for (uint32_t i = 8U; i < 12U; i++) + layer->texture_coords[i] = tmp; + } + +#undef U0 +#undef U1 +#undef V0 +#undef V1 +} + +static void +pvr_isp_prim_block_tsp_vertex_block(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *src, + struct pvr_rect_mapping *mappings, + bool custom_filter, + uint32_t num_mappings, + uint32_t mapping_offset, + enum pvr_filter filter, + uint32_t tsp_comp_format_in_dw, + uint32_t **const cs_ptr_out) +{ + struct pvr_transfer_3d_iteration layer; + uint32_t *cs_ptr = *cs_ptr_out; + + /* |<-32b->| + * +-------+----- + * | RHW | | X num_isp_vertices + * +-------+-- | + * | U | | | + * | V | | X PVR_TRANSFER_NUM_LAYERS + * +-------+----- + * + * RHW is not there any more in the Transfer. The comment still explains + * where it should go if ever needed. 
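+    * Each ISP vertex therefore carries one U/V pair per layer, plus an
+    * optional Z for 3D-twiddled sources.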
+ */ + for (uint32_t i = mapping_offset; i < mapping_offset + num_mappings; i++) { + bool z_present = src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED; + const float recips[3U] = { + [X] = 1.0f / (float)src->width, + [Y] = 1.0f / (float)src->height, + [Z] = z_present ? 1.0f / (float)src->depth : 0.0f, + }; + float z_pos = (filter < PVR_FILTER_LINEAR) ? floor(src->z_position + 0.5f) + : src->z_position; + + pvr_tsp_floats(dev_info, + &mappings[i].src_rect, + recips, + custom_filter, + z_present, + z_pos, + &layer); + + /* We request UVs from TSP for ISP triangle: + * 0 u 1 + * +---, + * v| /| + * | / | + * 2'/--'3 + */ + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[0U]; + *cs_ptr++ = layer.texture_coords[1U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[8U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[6U]; + *cs_ptr++ = layer.texture_coords[7U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[11U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[2U]; + *cs_ptr++ = layer.texture_coords[3U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[9U]; + *cs_ptr++ = 0U; + } + + for (uint32_t j = 0U; j < PVR_TRANSFER_NUM_LAYERS; j++) { + *cs_ptr++ = layer.texture_coords[4U]; + *cs_ptr++ = layer.texture_coords[5U]; + } + + if (z_present) { + *cs_ptr++ = layer.texture_coords[10U]; + *cs_ptr++ = 0U; + } + } + + if (!PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + /* Skipped optional primitive id. */ + for (uint32_t i = 0U; i < tsp_comp_format_in_dw; i++) + *cs_ptr++ = 0x88888888U; + } else { + /* Align back to 64 bits. */ + if (((uintptr_t)cs_ptr & 7U) != 0U) + cs_ptr++; + } + + *cs_ptr_out = cs_ptr; +} + +#undef X +#undef Y +#undef Z + +static void pvr_isp_prim_block_pds_state(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_3d_state *state, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SHADERBASE, shader_base) { + shader_base.addr = PVR_DEV_ADDR(state->pds_shader_task_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_TEXUNICODEBASE, tex_base) { + tex_base.addr = PVR_DEV_ADDR(state->uni_tex_code_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SIZEINFO1, info1) { + info1.pds_uniformsize = + state->uniform_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE); + + info1.pds_texturestatesize = + state->tex_state_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE); + + info1.pds_varyingsize = + state->coeff_data_size / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE); + + info1.usc_varyingsize = + ALIGN_POT(state->usc_coeff_regs, + PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE); + + info1.pds_tempsize = + ALIGN_POT(state->pds_temps, + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_VARYINGBASE, base) { + base.addr = PVR_DEV_ADDR(state->pds_coeff_task_offset); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_TEXTUREDATABASE, base) { + base.addr = PVR_DEV_ADDR(state->tex_state_data_offset); + } + cs_ptr++; + + /* PDS uniform program not used. 
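+    * UNIFORMDATABASE below is therefore packed with a null device address.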
*/ + pvr_csb_pack (cs_ptr, TA_STATE_PDS_UNIFORMDATABASE, base) { + base.addr = PVR_DEV_ADDR(0U); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, TA_STATE_PDS_SIZEINFO2, info) { + info.usc_sharedsize = + ALIGN_POT(state->common_ptr, + PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE)) / + PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE); + info.pds_tri_merge_disable = !PVR_HAS_ERN(dev_info, 42307); + info.pds_batchnum = 0U; + } + cs_ptr++; + + /* Get back to 64 bits boundary. */ + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + cs_ptr++; + + *cs_ptr_out = cs_ptr; +} + +static void pvr_isp_prim_block_isp_state(const struct pvr_device_info *dev_info, + UNUSED uint32_t tsp_comp_format_in_dw, + uint32_t tsp_data_size_in_bytes, + uint32_t num_isp_vertices, + bool read_bgnd, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) { + pvr_finishme("Unimplemented path."); + } else { + /* ISP state words. */ + + /* clang-format off */ + pvr_csb_pack (cs_ptr, TA_STATE_ISPCTL, ispctl); + /* clang-format on */ + cs_ptr += pvr_cmd_length(TA_STATE_ISPCTL); + + pvr_csb_pack (cs_ptr, TA_STATE_ISPA, ispa) { + ispa.objtype = PVRX(TA_OBJTYPE_TRIANGLE); + ispa.passtype = read_bgnd ? PVRX(TA_PASSTYPE_TRANSLUCENT) + : PVRX(TA_PASSTYPE_OPAQUE); + ispa.dcmpmode = PVRX(TA_CMPMODE_ALWAYS); + ispa.dwritedisable = true; + } + cs_ptr += pvr_cmd_length(TA_STATE_ISPA); + + /* How many bytes the TSP compression format needs? */ + pvr_csb_pack (cs_ptr, IPF_COMPRESSION_SIZE_WORD, word) { + word.cs_isp_comp_table_size = 0U; + word.cs_tsp_comp_format_size = tsp_comp_format_in_dw; + word.cs_tsp_comp_table_size = 0U; + word.cs_tsp_comp_vertex_size = + tsp_data_size_in_bytes / num_isp_vertices; + } + cs_ptr += pvr_cmd_length(IPF_COMPRESSION_SIZE_WORD); + + /* ISP vertex compression. 
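+       * Every X/Y/Z component uses the uncompressed RAW_BYTE format.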
*/ + pvr_csb_pack (cs_ptr, IPF_ISP_COMPRESSION_WORD_0, word0) { + word0.cf_isp_comp_fmt_x0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_x1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_x2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_y2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_z0 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word0.cf_isp_comp_fmt_z1 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + } + cs_ptr += pvr_cmd_length(IPF_ISP_COMPRESSION_WORD_0); + + pvr_csb_pack (cs_ptr, IPF_ISP_COMPRESSION_WORD_1, word1) { + word1.vf_prim_msaa = 0U; + word1.vf_prim_id_pres = 0U; + word1.vf_vertex_clipped = 0U; + word1.vf_vertex_total = num_isp_vertices - 1U; + word1.cf_isp_comp_fmt_z3 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + word1.cf_isp_comp_fmt_z2 = PVRX(IPF_COMPRESSION_FORMAT_RAW_BYTE); + } + cs_ptr += pvr_cmd_length(IPF_ISP_COMPRESSION_WORD_1); + } + + *cs_ptr_out = cs_ptr; +} + +static void +pvr_isp_prim_block_index_block(const struct pvr_device_info *dev_info, + uint32_t num_mappings, + uint32_t **const cs_ptr_out) +{ + uint32_t *cs_ptr = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + pvr_finishme("Unimplemented path."); + } else { + for (uint32_t i = 0U, j = 0U; i < num_mappings; i++, j += 4U) { + if ((i & 1U) == 0U) { + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j; + word.ix_index0_1 = j + 1U; + word.ix_index0_2 = j + 2U; + word.ix_index1_0 = j + 3U; + } + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + + /* Don't increment cs_ptr here. IPF_INDEX_DATA is patched in the + * else part and then cs_ptr is incremented. + */ + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j + 2U; + word.ix_index0_1 = j + 1U; + } + } else { + uint32_t tmp; + + pvr_csb_pack (&tmp, IPF_INDEX_DATA, word) { + word.ix_index0_2 = j; + word.ix_index1_0 = j + 1U; + } + *cs_ptr |= tmp; + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + + pvr_csb_pack (cs_ptr, IPF_INDEX_DATA, word) { + word.ix_index0_0 = j + 2U; + word.ix_index0_1 = j + 3U; + word.ix_index0_2 = j + 2U; + word.ix_index1_0 = j + 1U; + } + cs_ptr += pvr_cmd_length(IPF_INDEX_DATA); + } + } + + /* The last pass didn't ++. */ + if ((num_mappings & 1U) != 0U) + cs_ptr++; + } + + *cs_ptr_out = cs_ptr; +} + +/* Calculates a 24 bit fixed point (biased) representation of a signed integer. 
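+ * The bias keeps on-screen coordinates non-negative; the biased integer
+ * part must fit in 15 bits (see the 0x7fff8000U overflow check).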
+ */
+static inline VkResult
+pvr_int32_to_isp_xy_vtx(const struct pvr_device_info *dev_info,
+                        int32_t val,
+                        UNUSED bool bias,
+                        uint32_t *word_out)
+{
+   if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
+      pvr_finishme("Unimplemented path.");
+      return VK_SUCCESS;
+   }
+
+   val += PVRX(IPF_ISP_VERTEX_XY_BIAS_VALUE);
+
+   if (((uint32_t)val & 0x7fff8000U) != 0U)
+      return vk_error(NULL, VK_ERROR_UNKNOWN);
+
+   pvr_csb_pack (word_out, IPF_ISP_VERTEX_XY, word) {
+      word.sign = val < 0;
+      word.integer = val;
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+pvr_isp_prim_block_isp_vertices(const struct pvr_device_info *dev_info,
+                                struct pvr_transfer_3d_state *state,
+                                struct pvr_rect_mapping *mappings,
+                                uint32_t num_mappings,
+                                uint32_t mapping_offset,
+                                uint32_t **const cs_ptr_out)
+{
+   uint32_t *cs_ptr = *cs_ptr_out;
+   bool bias = true;
+   uint32_t i;
+
+   if (PVR_HAS_FEATURE(dev_info, screen_size8K))
+      bias = state->width_in_tiles <= 256U && state->height_in_tiles <= 256U;
+
+   for (i = mapping_offset; i < mapping_offset + num_mappings; i++) {
+      uint32_t bottom = 0U;
+      uint32_t right = 0U;
+      uint32_t left = 0U;
+      uint32_t top = 0U;
+      VkResult result;
+
+      /* ISP vertex data (X, Y, Z). */
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.y,
+                                       bias,
+                                       &top);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.y +
+                                          mappings[i].dst_rect.extent.height,
+                                       bias,
+                                       &bottom);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.x,
+                                       bias,
+                                       &left);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = pvr_int32_to_isp_xy_vtx(dev_info,
+                                       mappings[i].dst_rect.offset.x +
+                                          mappings[i].dst_rect.extent.width,
+                                       bias,
+                                       &right);
+      if (result != VK_SUCCESS)
+         return result;
+
+      if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
+         pvr_finishme("Unimplemented path.");
+      } else {
+         /* ISP vertices 0 and 1. */
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_0, word0) {
+            word0.x0 = left;
+            word0.y0 = top & 0xFF;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_1, word1) {
+            word1.y0 = top >> PVRX(IPF_ISP_VERTEX_WORD_1_Y0_SHIFT);
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_2, word2) {
+            word2.x1 = right & 0xFFFF;
+            word2.z0 = 0U;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_3, word3) {
+            word3.x1 = right >> PVRX(IPF_ISP_VERTEX_WORD_3_X1_SHIFT);
+            word3.y1 = top;
+         }
+         cs_ptr++;
+
+         pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_4, word4) {
+            word4.z1 = 0U;
+         }
+         cs_ptr++;
+
+         /* ISP vertices 2 and 3.
*/ + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_0, word0) { + word0.x0 = left; + word0.y0 = bottom & 0xFF; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_1, word1) { + word1.y0 = bottom >> PVRX(IPF_ISP_VERTEX_WORD_1_Y0_SHIFT); + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_2, word2) { + word2.x1 = right & 0xFFFF; + word2.z0 = 0U; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_3, word3) { + word3.x1 = right >> PVRX(IPF_ISP_VERTEX_WORD_3_X1_SHIFT); + word3.y1 = bottom; + } + cs_ptr++; + + pvr_csb_pack (cs_ptr, IPF_ISP_VERTEX_WORD_4, word4) { + word4.z1 = 0U; + } + cs_ptr++; + } + } + *cs_ptr_out = cs_ptr; + + return VK_SUCCESS; +} + +static uint32_t +pvr_isp_primitive_block_size(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd_surface *src, + uint32_t num_mappings) +{ + uint32_t num_isp_vertices = num_mappings * 4U; + uint32_t num_tsp_vertices_per_isp_vertex; + uint32_t isp_vertex_data_size_dw; + bool color_fill = (src == NULL); + uint32_t tsp_comp_format_dw; + uint32_t isp_state_size_dw; + uint32_t pds_state_size_dw; + uint32_t idx_data_size_dw; + uint32_t tsp_data_size; + uint32_t stream_size; + + if (color_fill) { + num_tsp_vertices_per_isp_vertex = 0U; + } else { + num_tsp_vertices_per_isp_vertex = + src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED ? 4U : 2U; + } + + tsp_data_size = num_isp_vertices * PVR_TRANSFER_NUM_LAYERS * 4U * + num_tsp_vertices_per_isp_vertex; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + /* An XYZ vertex is 16/16/32 bits => 8 bytes. */ + isp_vertex_data_size_dw = num_isp_vertices * 2U; + + /* Round to even for 64 bit boundary. */ + idx_data_size_dw = ALIGN_POT(num_mappings, 2U); + tsp_comp_format_dw = 0U; + isp_state_size_dw = 4U; + pds_state_size_dw = 8U; + } else { + tsp_comp_format_dw = color_fill ? 0U : PVR_TRANSFER_NUM_LAYERS; + + if (!color_fill) { + if (src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + tsp_comp_format_dw *= 2U; + } + + /* An XYZ vertex is 24/24/32 bits => 10 bytes with last padded to 4 byte + * burst align. + */ + isp_vertex_data_size_dw = DIV_ROUND_UP(num_isp_vertices * 10U, 4U); + + /* 4 triangles fit in 3 dw: t0t0t0t1_t1t1t2t2_t2t3t3t3. */ + idx_data_size_dw = num_mappings + DIV_ROUND_UP(num_mappings, 2U); + isp_state_size_dw = 5U; + pds_state_size_dw = 7U; + } + + stream_size = tsp_data_size + (idx_data_size_dw + tsp_comp_format_dw + + isp_vertex_data_size_dw + isp_state_size_dw + + pds_state_size_dw) * + 4U; + + return stream_size; +} + +static VkResult +pvr_isp_primitive_block(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + const struct pvr_transfer_cmd_surface *src, + bool custom_filter, + struct pvr_rect_mapping *mappings, + uint32_t num_mappings, + uint32_t mapping_offset, + bool read_bgnd, + uint32_t *cs_start_offset, + uint32_t **cs_ptr_out) +{ + struct pvr_transfer_3d_state *state = &prep_data->state; + uint32_t num_isp_vertices = num_mappings * 4U; + uint32_t num_tsp_vertices_per_isp_vert; + uint32_t tsp_data_size_in_bytes; + uint32_t tsp_comp_format_in_dw; + bool color_fill = src == NULL; + uint32_t stream_size_in_bytes; + uint32_t *cs_ptr_start; + VkResult result; + + if (color_fill) { + num_tsp_vertices_per_isp_vert = 0U; + } else { + num_tsp_vertices_per_isp_vert = + src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED ? 
4U : 2U; + } + + tsp_data_size_in_bytes = num_isp_vertices * PVR_TRANSFER_NUM_LAYERS * 4U * + num_tsp_vertices_per_isp_vert; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + tsp_comp_format_in_dw = 0U; + } else { + tsp_comp_format_in_dw = color_fill ? 0U : PVR_TRANSFER_NUM_LAYERS; + + if (!color_fill && src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED) + tsp_comp_format_in_dw *= 2U; + } + + stream_size_in_bytes = + pvr_isp_primitive_block_size(dev_info, src, num_mappings); + + cs_ptr_start = *cs_ptr_out; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) { + pvr_finishme("Unimplemented path."); + } else { + if (!color_fill) { + /* This includes: + * Compressed TSP vertex data & tables. + * Primitive id. + * TSP compression formats. + */ + pvr_isp_prim_block_tsp_vertex_block(dev_info, + src, + mappings, + custom_filter, + num_mappings, + mapping_offset, + transfer_cmd->filter, + tsp_comp_format_in_dw, + cs_ptr_out); + } + + pvr_isp_prim_block_pds_state(dev_info, ctx, state, cs_ptr_out); + + /* Point the CS_PRIM_BASE here. */ + *cs_start_offset = (*cs_ptr_out - cs_ptr_start) * sizeof(uint32_t); + + /* This includes: + * ISP state words. + * Compression size word. + * ISP compression and vertex formats. + */ + pvr_isp_prim_block_isp_state(dev_info, + tsp_comp_format_in_dw, + tsp_data_size_in_bytes, + num_isp_vertices, + read_bgnd, + cs_ptr_out); + + pvr_isp_prim_block_index_block(dev_info, num_mappings, cs_ptr_out); + + result = pvr_isp_prim_block_isp_vertices(dev_info, + state, + mappings, + num_mappings, + mapping_offset, + cs_ptr_out); + if (result != VK_SUCCESS) + return result; + } + + assert((*cs_ptr_out - cs_ptr_start) * sizeof(uint32_t) == + stream_size_in_bytes); + + return VK_SUCCESS; +} + +static inline uint32_t +pvr_transfer_prim_blocks_per_alloc(const struct pvr_device_info *dev_info) +{ + uint32_t ret = PVRX(IPF_CONTROL_STREAM_SIZE_DWORDS) * sizeof(uint32_t); + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) + return ret / sizeof(uint64_t) / 2U; + + return ret / sizeof(uint32_t) / 2U - 1U; +} + +static inline uint32_t +pvr_transfer_max_quads_per_pb(const struct pvr_device_info *dev_info) +{ + return PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) ? 4U + : 16U; +} + +/** + * Writes ISP ctrl stream. + * + * We change sampler/texture state when we process a new TQ source. The + * primitive block contains the shader pointers, but we supply the primitive + * blocks with shaders from here. 
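 + *
 + * Rough layout of the stream built below: one or more region arrays at the
 + * start of the allocation, followed by the primitive blocks they point at,
 + * with the control stream terminated by an IPF_CS_TYPE_TERM word.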
+ */ +static VkResult pvr_isp_ctrl_stream(const struct pvr_device_info *dev_info, + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data) +{ + const uint32_t max_mappings_per_pb = pvr_transfer_max_quads_per_pb(dev_info); + struct pvr_transfer_3d_state *const state = &prep_data->state; + struct pvr_winsys_transfer_regs *const regs = &state->regs; + struct pvr_transfer_pass *pass = NULL; + uint32_t flags = transfer_cmd->flags; + uint32_t num_prim_blks = 0U; + uint32_t prim_blk_size = 0U; + uint32_t region_arrays_size; + uint32_t num_region_arrays; + uint32_t total_stream_size; + struct pvr_bo *pvr_cs_bo; + uint32_t rem_mappings; + uint32_t *blk_cs_ptr; + uint32_t *cs_ptr; + VkResult result; + + if (state->custom_mapping.pass_count > 0U) { + uint32_t num_mappings; + + pass = &state->custom_mapping.passes[state->pass_idx]; + num_mappings = pass->mapping_count; + + while (num_mappings > 0U) { + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + NULL, + MIN2(max_mappings_per_pb, num_mappings)); + } + + if (transfer_cmd->src_present) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + &transfer_cmd->src, + MIN2(max_mappings_per_pb, num_mappings)); + } + + num_mappings -= MIN2(max_mappings_per_pb, num_mappings); + num_prim_blks++; + } + } else if ((flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U) { + num_prim_blks = 1U; + prim_blk_size += + pvr_isp_primitive_block_size(dev_info, + NULL, + MIN2(max_mappings_per_pb, 1U)); + + if (transfer_cmd->src_present) { + uint32_t num_mappings = transfer_cmd->mapping_count; + + while (num_mappings > 0U) { + prim_blk_size += pvr_isp_primitive_block_size( + dev_info, + &transfer_cmd->src, + MIN2(max_mappings_per_pb, num_mappings)); + + num_mappings -= MIN2(max_mappings_per_pb, num_mappings); + num_prim_blks++; + } + } + } else { + pvr_finishme("Unimplemented path."); + } + + num_region_arrays = + (num_prim_blks + (pvr_transfer_prim_blocks_per_alloc(dev_info) - 1U)) / + pvr_transfer_prim_blocks_per_alloc(dev_info); + region_arrays_size = PVRX(IPF_CONTROL_STREAM_SIZE_DWORDS) * + sizeof(uint32_t) * num_region_arrays; + total_stream_size = region_arrays_size + prim_blk_size; + + /* Allocate space for IPF control stream. */ + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.transfer_3d_heap, + total_stream_size, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_cs_bo); + if (result != VK_SUCCESS) + return result; + + cs_ptr = pvr_cs_bo->bo->map; + blk_cs_ptr = cs_ptr + region_arrays_size / sizeof(uint32_t); + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) + rem_mappings = pass ? 
pass->mapping_count : 1U; + else + rem_mappings = transfer_cmd->mapping_count; + + if ((transfer_cmd->src_present || flags & PVR_TRANSFER_CMD_FLAGS_FILL) && + rem_mappings != 0U) { + struct pvr_pds_pixel_shader_sa_program unitex_pds_prog = { 0U }; + struct pvr_transfer_cmd_surface *src = &transfer_cmd->src; + struct pvr_rect_mapping fill_mapping; + uint32_t mapping_offset = 0U; + bool read_bgnd = false; + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) { + uint32_t packed_color[4U] = { 0U }; + + if (vk_format_is_compressed(transfer_cmd->dst.vk_format)) { + return vk_error(transfer_cmd->cmd_buffer, + VK_ERROR_FORMAT_NOT_SUPPORTED); + } + + state->pds_shader_task_offset = 0U; + state->uni_tex_code_offset = 0U; + state->tex_state_data_offset = 0U; + state->common_ptr = 0U; + + result = pvr_pack_clear_color(transfer_cmd->dst.vk_format, + transfer_cmd->clear_color, + packed_color); + if (result != VK_SUCCESS) + return result; + + fill_mapping.dst_rect = transfer_cmd->scissor; + + pvr_csb_pack (&regs->usc_clear_register0, + CR_USC_CLEAR_REGISTER0, + reg) { + reg.val = packed_color[0U]; + } + + pvr_csb_pack (&regs->usc_clear_register1, + CR_USC_CLEAR_REGISTER1, + reg) { + reg.val = packed_color[1U]; + } + + pvr_csb_pack (&regs->usc_clear_register2, + CR_USC_CLEAR_REGISTER2, + reg) { + reg.val = packed_color[2U]; + } + + pvr_csb_pack (&regs->usc_clear_register3, + CR_USC_CLEAR_REGISTER3, + reg) { + reg.val = packed_color[3U]; + } + + state->pds_shader_task_offset = + transfer_cmd->cmd_buffer->device->nop_program.pds.data_offset; + + unitex_pds_prog.kick_usc = false; + unitex_pds_prog.clear = false; + } else { + const bool down_scale = + transfer_cmd->resolve_op == PVR_RESOLVE_BLEND && + src->sample_count > 1U && transfer_cmd->dst.sample_count <= 1U; + struct pvr_tq_shader_properties *shader_props = + &state->shader_props; + struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + const struct pvr_tq_frag_sh_reg_layout *sh_reg_layout; + enum pvr_transfer_pbe_pixel_src pbe_src_format; + uint32_t tex_state_dma_size; + pvr_dev_addr_t dev_offset; + struct pvr_bo *pvr_bo; + + /* Reset the shared register bank ptrs; each src implies new texture + * state. (Note that we don't change texture state per prim block.) + */ + state->common_ptr = 0U; + state->usc_const_reg_ptr = 0U; + /* We don't use state->dynamic_const_reg_ptr here.
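 + * (The transfer frag store currently reserves no dynamic constant
 + * registers; see pvr_transfer_frag_store_entry_data_compile.)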
*/ + + if (flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) + read_bgnd = true; + + result = pvr_pbe_src_format_f2d(flags, + src, + transfer_cmd->dst.vk_format, + down_scale, + state->dont_force_pbe, + &pbe_src_format); + if (result != VK_SUCCESS) + return result; + + memset(shader_props, 0U, sizeof(*shader_props)); + + if (state->custom_mapping.byte_unwind_src > 0U && + state->custom_mapping.passes[0U].byte_unwind) { + pvr_finishme("Unimplemented path."); + } + + layer->pbe_format = pbe_src_format; + layer->sample = (src->mem_layout == PVR_MEMLAYOUT_3DTWIDDLED); + shader_props->iterated = true; + + shader_props->pick_component = + pvr_pick_component_needed(&state->custom_mapping); + + result = pvr_msaa_state(dev_info, transfer_cmd, state); + if (result != VK_SUCCESS) + return result; + + if (state->filter == PVR_FILTER_LINEAR && + pvr_requires_usc_linear_filter(src->vk_format)) { + if (pvr_int_pbe_usc_linear_filter(layer->pbe_format, + layer->sample, + layer->msaa, + shader_props->full_rate)) { + layer->linear = true; + } else { + mesa_logw("Transfer: F32 linear filter not supported."); + } + } + + result = pvr_transfer_frag_store_get_shader_info( + transfer_cmd->cmd_buffer->device, + &ctx->frag_store, + shader_props, + &dev_offset, + &sh_reg_layout); + if (result != VK_SUCCESS) + return result; + + assert(dev_offset.addr <= UINT32_MAX); + prep_data->state.pds_shader_task_offset = (uint32_t)dev_offset.addr; + + result = + pvr_pds_coeff_task(ctx, transfer_cmd, layer->sample, prep_data); + if (result != VK_SUCCESS) + return result; + + unitex_pds_prog.kick_usc = false; + unitex_pds_prog.clear = false; + + tex_state_dma_size = + sh_reg_layout->driver_total + sh_reg_layout->compiler_out_total; + + unitex_pds_prog.num_texture_dma_kicks = 1U; + unitex_pds_prog.num_uniform_dma_kicks = 0U; + + /* Allocate memory for DMA. */ + result = pvr_cmd_buffer_alloc_mem(transfer_cmd->cmd_buffer, + ctx->device->heaps.general_heap, + tex_state_dma_size << 2U, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &pvr_bo); + if (result != VK_SUCCESS) + return result; + + result = pvr_sampler_state_for_surface(dev_info, + &transfer_cmd->src, + state->filter, + sh_reg_layout, + 0U, + pvr_bo->bo->map); + if (result != VK_SUCCESS) + return result; + + result = pvr_image_state_for_surface(ctx, + transfer_cmd, + &transfer_cmd->src, + 0U, + sh_reg_layout, + state, + 0U, + pvr_bo->bo->map); + if (result != VK_SUCCESS) + return result; + + pvr_pds_encode_dma_burst(unitex_pds_prog.texture_dma_control, + unitex_pds_prog.texture_dma_address, + state->common_ptr, + tex_state_dma_size, + pvr_bo->vma->dev_addr.addr, + true, + dev_info); + + state->common_ptr += tex_state_dma_size; + + pvr_write_usc_constants(sh_reg_layout, pvr_bo->bo->map); + + if (pvr_pick_component_needed(&state->custom_mapping)) { + pvr_dma_texel_unwind(state, sh_reg_layout, pvr_bo->bo->map); + } + } + + result = pvr_pds_unitex(dev_info, + ctx, + transfer_cmd, + &unitex_pds_prog, + prep_data); + if (result != VK_SUCCESS) + return result; + + while (rem_mappings > 0U) { + const uint64_t transfer_heap_base = + transfer_cmd->cmd_buffer->device->heaps.transfer_3d_heap + ->base_addr.addr; + const uint32_t num_mappings = MIN2(max_mappings_per_pb, rem_mappings); + struct pvr_rect_mapping *mappings = NULL; + uint32_t stream_start_offset = 0U; + pvr_dev_addr_t prim_blk_addr; + + if (PVR_HAS_FEATURE(dev_info, ipf_creq_pf)) + pvr_finishme("Unimplemented path."); + + if (flags & PVR_TRANSFER_CMD_FLAGS_FILL) + mappings = pass ? 
pass->mappings : &fill_mapping; + else + mappings = transfer_cmd->mappings; + + prim_blk_addr = + PVR_DEV_ADDR(pvr_cs_bo->vma->dev_addr.addr - transfer_heap_base); + prim_blk_addr.addr += + (uintptr_t)blk_cs_ptr - (uintptr_t)pvr_cs_bo->bo->map; + + result = pvr_isp_primitive_block( + dev_info, + ctx, + transfer_cmd, + prep_data, + flags & PVR_TRANSFER_CMD_FLAGS_FILL ? NULL : src, + state->custom_filter, + mappings, + num_mappings, + mapping_offset, + read_bgnd, + &stream_start_offset, + &blk_cs_ptr); + if (result != VK_SUCCESS) + return result; + + prim_blk_addr.addr += stream_start_offset; + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) { + pvr_finishme("Unimplemented path."); + } else { + pvr_csb_pack (cs_ptr, IPF_PRIMITIVE_FORMAT, word) { + word.cs_type = PVRX(IPF_CS_TYPE_PRIM); + word.cs_isp_state_read = true; + word.cs_isp_state_size = 2U; + word.cs_prim_total = 2U * num_mappings - 1U; + word.cs_mask_fmt = PVRX(IPF_CS_MASK_FMT_FULL); + word.cs_prim_base_pres = true; + } + cs_ptr += pvr_cmd_length(IPF_PRIMITIVE_FORMAT); + + pvr_csb_pack (cs_ptr, IPF_PRIMITIVE_BASE, word) { + word.cs_prim_base = prim_blk_addr; + } + cs_ptr += pvr_cmd_length(IPF_PRIMITIVE_BASE); + } + + rem_mappings -= num_mappings; + mapping_offset += num_mappings; + } + } + + if (PVR_HAS_FEATURE(dev_info, ipf_creq_pf)) + pvr_finishme("Unimplemented path."); + + pvr_csb_pack (cs_ptr, IPF_CONTROL_STREAM, word) { + word.cs_type = PVRX(IPF_CS_TYPE_TERM); + } + cs_ptr += pvr_cmd_length(IPF_CONTROL_STREAM); + + pvr_csb_pack (&regs->isp_mtile_base, CR_ISP_MTILE_BASE, reg) { + reg.addr = + PVR_DEV_ADDR(pvr_cs_bo->vma->dev_addr.addr - + ctx->device->heaps.transfer_3d_heap->base_addr.addr); + } + + pvr_csb_pack (&regs->isp_render, CR_ISP_RENDER, reg) { + reg.mode_type = PVRX(CR_ISP_RENDER_MODE_TYPE_FAST_2D); + } + + if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format_v2)) + pvr_finishme("Unimplemented path."); + else + regs->isp_rgn = 0UL; + + return VK_SUCCESS; +} + +static void pvr_transfer_set_filter(struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_3d_state *state) +{ + VkRect2D *src = &transfer_cmd->mappings[0U].src_rect; + VkRect2D *dst = &transfer_cmd->mappings[0U].dst_rect; + + if (!transfer_cmd->src_present) + return; + + /* If no scaling is applied to the copy region, we can use point + * filtering. + */ + if (!state->custom_filter && (src->extent.width == dst->extent.width) && + (src->extent.height == dst->extent.height)) + state->filter = PVR_FILTER_POINT; + else + state->filter = transfer_cmd->filter; +} + +/** Generates hw resources to kick a 3D clip blit. */ +static VkResult pvr_3d_clip_blit(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + struct pvr_transfer_3d_state *state = &prep_data->state; + uint32_t texel_unwind_src = state->custom_mapping.texel_unwind_src; + struct pvr_transfer_cmd bg_cmd = { 0U }; + uint32_t control_reg; + VkResult result; + + state->dont_force_pbe = false; + bg_cmd.scissor = transfer_cmd->scissor; + bg_cmd.cmd_buffer = transfer_cmd->cmd_buffer; + bg_cmd.flags = transfer_cmd->flags; + bg_cmd.flags &= + ~(PVR_TRANSFER_CMD_FLAGS_FAST2D | PVR_TRANSFER_CMD_FLAGS_FILL | + PVR_TRANSFER_CMD_FLAGS_DSMERGE | PVR_TRANSFER_CMD_FLAGS_PICKD); + + bg_cmd.src_present = state->custom_mapping.pass_count > 0U ?
false : true; + if (bg_cmd.src_present) { + bg_cmd.mappings[0U].src_rect = transfer_cmd->scissor; + bg_cmd.mappings[0U].dst_rect = transfer_cmd->scissor; + bg_cmd.resolve_op = PVR_RESOLVE_BLEND; + bg_cmd.src = transfer_cmd->dst; + } + + state->filter = PVR_FILTER_DONTCARE; + bg_cmd.dst = transfer_cmd->dst; + state->custom_mapping.texel_unwind_src = + state->custom_mapping.texel_unwind_dst; + + result = + pvr_3d_copy_blit_core(ctx, &bg_cmd, prep_data, pass_idx, finished_out); + if (result != VK_SUCCESS) + return result; + + /* If the destination has 4 channels and the source has at most 2, we still + * need all 4 channels from the USC into the PBE. + */ + state->dont_force_pbe = true; + state->custom_mapping.texel_unwind_src = texel_unwind_src; + + /* We need the viewport mask, otherwise all pixels would be disabled. */ + pvr_csb_pack (&control_reg, CR_ISP_BGOBJVALS, reg) { + reg.mask = true; + } + state->regs.isp_bgobjvals |= control_reg; + + pvr_transfer_set_filter(transfer_cmd, state); + result = pvr_isp_ctrl_stream(&ctx->device->pdevice->dev_info, + ctx, + transfer_cmd, + prep_data); + if (result != VK_SUCCESS) + return result; + + /* In case of resolve M -> S, the accumulation is read from and written to a + * single sampled surface. Make sure that we are resolving and we have the + * right number of tiles. + */ + if (state->down_scale) { + uint64_t tmp; + + pvr_csb_pack (&tmp, CR_PBE_WORD0_MRT0, reg) { + reg.downscale = true; + } + state->regs.pbe_wordx_mrty[0U] |= tmp; + + result = pvr_isp_tiles(ctx->device, state); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; +} + +static bool pvr_texel_unwind(uint32_t bpp, + pvr_dev_addr_t dev_addr, + bool is_input, + uint32_t texel_extend, + uint32_t *texel_unwind_out) +{ + uint32_t texel_unwind = 0U; + + for (uint32_t i = 0U; i < 16U; i++) { + if (pvr_is_surface_aligned(dev_addr, is_input, bpp)) { + break; + } else { + if (i == 15U) { + return false; + } else { + dev_addr.addr -= (bpp / texel_extend) / 8U; + texel_unwind++; + } + } + } + + *texel_unwind_out = texel_unwind; + + return true; +} + +static bool pvr_byte_unwind(uint32_t bpp, + pvr_dev_addr_t dev_addr, + bool is_input, + uint32_t *byte_unwind_out) +{ + uint32_t byte_unwind = 0U; + + for (uint32_t i = 0U; i < 16U; i++) { + if (pvr_is_surface_aligned(dev_addr, is_input, bpp)) { + break; + } else { + if (i == 15U) { + return false; + } else { + dev_addr.addr -= 1U; + byte_unwind++; + } + } + } + + *byte_unwind_out = byte_unwind; + + return true; +} + +static bool pvr_is_identity_mapping(const struct pvr_rect_mapping *mapping) +{ + return (mapping->src_rect.offset.x == mapping->dst_rect.offset.x && + mapping->src_rect.offset.y == mapping->dst_rect.offset.y && + mapping->src_rect.extent.width == mapping->dst_rect.extent.width && + mapping->src_rect.extent.height == mapping->dst_rect.extent.height); +} + +static inline bool pvr_is_pbe_stride_aligned(const uint32_t stride) +{ + if (stride == 1U) + return true; + + return ((stride & (PVRX(PBESTATE_REG_WORD0_LINESTRIDE_UNIT_SIZE) - 1U)) == + 0x0U); +} + +static struct pvr_transfer_pass * +pvr_create_pass(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t dst_offset, + uint32_t src_offset, + bool extend_height) +{ + struct pvr_transfer_pass *pass; + + assert(custom_mapping->pass_count < PVR_TRANSFER_MAX_PASSES); + + pass = &custom_mapping->passes[custom_mapping->pass_count]; + pass->clip_rects_count = 0U; + pass->dst_offset = dst_offset; + pass->src_offset = src_offset; + pass->mapping_count = 0U; + 
pass->extend_height = extend_height; + + custom_mapping->pass_count++; + + return pass; +} + +/* Acquire a pass with the given offset. If one doesn't exist, create a new one. */ +static struct pvr_transfer_pass * +pvr_acquire_pass(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t dst_offset, + uint32_t src_offset, + bool extend_height) +{ + for (uint32_t i = 0U; i < custom_mapping->pass_count; i++) { + if (custom_mapping->passes[i].dst_offset == dst_offset) + return &custom_mapping->passes[i]; + } + + return pvr_create_pass(custom_mapping, dst_offset, src_offset, extend_height); +} + +static void pvr_remove_mapping(struct pvr_transfer_pass *pass, uint32_t idx) +{ + assert(idx < pass->mapping_count); + + for (uint32_t i = idx; i < (pass->mapping_count - 1U); i++) + pass->mappings[i] = pass->mappings[i + 1U]; + + pass->mapping_count--; +} + +static struct pvr_rect_mapping * +pvr_create_mapping(struct pvr_transfer_pass *pass) +{ + assert(pass->mapping_count < ARRAY_SIZE(pass->mappings)); + + return &pass->mappings[pass->mapping_count++]; +} + +/** + * If PBE can't write to surfaces with odd stride, the stride of the + * destination surface is doubled to make it even. The height of the surface is + * halved. The source surface is not resized. Each half of the modified + * destination surface samples every second row from the source surface. This + * only works with nearest filtering. + */ +static bool pvr_double_stride(struct pvr_transfer_pass *pass, uint32_t stride) +{ + struct pvr_rect_mapping *mappings = pass->mappings; + + if (stride == 1U) + return false; + + if (mappings[0U].dst_rect.extent.height == 1U && pass->mapping_count == 1U) { + /* Only one mapping required if height is 1. */ + if ((mappings[0U].dst_rect.offset.y & 1U) != 0U) { + mappings[0U].dst_rect.offset.x += (int32_t)stride; + mappings[0U].dst_rect.offset.y /= 2U; + mappings[0U].dst_rect.extent.height = + (mappings[0U].dst_rect.extent.height + 1U) / 2U; + } else { + mappings[0U].dst_rect.extent.height = + (mappings[0U].dst_rect.offset.y + + mappings[0U].dst_rect.extent.height + 1U) / + 2U - + mappings[0U].dst_rect.offset.y; + mappings[0U].dst_rect.offset.y /= 2U; + } + + return true; + } + + pvr_finishme("Add support for multiple mappings."); + + return false; +} + +static void pvr_unwind_rects(uint32_t width, + uint32_t height, + uint32_t texel_unwind, + bool input, + struct pvr_transfer_pass *pass) +{ + pvr_finishme("Implement pvr_unwind_rects()."); +} + +/** + * Assign clip rects to rectangle mappings. TDM can only do two PBE clip + * rects per screen. + */ +static void +pvr_map_clip_rects(struct pvr_transfer_custom_mapping *custom_mapping) +{ + for (uint32_t i = 0U; i < custom_mapping->pass_count; i++) { + struct pvr_transfer_pass *pass = &custom_mapping->passes[i]; + + pass->clip_rects_count = 0U; + for (uint32_t j = 0U; j < pass->mapping_count; j++) { + struct pvr_rect_mapping *mappings = pass->mappings; + VkRect2D *clip_rects = pass->clip_rects; + bool merged = false; + + /* Try to merge adjacent clip rects.
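 + * For example, dst_rects { 0, 0, 32x32 } and { 32, 0, 32x32 } share a
 + * vertical edge and collapse into the single clip rect { 0, 0, 64x32 }.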
*/ + for (uint32_t k = 0U; k < pass->clip_rects_count; k++) { + if (clip_rects[k].offset.y == mappings[j].dst_rect.offset.y && + clip_rects[k].extent.height == + mappings[j].dst_rect.extent.height && + clip_rects[k].offset.x + clip_rects[k].extent.width == + mappings[j].dst_rect.offset.x) { + clip_rects[k].extent.width += mappings[j].dst_rect.extent.width; + merged = true; + break; + } + + if (clip_rects[k].offset.y == mappings[j].dst_rect.offset.y && + clip_rects[k].extent.height == + mappings[j].dst_rect.extent.height && + clip_rects[k].offset.x == + mappings[j].dst_rect.offset.x + + mappings[j].dst_rect.extent.width) { + clip_rects[k].offset.x = mappings[j].dst_rect.offset.x; + clip_rects[k].extent.width += mappings[j].dst_rect.extent.width; + merged = true; + break; + } + + if (clip_rects[k].offset.x == mappings[j].dst_rect.offset.x && + clip_rects[k].extent.width == + mappings[j].dst_rect.extent.width && + clip_rects[k].offset.y + clip_rects[k].extent.height == + mappings[j].dst_rect.offset.y) { + clip_rects[k].extent.height += + mappings[j].dst_rect.extent.height; + merged = true; + break; + } + + if (clip_rects[k].offset.x == mappings[j].dst_rect.offset.x && + clip_rects[k].extent.width == + mappings[j].dst_rect.extent.width && + clip_rects[k].offset.y == + mappings[j].dst_rect.offset.y + + mappings[j].dst_rect.extent.height) { + clip_rects[k].extent.height += + mappings[j].dst_rect.extent.height; + clip_rects[k].offset.y = mappings[j].dst_rect.offset.y; + merged = true; + break; + } + } + + if (merged) + continue; + + /* Create new pass if needed, TDM can only have 2 clip rects. */ + if (pass->clip_rects_count >= custom_mapping->max_clip_rects) { + struct pvr_transfer_pass *new_pass = + pvr_create_pass(custom_mapping, + pass->dst_offset, + pass->src_offset, + pass->extend_height); + struct pvr_rect_mapping *new_mapping = pvr_create_mapping(new_pass); + + new_pass->clip_rects_count = 1U; + *new_mapping = pass->mappings[j]; + + pvr_remove_mapping(pass, j); + + /* Redo - mapping was replaced. */ + j--; + } else { + pass->clip_rects[pass->clip_rects_count] = + pass->mappings[j].dst_rect; + + pass->clip_rects_count++; + + assert(pass->clip_rects_count <= ARRAY_SIZE(pass->clip_rects)); + } + } + } +} + +static void +pvr_generate_custom_mapping(uint32_t src_stride, + uint32_t src_width, + uint32_t src_height, + uint32_t dst_stride, + uint32_t dst_width, + uint32_t dst_height, + enum pvr_memlayout dst_mem_layout, + struct pvr_transfer_custom_mapping *custom_mapping) +{ + src_stride *= custom_mapping->texel_extend_src; + src_width *= custom_mapping->texel_extend_src; + dst_stride *= custom_mapping->texel_extend_dst; + dst_width *= custom_mapping->texel_extend_dst; + + if (custom_mapping->texel_unwind_src > 0U) { + pvr_unwind_rects(src_stride, + src_height, + custom_mapping->texel_unwind_src, + true, + &custom_mapping->passes[0U]); + } + + if (custom_mapping->double_stride) { + custom_mapping->double_stride = + pvr_double_stride(&custom_mapping->passes[0U], dst_stride); + + dst_stride *= 2U; + } + + pvr_unwind_rects(dst_stride, + dst_height, + custom_mapping->texel_unwind_dst, + false, + &custom_mapping->passes[0U]); + + pvr_map_clip_rects(custom_mapping); + + /* If the last row of the source mapping is sampled, height of the surface + * can only be increased if the new area contains a valid region. Some blits + * are split to two sources. 
+ */ + if (custom_mapping->texel_unwind_src > 0U) { + pvr_finishme("Implement pvr_generate_custom_mapping()."); + } +} + +static bool +pvr_get_custom_mapping(const struct pvr_device_info *dev_info, + const struct pvr_transfer_cmd *transfer_cmd, + uint32_t max_clip_rects, + struct pvr_transfer_custom_mapping *custom_mapping) +{ + const uint32_t dst_bpp = + vk_format_get_blocksizebits(transfer_cmd->dst.vk_format); + struct pvr_transfer_pass *pass; + bool ret; + + custom_mapping->max_clip_rects = max_clip_rects; + custom_mapping->texel_unwind_src = 0U; + custom_mapping->texel_unwind_dst = 0U; + custom_mapping->texel_extend_src = 1U; + custom_mapping->texel_extend_dst = 1U; + custom_mapping->byte_unwind_src = 0U; + custom_mapping->pass_count = 0U; + + custom_mapping->max_clip_size = PVR_MAX_CLIP_SIZE(dev_info); + + ret = pvr_texel_unwind(dst_bpp, + transfer_cmd->dst.dev_addr, + false, + 1U, + &custom_mapping->texel_unwind_dst); + if (!ret) { + custom_mapping->texel_extend_dst = dst_bpp / 8U; + if (transfer_cmd->src_present) { + if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_LINEAR) { + custom_mapping->texel_extend_src = custom_mapping->texel_extend_dst; + } else if (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_TWIDDLED && + transfer_cmd->src.height == 1U) { + custom_mapping->texel_extend_src = custom_mapping->texel_extend_dst; + } + } + + ret = pvr_texel_unwind(dst_bpp, + transfer_cmd->dst.dev_addr, + false, + custom_mapping->texel_extend_dst, + &custom_mapping->texel_unwind_dst); + if (!ret) + return false; + } + + if (transfer_cmd->src_present) { + const uint32_t src_bpp = + vk_format_get_blocksizebits(transfer_cmd->src.vk_format); + + ret = pvr_is_surface_aligned(transfer_cmd->src.dev_addr, true, src_bpp); + + if (!ret && (transfer_cmd->src.mem_layout == PVR_MEMLAYOUT_LINEAR || + transfer_cmd->src.height == 1U)) { + ret = pvr_texel_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + custom_mapping->texel_extend_src, + &custom_mapping->texel_unwind_src); + } + + if (!ret && dst_bpp != 24U) { + ret = pvr_byte_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + &custom_mapping->byte_unwind_src); + } + + if (!ret) { + custom_mapping->texel_extend_src = dst_bpp / 8U; + custom_mapping->texel_extend_dst = custom_mapping->texel_extend_src; + + ret = pvr_texel_unwind(src_bpp, + transfer_cmd->src.dev_addr, + true, + custom_mapping->texel_extend_src, + &custom_mapping->texel_unwind_src); + } + + if (!ret) + return false; + } + + VkRect2D rect = transfer_cmd->scissor; + assert( + (rect.offset.x + rect.extent.width) <= custom_mapping->max_clip_size && + (rect.offset.y + rect.extent.height) <= custom_mapping->max_clip_size); + + /* Texel extend only works with strided memory layout, because pixel width is + * changed. Texel unwind only works with strided memory layout. 1D blits are + * allowed. + */ + if (transfer_cmd->src.height > 1U && transfer_cmd->src_present && + (custom_mapping->texel_extend_src > 1U || + custom_mapping->texel_unwind_src > 0U) && + transfer_cmd->src.mem_layout != PVR_MEMLAYOUT_LINEAR) { + return false; + } + + /* Texel extend only works with strided memory layout, because pixel width is + * changed. Texel unwind only works with strided memory layout. 1D blits are + * allowed. 
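 + * For example, a 24 bpp surface may be re-described as three 8 bpp texels
 + * per pixel (texel_extend = 3), which only addresses correctly when rows
 + * are reached through a stride.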
 + */ + if ((custom_mapping->texel_extend_dst > 1U || + custom_mapping->texel_unwind_dst > 0U) && + transfer_cmd->dst.mem_layout != PVR_MEMLAYOUT_LINEAR && + transfer_cmd->dst.height > 1U) { + return false; + } + + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_LINEAR) { + custom_mapping->double_stride = !pvr_is_pbe_stride_aligned( + transfer_cmd->dst.stride * custom_mapping->texel_extend_dst); + } + + if (custom_mapping->byte_unwind_src > 0U || + custom_mapping->texel_unwind_src > 0U || + custom_mapping->texel_unwind_dst > 0U || custom_mapping->double_stride) { + struct pvr_rect_mapping *mapping; + + pass = pvr_acquire_pass(custom_mapping, 0U, 0U, false); + mapping = pvr_create_mapping(pass); + + if (transfer_cmd->src_present) { + *mapping = transfer_cmd->mappings[0U]; + } else { + mapping->src_rect = transfer_cmd->scissor; + mapping->dst_rect = transfer_cmd->scissor; + } + } else { + return false; + } + + if (custom_mapping->texel_extend_src > 1U || + custom_mapping->texel_extend_dst > 1U) { + pass->mappings[0U].src_rect.offset.x *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].src_rect.extent.width *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].dst_rect.offset.x *= + (int32_t)custom_mapping->texel_extend_dst; + pass->mappings[0U].dst_rect.extent.width *= + (int32_t)custom_mapping->texel_extend_dst; + } + + if (transfer_cmd->src_present) { + pvr_generate_custom_mapping(transfer_cmd->src.stride, + transfer_cmd->src.width, + transfer_cmd->src.height, + transfer_cmd->dst.stride, + transfer_cmd->dst.width, + transfer_cmd->dst.height, + transfer_cmd->dst.mem_layout, + custom_mapping); + } else { + pvr_generate_custom_mapping(0U, + 0U, + 0U, + transfer_cmd->dst.stride, + transfer_cmd->dst.width, + transfer_cmd->dst.height, + transfer_cmd->dst.mem_layout, + custom_mapping); + } + + return true; +} + +static void +pvr_modify_command(struct pvr_transfer_custom_mapping *custom_mapping, + uint32_t pass_idx, + struct pvr_transfer_cmd *transfer_cmd) +{ + struct pvr_transfer_pass *pass = &custom_mapping->passes[pass_idx]; + uint32_t bpp; + + if (custom_mapping->texel_extend_src > 1U) { + pvr_finishme("Complete pvr_modify_command()."); + } else if (custom_mapping->texel_extend_dst > 1U) { + pvr_finishme("Complete pvr_modify_command()."); + } + + if (custom_mapping->double_stride) { + transfer_cmd->dst.width *= 2U; + transfer_cmd->dst.stride *= 2U; + } + + if (custom_mapping->texel_unwind_src > 0U) { + if (transfer_cmd->src.height == 1U) { + transfer_cmd->src.width += custom_mapping->texel_unwind_src; + transfer_cmd->src.stride += custom_mapping->texel_unwind_src; + } else if (transfer_cmd->src.stride == 1U) { + transfer_cmd->src.height += custom_mapping->texel_unwind_src; + } else { + /* Increase source width by texel unwind. If texel unwind is less than + * the distance between width and stride, the blit can be done with one + * rectangle mapping, but the width of the surface needs to be + * increased in case we sample from the area between width and stride.
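 + * For example, with width 30, stride 32 and texel_unwind 2, the width
 + * below becomes MIN2(30 + 2, 32) = 32, still within the same row.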
 + */ + transfer_cmd->src.width = + MIN2(transfer_cmd->src.width + custom_mapping->texel_unwind_src, + transfer_cmd->src.stride); + } + } + + transfer_cmd->mapping_count = pass->mapping_count; + for (uint32_t i = 0U; i < transfer_cmd->mapping_count; i++) + transfer_cmd->mappings[i] = pass->mappings[i]; + + if (pass->extend_height) + transfer_cmd->src.height += 1U; + + transfer_cmd->src.width = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.width); + transfer_cmd->src.height = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.height); + transfer_cmd->src.stride = MIN2(PVR_MAX_WIDTH, transfer_cmd->src.stride); + + if (transfer_cmd->dst.height == 1U) { + transfer_cmd->dst.width = + transfer_cmd->dst.stride + custom_mapping->texel_unwind_dst; + transfer_cmd->dst.mem_layout = PVR_MEMLAYOUT_TWIDDLED; + } + + if (transfer_cmd->dst.mem_layout == PVR_MEMLAYOUT_TWIDDLED) { + transfer_cmd->dst.width = + MIN2((uint32_t)custom_mapping->max_clip_size, transfer_cmd->dst.width); + transfer_cmd->dst.height = MIN2((uint32_t)custom_mapping->max_clip_size, + transfer_cmd->dst.height); + } else { + transfer_cmd->dst.width = MIN2(PVR_MAX_WIDTH, transfer_cmd->dst.width); + } + + if (transfer_cmd->src_present) { + bpp = vk_format_get_blocksizebits(transfer_cmd->src.vk_format); + + transfer_cmd->src.dev_addr.addr -= + custom_mapping->texel_unwind_src * bpp / 8U; + transfer_cmd->src.dev_addr.addr += + MAX2(transfer_cmd->src.sample_count, 1U) * pass->src_offset * bpp / 8U; + } + + bpp = vk_format_get_blocksizebits(transfer_cmd->dst.vk_format); + transfer_cmd->dst.dev_addr.addr -= + custom_mapping->texel_unwind_dst * bpp / 8U; + transfer_cmd->dst.dev_addr.addr += + MAX2(transfer_cmd->dst.sample_count, 1U) * pass->dst_offset * bpp / 8U; + + transfer_cmd->src_present = + transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL ? false : true; +} + +static VkResult pvr_reroute_to_clip(struct pvr_transfer_ctx *ctx, + const struct pvr_transfer_cmd *transfer_cmd, + const struct VkRect2D *dst_rect, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + pvr_finishme("Unimplemented path."); + return VK_SUCCESS; +} + +static VkResult pvr_3d_copy_blit(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct pvr_transfer_prep_data *prep_data, + uint32_t pass_idx, + bool *finished_out) +{ + const struct pvr_device_info *const dev_info = + &ctx->device->pdevice->dev_info; + + const struct pvr_transfer_blit *blit = &transfer_cmd->blit; + struct pvr_transfer_3d_state *state = &prep_data->state; + struct pvr_transfer_cmd *active_cmd = transfer_cmd; + struct pvr_transfer_cmd int_cmd; + VkResult result; + + state->dont_force_pbe = false; + state->pass_idx = pass_idx; + + pvr_transfer_set_filter(transfer_cmd, state); + + if (transfer_cmd->src_present) { + /* Try to work out a condition to map pixel formats to RAW. That is only + * possible if we don't perform any kind of 2D operation on the blit, since + * we don't know the actual pixel values: it has to be point sampled + * (scaling doesn't matter as long as the blit stays point sampled).
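 + * For example, a point-sampled copy between two 32 bpp surfaces of the
 + * same format can be performed as an R32_UINT -> R32_UINT raw copy of
 + * the texels.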
+ */ + if (transfer_cmd->src.vk_format == transfer_cmd->dst.vk_format && + state->filter == PVR_FILTER_POINT && + transfer_cmd->src.sample_count <= transfer_cmd->dst.sample_count && + (transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) == 0U && + transfer_cmd->blit.alpha.type == PVR_ALPHA_NONE) { + uint32_t bpp; + + int_cmd = *transfer_cmd; + active_cmd = &int_cmd; + bpp = vk_format_get_blocksizebits(int_cmd.dst.vk_format); + + if (bpp > 0U) { + switch (bpp) { + case 8U: + int_cmd.src.vk_format = VK_FORMAT_R8_UINT; + break; + case 16U: + int_cmd.src.vk_format = VK_FORMAT_R8G8_UINT; + break; + case 24U: + int_cmd.src.vk_format = VK_FORMAT_R8G8B8_UINT; + break; + case 32U: + int_cmd.src.vk_format = VK_FORMAT_R32_UINT; + break; + case 48U: + int_cmd.src.vk_format = VK_FORMAT_R16G16B16_UINT; + break; + case 64U: + int_cmd.src.vk_format = VK_FORMAT_R32G32_UINT; + break; + case 96U: + int_cmd.src.vk_format = VK_FORMAT_R32G32B32_UINT; + break; + case 128U: + int_cmd.src.vk_format = VK_FORMAT_R32G32B32A32_UINT; + break; + default: + active_cmd = transfer_cmd; + break; + } + } + + int_cmd.dst.vk_format = int_cmd.src.vk_format; + } + } + + if (pass_idx == 0U) { + pvr_get_custom_mapping(dev_info, active_cmd, 3U, &state->custom_mapping); + + if (state->custom_mapping.texel_extend_src > 1U) + state->custom_mapping.texel_extend_dst = 1U; + } + + if (state->custom_mapping.pass_count > 0U) { + struct pvr_transfer_pass *pass = &state->custom_mapping.passes[pass_idx]; + + if (blit->alpha.type != PVR_ALPHA_NONE) + return vk_error(ctx->device, VK_ERROR_FORMAT_NOT_SUPPORTED); + + if (active_cmd != &int_cmd) { + int_cmd = *active_cmd; + active_cmd = &int_cmd; + } + + state->custom_filter = true; + + pvr_modify_command(&state->custom_mapping, pass_idx, active_cmd); + + if (state->custom_mapping.double_stride || pass->mapping_count > 1U) { + result = + pvr_3d_clip_blit(ctx, active_cmd, prep_data, pass_idx, finished_out); + } else { + struct pvr_rect_mapping *mappings = &pass->mappings[0U]; + + mappings[0U].src_rect.offset.x /= + MAX2(1U, state->custom_mapping.texel_extend_dst); + mappings[0U].src_rect.extent.width /= + MAX2(1U, state->custom_mapping.texel_extend_dst); + + if (int_cmd.src_present) { + for (uint32_t i = 0U; i < pass->mapping_count; i++) + active_cmd->mappings[i] = mappings[i]; + } + + active_cmd->scissor = mappings[0U].dst_rect; + + result = pvr_3d_copy_blit_core(ctx, + active_cmd, + prep_data, + pass_idx, + finished_out); + } + + return result; + } + + /* Route DS merge blits to Clip blit. Background object is used to preserve + * the unmerged channel. + */ + if ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_DSMERGE) != 0U) { + /* PBE byte mask could be used for DS merge with FastScale. Clearing the + * other channel on a DS merge requires Clip blit. + */ + if (!PVR_HAS_ERN(dev_info, 42064) || + ((transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FILL) != 0U)) { + return pvr_reroute_to_clip(ctx, + active_cmd, + &active_cmd->scissor, + prep_data, + pass_idx, + finished_out); + } + } + + return pvr_3d_copy_blit_core(ctx, + active_cmd, + prep_data, + pass_idx, + finished_out); +} + +static bool pvr_3d_validate_addr(struct pvr_transfer_cmd *transfer_cmd) +{ + /* TODO: Complete this function, based on TQ_3DValidateVaddr. 
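 + * For now every address is accepted; we only emit a finishme and return
 + * true.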
*/ + pvr_finishme("Complete pvr_3d_validate_addr()."); + return true; +} + +static void +pvr_submit_info_stream_init(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_prep_data *prep_data, + struct pvr_winsys_transfer_cmd *cmd) +{ + const struct pvr_winsys_transfer_regs *const regs = &prep_data->state.regs; + const struct pvr_physical_device *const pdevice = ctx->device->pdevice; + const struct pvr_device_info *const dev_info = &pdevice->dev_info; + + uint32_t *stream_ptr = (uint32_t *)cmd->fw_stream; + + *(uint64_t *)stream_ptr = regs->pds_bgnd0_base; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND0_BASE); + + *(uint64_t *)stream_ptr = regs->pds_bgnd1_base; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND1_BASE); + + *(uint64_t *)stream_ptr = regs->pds_bgnd3_sizeinfo; + stream_ptr += pvr_cmd_length(CR_PDS_BGRND3_SIZEINFO); + + *(uint64_t *)stream_ptr = regs->isp_mtile_base; + stream_ptr += pvr_cmd_length(CR_ISP_MTILE_BASE); + + STATIC_ASSERT(ARRAY_SIZE(regs->pbe_wordx_mrty) == 9U); + STATIC_ASSERT(sizeof(regs->pbe_wordx_mrty[0]) == sizeof(uint64_t)); + memcpy(stream_ptr, regs->pbe_wordx_mrty, sizeof(regs->pbe_wordx_mrty)); + stream_ptr += 9U * 2U; + + *stream_ptr = regs->isp_bgobjvals; + stream_ptr += pvr_cmd_length(CR_ISP_BGOBJVALS); + + *stream_ptr = regs->usc_pixel_output_ctrl; + stream_ptr += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL); + + *stream_ptr = regs->usc_clear_register0; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER0); + + *stream_ptr = regs->usc_clear_register1; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER1); + + *stream_ptr = regs->usc_clear_register2; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER2); + + *stream_ptr = regs->usc_clear_register3; + stream_ptr += pvr_cmd_length(CR_USC_CLEAR_REGISTER3); + + *stream_ptr = regs->isp_mtile_size; + stream_ptr += pvr_cmd_length(CR_ISP_MTILE_SIZE); + + *stream_ptr = regs->isp_render_origin; + stream_ptr += pvr_cmd_length(CR_ISP_RENDER_ORIGIN); + + *stream_ptr = regs->isp_ctl; + stream_ptr += pvr_cmd_length(CR_ISP_CTL); + + *stream_ptr = regs->isp_aa; + stream_ptr += pvr_cmd_length(CR_ISP_AA); + + *stream_ptr = regs->event_pixel_pds_info; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO); + + *stream_ptr = regs->event_pixel_pds_code; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_CODE); + + *stream_ptr = regs->event_pixel_pds_data; + stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA); + + *stream_ptr = regs->isp_render; + stream_ptr += pvr_cmd_length(CR_ISP_RENDER); + + *stream_ptr = regs->isp_rgn; + stream_ptr++; + + if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) { + *stream_ptr = regs->frag_screen; + stream_ptr++; + } + + cmd->fw_stream_len = (uint8_t *)stream_ptr - cmd->fw_stream; + assert(cmd->fw_stream_len <= ARRAY_SIZE(cmd->fw_stream)); +} + +static void pvr_transfer_job_ws_submit_info_init( + struct pvr_transfer_ctx *ctx, + struct pvr_transfer_submit *submit, + struct vk_sync *wait, + struct pvr_winsys_transfer_submit_info *submit_info) +{ + const struct pvr_device *const device = ctx->device; + + submit_info->frame_num = device->global_queue_present_count; + submit_info->job_num = device->global_cmd_buffer_submit_count; + submit_info->wait = wait; + submit_info->cmd_count = submit->prep_count; + + for (uint32_t i = 0U; i < submit->prep_count; i++) { + struct pvr_winsys_transfer_cmd *const cmd = &submit_info->cmds[i]; + struct pvr_transfer_prep_data *prep_data = &submit->prep_array[i]; + + pvr_submit_info_stream_init(ctx, prep_data, cmd); + + cmd->flags = prep_data->flags; + } +} + +static 
VkResult pvr_submit_transfer(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_submit *submit, + struct vk_sync *wait, + struct vk_sync *signal_sync) +{ + struct pvr_winsys_transfer_submit_info submit_info; + + pvr_transfer_job_ws_submit_info_init(ctx, submit, wait, &submit_info); + + return ctx->device->ws->ops->transfer_submit(ctx->ws_ctx, + &submit_info, + &ctx->device->pdevice->dev_info, + signal_sync); +} + +static VkResult pvr_queue_transfer(struct pvr_transfer_ctx *ctx, + struct pvr_transfer_cmd *transfer_cmd, + struct vk_sync *wait, + struct vk_sync *signal_sync) +{ + struct pvr_transfer_prep_data *prep_data = NULL; + struct pvr_transfer_prep_data *prev_prep_data; + struct pvr_transfer_submit submit = { 0U }; + bool finished = false; + uint32_t pass = 0U; + VkResult result; + + /* The transfer queue might decide to do a blit in multiple passes. When the + * prepare doesn't set the finished flag, this code will keep calling the + * prepare with an increasing pass index. If queued transfers are submitted + * from here, we submit them straight away. That's why we only need a single + * prepare for the blit rather than one for each pass. Otherwise we insert + * each prepare into the prepare array. When the client does blit batching + * and we split the blit into multiple passes, each pass in each queued + * transfer adds one more prepare. Thus the prepare array after 2 + * pvr_queue_transfer calls might look like: + * + * +------+------++-------+-------+-------+ + * |B0/P0 |B0/P1 || B1/P0 | B1/P1 | B1/P2 | + * +------+------++-------+-------+-------+ + * F S/U F S/U + * + * Bn/Pm : nth blit (queue transfer call) / mth prepare + * F : fence point + * S/U : update / server sync update point + */ + + while (!finished) { + prev_prep_data = prep_data; + prep_data = &submit.prep_array[submit.prep_count++]; + + /* Clear down the memory before we write to this prep. */ + memset(prep_data, 0U, sizeof(*prep_data)); + + if (pass == 0U) { + if (!pvr_3d_validate_addr(transfer_cmd)) + return vk_error(ctx->device, VK_ERROR_FEATURE_NOT_PRESENT); + } else { + /* Transfer queue workarounds could use more than one pass with the 3D + * path. + */ + prep_data->state = prev_prep_data->state; + } + + if (transfer_cmd->flags & PVR_TRANSFER_CMD_FLAGS_FAST2D) { + result = + pvr_3d_clip_blit(ctx, transfer_cmd, prep_data, pass, &finished); + } else { + result = + pvr_3d_copy_blit(ctx, transfer_cmd, prep_data, pass, &finished); + } + if (result != VK_SUCCESS) + return result; + + /* Submit if we have finished the blit or if we are out of prepares. */ + if (finished || submit.prep_count == ARRAY_SIZE(submit.prep_array)) { + result = pvr_submit_transfer(ctx, + &submit, + wait, + finished ? signal_sync : NULL); + if (result != VK_SUCCESS) + return result; + + /* Check if we need to reset prep_count. */ + if (submit.prep_count == ARRAY_SIZE(submit.prep_array)) + submit.prep_count = 0U; + } + + pass++; + } + + return VK_SUCCESS; +} + +VkResult pvr_transfer_job_submit(struct pvr_transfer_ctx *ctx, + struct pvr_sub_cmd_transfer *sub_cmd, + struct vk_sync *wait_sync, + struct vk_sync *signal_sync) +{ + list_for_each_entry_safe (struct pvr_transfer_cmd, + transfer_cmd, + &sub_cmd->transfer_cmds, + link) { + /* The fw guarantees that any kick on the same context will be + * synchronized in submission order. This means only the first kick must + * wait, and only the last kick needs to signal.
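 + * Intermediate kicks in the list therefore pass NULL for both the wait
 + * and the signal sync.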
 + */ + struct vk_sync *first_cmd_wait_sync = NULL; + struct vk_sync *last_cmd_signal_sync = NULL; + VkResult result; + + if (list_first_entry(&sub_cmd->transfer_cmds, + struct pvr_transfer_cmd, + link) == transfer_cmd) { + first_cmd_wait_sync = wait_sync; + } + + if (list_last_entry(&sub_cmd->transfer_cmds, + struct pvr_transfer_cmd, + link) == transfer_cmd) { + last_cmd_signal_sync = signal_sync; + } + + result = pvr_queue_transfer(ctx, + transfer_cmd, + first_cmd_wait_sync, + last_cmd_signal_sync); + if (result != VK_SUCCESS) + return result; + } + + return VK_SUCCESS; } diff --git a/src/imagination/vulkan/pvr_job_transfer.h b/src/imagination/vulkan/pvr_job_transfer.h index 0084969..8c3ea4c 100644 --- a/src/imagination/vulkan/pvr_job_transfer.h +++ b/src/imagination/vulkan/pvr_job_transfer.h @@ -27,13 +27,23 @@ #include #include -struct pvr_device; struct pvr_sub_cmd_transfer; struct pvr_transfer_ctx; struct vk_sync; -VkResult pvr_transfer_job_submit(struct pvr_device *device, - struct pvr_transfer_ctx *ctx, +/** + * Destination pixels not covered by any of the destination rectangles but + * inside the scissor are filled with the clear color. + */ +#define PVR_TRANSFER_CMD_FLAGS_FILL 0x00000800U +/** If using TQ3D, route to fast2d. */ +#define PVR_TRANSFER_CMD_FLAGS_FAST2D 0x00200000U +/** Merge a depth or stencil against a depth + stencil texture. */ +#define PVR_TRANSFER_CMD_FLAGS_DSMERGE 0x00000200U +/** Valid if doing a DS merge with depth + stencil to depth + stencil. */ +#define PVR_TRANSFER_CMD_FLAGS_PICKD 0x00000400U + +VkResult pvr_transfer_job_submit(struct pvr_transfer_ctx *ctx, struct pvr_sub_cmd_transfer *sub_cmd, struct vk_sync *wait, struct vk_sync *signal_sync); diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h index c0aa911..2a40d53 100644 --- a/src/imagination/vulkan/pvr_private.h +++ b/src/imagination/vulkan/pvr_private.h @@ -58,6 +58,7 @@ #include "util/macros.h" #include "util/simple_mtx.h" #include "util/u_dynarray.h" +#include "util/u_math.h" #include "vk_buffer.h" #include "vk_command_buffer.h" #include "vk_device.h" @@ -353,16 +354,118 @@ struct pvr_buffer_view { uint64_t texture_state[2]; }; +#define PVR_TRANSFER_MAX_CUSTOM_MAPPINGS 6U + +/** A surface describes a source or destination for a transfer operation. */ +struct pvr_transfer_cmd_surface { + pvr_dev_addr_t dev_addr; + + /* Memory address for extra U/V planes. */ + pvr_dev_addr_t uv_address[2]; + + /* Surface width in texels. */ + uint32_t width; + + /* Surface height in texels. */ + uint32_t height; + + uint32_t depth; + + /* Z position in a 3D texture. 0.0f <= z_position <= depth. */ + float z_position; + + /* Stride in texels. */ + uint32_t stride; + + VkFormat vk_format; + + enum pvr_memlayout mem_layout; + + uint32_t sample_count; +}; + +struct pvr_rect_mapping { + VkRect2D src_rect; + VkRect2D dst_rect; +}; + +/* Describes an alpha-transparency configuration for transfer queue use. */ +struct pvr_transfer_alpha { + enum pvr_alpha_type type; + /* Global alpha value. */ + uint32_t global; + + /* Custom blend op for rgb. */ + uint32_t custom_rgb; + /* Custom blend op for alpha. */ + uint32_t custom_alpha; + /* Custom global alpha value for alpha output. */ + uint32_t global2; + /* Custom multiplication of global and source alpha. */ + bool glob_src_mul; + /* Custom zero source alpha transparency stage. */ + bool zero_src_a_trans; + + /* Enable argb1555 alpha components. */ + bool alpha_components; + /* Source alpha value when argb1555 alpha bit is 0.
 */ + uint32_t component0; + /* Source alpha value when argb1555 alpha bit is 1. */ + uint32_t component1; +}; + +struct pvr_transfer_blit { + /* 16-bit rop4 (i.e. two 8-bit rop3s). */ + uint32_t rop_code; + + /* Color key mask. */ + uint32_t color_mask; + + /* Alpha blend. */ + struct pvr_transfer_alpha alpha; + + VkOffset2D offset; +}; + struct pvr_transfer_cmd { /* Node to link this cmd into the transfer_cmds list in * pvr_sub_cmd::transfer structure. */ struct list_head link; - struct pvr_buffer *src; - struct pvr_buffer *dst; - uint32_t region_count; - VkBufferCopy2 regions[0]; + uint32_t flags; + + struct pvr_transfer_cmd_surface src; + bool src_present; + + union fi clear_color[4]; + + struct pvr_transfer_cmd_surface dst; + + VkRect2D scissor; + + uint32_t mapping_count; + struct pvr_rect_mapping mappings[PVR_TRANSFER_MAX_CUSTOM_MAPPINGS]; + + /* In the case of a simple 1:1 copy, this setting does not affect the output + * but will affect performance. Use clamp to edge when possible. + */ + /* This is of type enum PVRX(TEXSTATE_ADDRMODE). */ + int addr_mode; + + /* Source filtering method. */ + enum pvr_filter filter; + + /* MSAA resolve operation. */ + enum pvr_resolve_op resolve_op; + + struct pvr_transfer_blit blit; + + /* Pointer to cmd buffer this transfer cmd belongs to. This is mainly used + * to link buffer objects allocated during job submission into + * cmd_buffer::bo_list head. + */ + struct pvr_cmd_buffer *cmd_buffer; }; struct pvr_sub_cmd_gfx { diff --git a/src/imagination/vulkan/pvr_queue.c b/src/imagination/vulkan/pvr_queue.c index 4b8ebc3..e029ba4 100644 --- a/src/imagination/vulkan/pvr_queue.c +++ b/src/imagination/vulkan/pvr_queue.c @@ -361,8 +361,7 @@ static VkResult pvr_process_transfer_cmds(struct pvr_device *device, return result; result = - pvr_transfer_job_submit(device, - queue->transfer_ctx, + pvr_transfer_job_submit(queue->transfer_ctx, sub_cmd, queue->next_job_wait_sync[PVR_JOB_TYPE_TRANSFER], sync); diff --git a/src/imagination/vulkan/pvr_tex_state.c b/src/imagination/vulkan/pvr_tex_state.c index 4ef771a..4de0e99 100644 --- a/src/imagination/vulkan/pvr_tex_state.c +++ b/src/imagination/vulkan/pvr_tex_state.c @@ -218,7 +218,8 @@ pvr_pack_tex_state(struct pvr_device *device, if (iview_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) array_layers /= 6; - word1.depth = array_layers - 1; + if (array_layers > 0) + word1.depth = array_layers - 1; } word1.texaddr = PVR_DEV_ADDR_OFFSET(info->addr, info->offset); diff --git a/src/imagination/vulkan/pvr_transfer_frag_store.c b/src/imagination/vulkan/pvr_transfer_frag_store.c new file mode 100644 index 0000000..d1354d4 --- /dev/null +++ b/src/imagination/vulkan/pvr_transfer_frag_store.c @@ -0,0 +1,392 @@ +/* + * Copyright © 2023 Imagination Technologies Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software.
 + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "hwdef/rogue_hw_utils.h" +#include "pvr_bo.h" +#include "pvr_common.h" +#include "pvr_device_info.h" +#include "pvr_job_transfer.h" +#include "pvr_pds.h" +#include "pvr_private.h" +#include "pvr_transfer_frag_store.h" +#include "pvr_types.h" +#include "pvr_uscgen.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "vk_log.h" + +#define PVR_TRANSFER_BYTE_UNWIND_MAX 16U + +struct pvr_transfer_frag_store_entry_data { + pvr_dev_addr_t kick_usc_pds_offset; + struct pvr_bo *kick_usc_pds_upload; + + struct pvr_bo *usc_upload; + struct pvr_tq_frag_sh_reg_layout sh_reg_layout; +}; + +#define to_pvr_entry_data(_entry) \ + _Generic((_entry), \ + struct hash_entry *: (struct pvr_transfer_frag_store_entry_data *)((_entry)->data), \ + const struct hash_entry *: (const struct pvr_transfer_frag_store_entry_data *)((_entry)->data)) + +VkResult pvr_transfer_frag_store_init(struct pvr_device *device, + struct pvr_transfer_frag_store *store) +{ + const struct pvr_device_info *dev_info = &device->pdevice->dev_info; + + *store = (struct pvr_transfer_frag_store){ + .max_multisample = PVR_GET_FEATURE_VALUE(dev_info, max_multisample, 1U), + .hash_table = _mesa_hash_table_create_u32_keys(NULL), + }; + + if (!store->hash_table) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + return VK_SUCCESS; +} + +/** + * \brief Returns a key based on shader properties. + * + * Returns a key that uniquely identifies a transfer fragment shader based on + * the provided shader properties. + * + * Make sure that the unused parts of shader_props are memset to 0; otherwise + * those bits might carry uninitialized data into the key and fail to match a + * later key for the same shader. + */ +static uint32_t pvr_transfer_frag_shader_key( + uint32_t max_multisample, + const struct pvr_tq_shader_properties *shader_props) +{ + const struct pvr_tq_layer_properties *layer = &shader_props->layer_props; + uint32_t resolve_op_num = max_multisample + PVR_RESOLVE_SAMPLE0; + + uint32_t num_layers_bits = util_logbase2_ceil(PVR_TRANSFER_MAX_LAYERS + 1U); + uint32_t layer_float_bits = util_logbase2_ceil(PVR_INT_COORD_SET_FLOATS_NUM); + uint32_t pixel_src_bits = util_logbase2_ceil(PVR_TRANSFER_PBE_PIXEL_SRC_NUM); + uint32_t byte_unwind_bits = util_logbase2_ceil(PVR_TRANSFER_BYTE_UNWIND_MAX); + uint32_t resolve_op_bits = util_logbase2_ceil(resolve_op_num); + uint32_t sample_cnt_bits = util_last_bit(util_logbase2(max_multisample)); + uint32_t hash = 0U; + +#if defined(DEBUG) + uint32_t max_shift = 0U; +# define shift_hash(hash, num) \ + do { \ + max_shift += (num); \ + assert(max_shift <= 32U); \ + \ + (hash) <<= (num); \ + } while (0U) +#else +# define shift_hash(hash, num) hash <<= (num) +#endif + + /* Hash layer info.
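 + * Each field below is packed by shifting the accumulated key left and
 + * OR-ing the field in, so earlier fields end up in the more significant
 + * bits.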
+    */
+
+   shift_hash(hash, layer_float_bits);
+   hash |= (uint32_t)shader_props->layer_props.layer_floats;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->sample;
+
+   /* Two reserved key bits, currently always zero. */
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)false;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)false;
+
+   shift_hash(hash, pixel_src_bits);
+   hash |= (uint32_t)layer->pbe_format;
+
+   shift_hash(hash, resolve_op_bits);
+   hash |= (uint32_t)layer->resolve_op;
+
+   assert(util_is_power_of_two_nonzero(layer->sample_count));
+   shift_hash(hash, sample_cnt_bits);
+   hash |= (uint32_t)util_logbase2(layer->sample_count);
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->msaa;
+
+   shift_hash(hash, byte_unwind_bits);
+   hash |= layer->byte_unwind;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)layer->linear;
+
+   /* End layer info. */
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->full_rate;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->iterated;
+
+   shift_hash(hash, 1U);
+   hash |= (uint32_t)shader_props->pick_component;
+
+   shift_hash(hash, num_layers_bits);
+   /* Just 1 layer. */
+   hash |= 1;
+
+   shift_hash(hash, 3U);
+   hash |= shader_props->alpha_type;
+
+#undef shift_hash
+
+   return hash;
+}
+
+#define to_hash_table_key(_key) ((void *)(uintptr_t)(_key))
+
+static VkResult pvr_transfer_frag_store_entry_data_compile(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store_entry_data *const entry_data,
+   const struct pvr_tq_shader_properties *shader_props,
+   uint32_t *const num_usc_temps_out)
+{
+   const uint32_t image_desc_offset =
+      offsetof(struct pvr_combined_image_sampler_descriptor, image) / 4;
+   const uint32_t sampler_desc_offset =
+      offsetof(struct pvr_combined_image_sampler_descriptor, sampler) / 4;
+
+   const uint32_t cache_line_size =
+      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
+
+   struct pvr_tq_frag_sh_reg_layout *sh_reg_layout = &entry_data->sh_reg_layout;
+   uint32_t next_free_sh_reg = 0;
+   struct util_dynarray shader;
+   VkResult result;
+
+   /* TODO: Allocate all combined image samplers if needed? Otherwise change the
+    * array to a single descriptor.
+    */
+   sh_reg_layout->combined_image_samplers.offsets[0].image =
+      next_free_sh_reg + image_desc_offset;
+   sh_reg_layout->combined_image_samplers.offsets[0].sampler =
+      next_free_sh_reg + sampler_desc_offset;
+   sh_reg_layout->combined_image_samplers.count = 1;
+   next_free_sh_reg += sizeof(struct pvr_combined_image_sampler_descriptor) / 4;
+
+   /* TODO: Handle dynamic_const_regs used for PVR_INT_COORD_SET_FLOATS_{4,6}, Z
+    * position, texel unwind, etc. when the compiler adds support for them.
+    */
+   sh_reg_layout->dynamic_consts.offset = next_free_sh_reg;
+   sh_reg_layout->dynamic_consts.count = 0;
+
+   sh_reg_layout->driver_total = next_free_sh_reg;
+
+   pvr_uscgen_tq_frag(shader_props,
+                      &entry_data->sh_reg_layout,
+                      num_usc_temps_out,
+                      &shader);
+
+   result = pvr_gpu_upload_usc(device,
+                               util_dynarray_begin(&shader),
+                               util_dynarray_num_elements(&shader, uint8_t),
+                               cache_line_size,
+                               &entry_data->usc_upload);
+   util_dynarray_fini(&shader);
+   if (result != VK_SUCCESS)
+      return result;
+
+   return VK_SUCCESS;
+}
+
+static VkResult pvr_transfer_frag_store_entry_data_create(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   const struct pvr_transfer_frag_store_entry_data **const entry_data_out)
+{
+   struct pvr_pds_kickusc_program kick_usc_pds_prog = { 0 };
+   struct pvr_transfer_frag_store_entry_data *entry_data;
+   pvr_dev_addr_t dev_addr;
+   uint32_t num_usc_temps;
+   VkResult result;
+
+   entry_data = ralloc(store->hash_table, __typeof__(*entry_data));
+   if (!entry_data)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = pvr_transfer_frag_store_entry_data_compile(device,
+                                                       entry_data,
+                                                       shader_props,
+                                                       &num_usc_temps);
+   if (result != VK_SUCCESS)
+      goto err_free_entry;
+
+   dev_addr = entry_data->usc_upload->vma->dev_addr;
+   dev_addr.addr -= device->heaps.usc_heap->base_addr.addr;
+
+   pvr_pds_setup_doutu(&kick_usc_pds_prog.usc_task_control,
+                       dev_addr.addr,
+                       num_usc_temps,
+                       shader_props->full_rate
+                          ? PVRX(PDSINST_DOUTU_SAMPLE_RATE_FULL)
+                          : PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                       false);
+
+   pvr_pds_kick_usc(&kick_usc_pds_prog, NULL, 0U, false, PDS_GENERATE_SIZES);
+
+   result = pvr_bo_alloc(
+      device,
+      device->heaps.pds_heap,
+      (kick_usc_pds_prog.data_size + kick_usc_pds_prog.code_size) * 4,
+      16,
+      PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+      &entry_data->kick_usc_pds_upload);
+   if (result != VK_SUCCESS)
+      goto err_free_usc_upload;
+
+   pvr_pds_kick_usc(&kick_usc_pds_prog,
+                    entry_data->kick_usc_pds_upload->bo->map,
+                    0U,
+                    false,
+                    PDS_GENERATE_CODEDATA_SEGMENTS);
+
+   dev_addr = entry_data->kick_usc_pds_upload->vma->dev_addr;
+   dev_addr.addr -= device->heaps.pds_heap->base_addr.addr;
+   entry_data->kick_usc_pds_offset = dev_addr;
+
+   *entry_data_out = entry_data;
+
+   return VK_SUCCESS;
+
+err_free_usc_upload:
+   pvr_bo_free(device, entry_data->usc_upload);
+
+err_free_entry:
+   ralloc_free(entry_data);
+
+   return result;
+}
+
+static inline void pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(
+   struct pvr_device *device,
+   const struct pvr_transfer_frag_store_entry_data *entry_data)
+{
+   pvr_bo_free(device, entry_data->kick_usc_pds_upload);
+   pvr_bo_free(device, entry_data->usc_upload);
+}
+
+static inline void pvr_transfer_frag_store_entry_data_destroy(
+   struct pvr_device *device,
+   const struct pvr_transfer_frag_store_entry_data *entry_data)
+{
+   pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(device,
+                                                             entry_data);
+   /* Casting away the const :( */
+   ralloc_free((void *)entry_data);
+}
+
+static VkResult pvr_transfer_frag_store_get_entry(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   const struct pvr_transfer_frag_store_entry_data **const entry_data_out)
+{
+   const uint32_t key =
+      pvr_transfer_frag_shader_key(store->max_multisample, shader_props);
+   const struct hash_entry *entry;
+   VkResult result;
+
+   entry = _mesa_hash_table_search(store->hash_table, to_hash_table_key(key));
+   if (!entry) {
+      /* Init so that gcc stops complaining. */
+      const struct pvr_transfer_frag_store_entry_data *entry_data = NULL;
+
+      result = pvr_transfer_frag_store_entry_data_create(device,
+                                                         store,
+                                                         shader_props,
+                                                         &entry_data);
+      if (result != VK_SUCCESS)
+         return result;
+
+      assert(entry_data);
+
+      entry = _mesa_hash_table_insert(store->hash_table,
+                                      to_hash_table_key(key),
+                                      (void *)entry_data);
+      if (!entry) {
+         pvr_transfer_frag_store_entry_data_destroy(device, entry_data);
+         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+   }
+
+   *entry_data_out = to_pvr_entry_data(entry);
+
+   return VK_SUCCESS;
+}
+
+VkResult pvr_transfer_frag_store_get_shader_info(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   pvr_dev_addr_t *const pds_dev_addr_out,
+   const struct pvr_tq_frag_sh_reg_layout **const reg_layout_out)
+{
+   /* Init so that gcc stops complaining. */
+   const struct pvr_transfer_frag_store_entry_data *entry_data = NULL;
+   VkResult result;
+
+   result = pvr_transfer_frag_store_get_entry(device,
+                                              store,
+                                              shader_props,
+                                              &entry_data);
+   if (result != VK_SUCCESS)
+      return result;
+
+   *pds_dev_addr_out = entry_data->kick_usc_pds_offset;
+   *reg_layout_out = &entry_data->sh_reg_layout;
+
+   return VK_SUCCESS;
+}
+
+void pvr_transfer_frag_store_fini(struct pvr_device *device,
+                                  struct pvr_transfer_frag_store *store)
+{
+   hash_table_foreach_remove(store->hash_table, entry)
+   {
+      /* ralloc_free() in _mesa_hash_table_destroy() will free each entry's
+       * memory, so let's not waste extra time freeing and unlinking them one
+       * by one.
+       */
+      pvr_transfer_frag_store_entry_data_destroy_no_ralloc_free(
+         device,
+         to_pvr_entry_data(entry));
+   }
+
+   _mesa_hash_table_destroy(store->hash_table, NULL);
+}
diff --git a/src/imagination/vulkan/pvr_transfer_frag_store.h b/src/imagination/vulkan/pvr_transfer_frag_store.h
new file mode 100644
index 0000000..ba1a7ae
--- /dev/null
+++ b/src/imagination/vulkan/pvr_transfer_frag_store.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2023 Imagination Technologies Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PVR_TRANSFER_FRAG_STORE_H
+#define PVR_TRANSFER_FRAG_STORE_H
+
+#include <stdint.h>
+#include <vulkan/vulkan_core.h>
+
+#include "pvr_device_info.h"
+#include "pvr_uscgen.h"
+#include "pvr_types.h"
+#include "util/hash_table.h"
+
+struct pvr_device;
+
+struct pvr_transfer_frag_store {
+   uint32_t max_multisample;
+   /* Hash table mapping keys, produced by pvr_transfer_frag_shader_key(), to
+    * pvr_transfer_frag_store_entry_data entries.
+    */
+   struct hash_table *hash_table;
+};
+
+VkResult pvr_transfer_frag_store_init(struct pvr_device *device,
+                                      struct pvr_transfer_frag_store *store);
+void pvr_transfer_frag_store_fini(struct pvr_device *device,
+                                  struct pvr_transfer_frag_store *store);
+
+VkResult pvr_transfer_frag_store_get_shader_info(
+   struct pvr_device *device,
+   struct pvr_transfer_frag_store *store,
+   const struct pvr_tq_shader_properties *shader_props,
+   pvr_dev_addr_t *const pds_dev_addr_out,
+   const struct pvr_tq_frag_sh_reg_layout **const reg_layout_out);
+
+#endif /* PVR_TRANSFER_FRAG_STORE_H */
diff --git a/src/imagination/vulkan/vk_format.h b/src/imagination/vulkan/vk_format.h
index 2153460..4b846ce 100644
--- a/src/imagination/vulkan/vk_format.h
+++ b/src/imagination/vulkan/vk_format.h
@@ -88,4 +88,45 @@ static inline bool vk_format_is_normalized(VkFormat vk_format)
    return true;
 }
 
+static inline uint32_t
+vk_format_get_common_color_channel_count(VkFormat src_format,
+                                         VkFormat dst_format)
+{
+   const struct util_format_description *dst_desc =
+      vk_format_description(dst_format);
+   const struct util_format_description *src_desc =
+      vk_format_description(src_format);
+   uint32_t count = 0;
+
+   /* If the destination format is alpha-only, the only channel it can have
+    * in common with the source is alpha.
+    */
+   if (util_format_is_alpha(vk_format_to_pipe_format(dst_format))) {
+      count = 1;
+   } else if (dst_desc->nr_channels <= src_desc->nr_channels) {
+      for (uint32_t i = 0; i < dst_desc->nr_channels; i++) {
+         enum pipe_swizzle swizzle = dst_desc->swizzle[i];
+
+         if (swizzle > PIPE_SWIZZLE_W)
+            continue;
+
+         for (uint32_t j = 0; j < src_desc->nr_channels; j++) {
+            if (src_desc->swizzle[j] == swizzle) {
+               count++;
+               break;
+            }
+         }
+      }
+   } else {
+      count = dst_desc->nr_channels;
+   }
+
+   return count;
+}
+
+static inline bool vk_format_is_alpha(VkFormat format)
+{
+   return util_format_is_alpha(vk_format_to_pipe_format(format));
+}
+
 #endif /* VK_FORMAT_H */
diff --git a/src/imagination/vulkan/winsys/pvr_winsys.h b/src/imagination/vulkan/winsys/pvr_winsys.h
index 1fa8650..08d4989 100644
--- a/src/imagination/vulkan/winsys/pvr_winsys.h
+++ b/src/imagination/vulkan/winsys/pvr_winsys.h
@@ -258,6 +258,7 @@ struct pvr_winsys_transfer_regs {
    uint32_t event_pixel_pds_code;
    uint32_t event_pixel_pds_data;
    uint32_t event_pixel_pds_info;
+   uint32_t frag_screen;
    uint32_t isp_aa;
    uint32_t isp_bgobjvals;
    uint32_t isp_ctl;
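
For reference, the lifetime of the new store introduced by this patch is: pvr_transfer_frag_store_init() once, any number of pvr_transfer_frag_store_get_shader_info() lookups (each lookup compiles, uploads and caches the shader on first use), then pvr_transfer_frag_store_fini(). The following minimal sketch is not part of the commit; the function name demo_transfer_frag_lookup and the variables `device` and `shader_props` are illustrative, and a real caller (the transfer job submission path) would keep a single store alive for the device's lifetime rather than creating and destroying one per lookup.

/* Hypothetical caller sketch for the transfer fragment shader store.
 * Assumes a valid pvr_device and an already-populated
 * pvr_tq_shader_properties; error handling follows the driver's
 * VkResult conventions.
 */
static VkResult
demo_transfer_frag_lookup(struct pvr_device *device,
                          const struct pvr_tq_shader_properties *shader_props)
{
   struct pvr_transfer_frag_store store;
   const struct pvr_tq_frag_sh_reg_layout *reg_layout;
   pvr_dev_addr_t pds_dev_addr;
   VkResult result;

   result = pvr_transfer_frag_store_init(device, &store);
   if (result != VK_SUCCESS)
      return result;

   /* The first lookup for a given shader key compiles and uploads the USC
    * and PDS programs; later lookups with the same key hit the store's
    * hash table instead.
    */
   result = pvr_transfer_frag_store_get_shader_info(device,
                                                    &store,
                                                    shader_props,
                                                    &pds_dev_addr,
                                                    &reg_layout);
   if (result == VK_SUCCESS) {
      /* pds_dev_addr now holds the heap-relative offset of the PDS kick
       * program, and reg_layout describes where the combined image/sampler
       * descriptors and dynamic constants live in shared registers; both
       * feed into transfer job setup.
       */
   }

   pvr_transfer_frag_store_fini(device, &store);

   return result;
}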