panvk: Implement vkCmdCopyImageToBuffer()
author Boris Brezillon <boris.brezillon@collabora.com>
Wed, 8 Sep 2021 08:14:21 +0000 (10:14 +0200)
committer Boris Brezillon <boris.brezillon@collabora.com>
Tue, 21 Sep 2021 13:01:23 +0000 (15:01 +0200)
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12095>

src/panfrost/vulkan/panvk_private.h
src/panfrost/vulkan/panvk_vX_meta_copy.c

index 7922828..65ace3e 100644 (file)
@@ -130,6 +130,7 @@ panvk_logi_v(const char *format, va_list va);
 #define panvk_stub() assert(!"stub")
 
 #define PANVK_META_COPY_BUF2IMG_NUM_FORMATS 12
+#define PANVK_META_COPY_IMG2BUF_NUM_FORMATS 12 /* must equal ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) */
 #define PANVK_META_COPY_IMG2IMG_NUM_FORMATS 14
 #define PANVK_META_COPY_NUM_TEX_TYPES 5
 
@@ -166,6 +167,10 @@ struct panvk_meta {
       } buf2img[PANVK_META_COPY_BUF2IMG_NUM_FORMATS];
       struct {
          mali_ptr rsd;
+         struct panfrost_ubo_push pushmap;
+      } img2buf[PANVK_META_COPY_NUM_TEX_TYPES][PANVK_META_COPY_IMG2BUF_NUM_FORMATS];
+      struct {
+         mali_ptr rsd;
       } img2img[PANVK_META_COPY_NUM_TEX_TYPES][PANVK_META_COPY_IMG2IMG_NUM_FORMATS];
    } copy;
 };
index c88ccb7..c255f40 100644 (file)
@@ -191,6 +191,41 @@ panvk_meta_copy_emit_tiler_job(struct pan_pool *desc_pool,
    return job;
 }
 
+/**
+ * Allocate a COMPUTE_JOB descriptor from @desc_pool, fill its sections and
+ * add it to @scoreboard.
+ *
+ * @num_wg and @wg_sz provide the x/y/z values packed in the INVOCATION
+ * section. The texture, sampler, UBO, push-constant, @rsd and @tsd
+ * addresses are forwarded to panvk_meta_copy_emit_dcd(), which fills the
+ * DRAW section.
+ *
+ * Returns the CPU/GPU pointers of the job descriptor.
+ */
+static struct panfrost_ptr
+panvk_meta_copy_emit_compute_job(struct pan_pool *desc_pool,
+                                 struct pan_scoreboard *scoreboard,
+                                 const struct pan_compute_dim *num_wg,
+                                 const struct pan_compute_dim *wg_sz,
+                                 mali_ptr texture, mali_ptr sampler,
+                                 mali_ptr ubo, mali_ptr push_constants,
+                                 mali_ptr rsd, mali_ptr tsd)
+{
+   struct panfrost_ptr job = pan_pool_alloc_desc(desc_pool, COMPUTE_JOB);
+   void *invocation = pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
+
+   /* INVOCATION section: workgroup count followed by workgroup size. */
+   panfrost_pack_work_groups_compute(invocation,
+                                     num_wg->x, num_wg->y, num_wg->z,
+                                     wg_sz->x, wg_sz->y, wg_sz->z,
+                                     false, false);
+
+   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, params) {
+      params.job_task_split = 8;
+   }
+
+   /* DRAW section: resource tables, thread storage and renderer state. */
+   panvk_meta_copy_emit_dcd(desc_pool, 0, 0, texture, sampler,
+                            0, tsd, rsd, ubo, push_constants,
+                            pan_section_ptr(job.cpu, COMPUTE_JOB, DRAW));
+
+   /* Nothing to set in the padding section, pack it with its defaults. */
+   pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, padding);
+
+   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
+                    false, false, 0, 0, &job, false);
+
+   return job;
+}
+
+
 #if PAN_ARCH >= 6
 static uint32_t
 panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)
@@ -340,6 +375,28 @@ panvk_meta_copy_emit_push_constants(struct panfrost_device *pdev,
 }
 
+/**
+ * Emit the RENDERER_STATE descriptor of a copy-to-buffer shader.
+ *
+ * The descriptor is allocated from @desc_pool and prepared from
+ * @shader_info and @shader by pan_shader_prepare_rsd(). When @from_img is
+ * true, the texture and sampler counts are then set to one.
+ * @pdev is not used by this function.
+ *
+ * Returns the GPU address of the descriptor.
+ */
 static mali_ptr
+panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device *pdev,
+                                struct pan_pool *desc_pool,
+                                mali_ptr shader,
+                                const struct pan_shader_info *shader_info,
+                                bool from_img)
+{
+   struct panfrost_ptr rsd =
+      pan_pool_alloc_desc_aggregate(desc_pool, PAN_DESC(RENDERER_STATE));
+
+   pan_pack(rsd.cpu, RENDERER_STATE, state) {
+      pan_shader_prepare_rsd(shader_info, shader, &state);
+
+      /* Image sources are read through one texture/sampler pair. */
+      if (from_img) {
+         state.shader.sampler_count = 1;
+         state.shader.texture_count = 1;
+      }
+   }
+
+   return rsd.gpu;
+}
+
+static mali_ptr
 panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
                                struct pan_pool *bin_pool,
                                enum pipe_format srcfmt,
@@ -1235,6 +1292,448 @@ panvk_per_arch(CmdCopyBufferToImage)(VkCommandBuffer commandBuffer,
    }
 }
 
+/*
+ * (format, component mask) pairs an image-to-buffer copy shader is built
+ * for. The position of an entry in this table is the format index used in
+ * the meta.copy.img2buf[][] arrays. Bit i of the mask selects component i
+ * of the texel fetched from the image.
+ */
+static const struct panvk_meta_copy_format_info panvk_meta_copy_img2buf_fmts[] = {
+   { .imgfmt = PIPE_FORMAT_R8_UINT, .mask = 0x1 },
+   { .imgfmt = PIPE_FORMAT_R8G8_UINT, .mask = 0x3 },
+   { .imgfmt = PIPE_FORMAT_R5G6B5_UNORM, .mask = 0x7 },
+   { .imgfmt = PIPE_FORMAT_R8G8B8A8_UINT, .mask = 0xf },
+   { .imgfmt = PIPE_FORMAT_R16G16B16_UINT, .mask = 0x7 },
+   { .imgfmt = PIPE_FORMAT_R32G32_UINT, .mask = 0x3 },
+   { .imgfmt = PIPE_FORMAT_R32G32B32_UINT, .mask = 0x7 },
+   { .imgfmt = PIPE_FORMAT_R32G32B32A32_UINT, .mask = 0xf },
+   /* S8 -> Z24S8 */
+   { .imgfmt = PIPE_FORMAT_R8G8B8A8_UINT, .mask = 0x8 },
+   /* S8 -> Z32_S8X24 */
+   { .imgfmt = PIPE_FORMAT_R32G32_UINT, .mask = 0x2 },
+   /* Z24X8 -> Z24S8 */
+   { .imgfmt = PIPE_FORMAT_R8G8B8A8_UINT, .mask = 0x7 },
+   /* Z32 -> Z32_S8X24 */
+   { .imgfmt = PIPE_FORMAT_R32G32_UINT, .mask = 0x1 },
+};
+
+/**
+ * Pick the format used to read an image in the image-to-buffer copy path.
+ *
+ * The choice only depends on the block size (in bytes) of @imgfmt: every
+ * supported size maps to a UINT format of the same size, except the 2-byte
+ * RGB565/BGR565 formats which map to R5G6B5_UNORM.
+ */
+static enum pipe_format
+panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)
+{
+   const unsigned blocksz = util_format_get_blocksize(imgfmt);
+
+   switch (blocksz) {
+   case 1:
+      return PIPE_FORMAT_R8_UINT;
+   case 2:
+      /* AFBC stores things differently for RGB565,
+       * we can't simply map to R8G8 in that case */
+      if (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
+          imgfmt == PIPE_FORMAT_B5G6R5_UNORM)
+         return PIPE_FORMAT_R5G6B5_UNORM;
+
+      return PIPE_FORMAT_R8G8_UINT;
+   case 4:
+      return PIPE_FORMAT_R8G8B8A8_UINT;
+   case 6:
+      return PIPE_FORMAT_R16G16B16_UINT;
+   case 8:
+      return PIPE_FORMAT_R32G32_UINT;
+   case 12:
+      return PIPE_FORMAT_R32G32B32_UINT;
+   case 16:
+      return PIPE_FORMAT_R32G32B32A32_UINT;
+   default:
+      unreachable("Invalid format\n");
+   }
+}
+
+/*
+ * Parameters of one image-to-buffer copy. The structure is uploaded as UBO
+ * and push constants, and the copy shader reads its fields by offsetof(),
+ * so the field order is part of the contract with the shader.
+ */
+struct panvk_meta_copy_img2buf_info {
+   struct {
+      /* GPU address of the first byte written in the buffer */
+      mali_ptr ptr;
+      struct {
+         /* Bytes between two rows */
+         unsigned line;
+         /* Bytes between two slices/layers */
+         unsigned surf;
+      } stride;
+   } buf;
+   struct {
+      /* Value added to the invocation ID to get image coordinates;
+       * x and y are rounded down to a multiple of 16 by the caller.
+       */
+      struct {
+         unsigned x;
+         unsigned y;
+         unsigned z;
+      } offset;
+      /* Inclusive bounds of the copied region, in image coordinates */
+      struct {
+         unsigned minx;
+         unsigned miny;
+         unsigned maxx;
+         unsigned maxy;
+      } extent;
+   } img;
+};
+
+/*
+ * Emit a load of one scalar field of struct panvk_meta_copy_img2buf_info
+ * from UBO 0. The load bit size is derived from the field size and the
+ * offset from offsetof(), so @field must name a scalar member
+ * (e.g. buf.stride.line). @b is the nir_builder pointer.
+ */
+#define panvk_meta_copy_img2buf_get_info_field(b, field) \
+        nir_load_ubo((b), 1, \
+                     sizeof(((struct panvk_meta_copy_img2buf_info *)0)->field) * 8, \
+                     nir_imm_int((b), 0), \
+                     nir_imm_int((b), offsetof(struct panvk_meta_copy_img2buf_info, field)), \
+                     .align_mul = 4, \
+                     .align_offset = 0, \
+                     .range_base = 0, \
+                     .range = ~0)
+
+/**
+ * Build, compile and upload the compute shader copying texels from an image
+ * to a buffer for one (format, component mask, texture type) combination.
+ *
+ * Each invocation handles one texel: image coordinates are the global
+ * invocation ID plus the offsets found in struct panvk_meta_copy_img2buf_info
+ * (UBO 0). If the x/y coordinates fall inside the [min, max] extent, the
+ * texel is fetched with txf, the components selected by key.mask are kept
+ * and packed, and the result is stored in the buffer.
+ *
+ * @pdev: device the shader is compiled for (gpu_id is read)
+ * @bin_pool: pool the shader binary is uploaded to
+ * @key: image format and component mask
+ * @texdim: texture dimension (1, 2 or 3)
+ * @texisarray: non-zero for array textures
+ * @shader_info: filled by GENX(pan_shader_compile)()
+ *
+ * Returns the GPU address of the shader binary.
+ */
+static mali_ptr
+panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
+                               struct pan_pool *bin_pool,
+                               struct panvk_meta_copy_format_info key,
+                               unsigned texdim, unsigned texisarray,
+                               struct pan_shader_info *shader_info)
+{
+   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
+   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
+
+   /* FIXME: Won't work on compute queues, but we can't do that with
+    * a compute shader if the destination is an AFBC surface.
+    */
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
+                                     GENX(pan_shader_get_compiler_options)(),
+                                     "panvk_meta_copy_img2buf(dim=%dD%s,imgfmt=%s,mask=%x)",
+                                     texdim, texisarray ? "[]" : "",
+                                     util_format_name(key.imgfmt),
+                                     key.mask);
+
+   b.shader->info.internal = true;
+   b.shader->info.num_ubos = 1;
+
+   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
+   nir_ssa_def *bufptr =
+      panvk_meta_copy_img2buf_get_info_field(&b, buf.ptr);
+   nir_ssa_def *buflinestride =
+      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.line);
+   nir_ssa_def *bufsurfstride =
+      panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.surf);
+
+   nir_ssa_def *imgminx =
+      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.minx);
+   nir_ssa_def *imgminy =
+      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.miny);
+   nir_ssa_def *imgmaxx =
+      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxx);
+   nir_ssa_def *imgmaxy =
+      panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxy);
+
+   nir_ssa_def *imgcoords, *inbounds;
+
+   /* The number of coordinate components is the texture dimension, plus one
+    * for the layer index of array textures.
+    */
+   switch (texdim + texisarray) {
+   case 1:
+      imgcoords =
+         nir_iadd(&b,
+                  nir_channel(&b, coord, 0),
+                  panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x));
+      inbounds =
+         nir_iand(&b,
+                  nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
+                  nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx));
+      break;
+   case 2:
+      imgcoords =
+         nir_vec2(&b,
+                  nir_iadd(&b,
+                           nir_channel(&b, coord, 0),
+                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
+                  nir_iadd(&b,
+                           nir_channel(&b, coord, 1),
+                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
+      inbounds =
+         nir_iand(&b,
+                  nir_iand(&b,
+                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
+                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
+                  nir_iand(&b,
+                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
+                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
+      break;
+   case 3:
+      /* The third coordinate must be shifted by the z offset, not by the
+       * y one.
+       */
+      imgcoords =
+         nir_vec3(&b,
+                  nir_iadd(&b,
+                           nir_channel(&b, coord, 0),
+                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
+                  nir_iadd(&b,
+                           nir_channel(&b, coord, 1),
+                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)),
+                  nir_iadd(&b,
+                           nir_channel(&b, coord, 2),
+                           panvk_meta_copy_img2buf_get_info_field(&b, img.offset.z)));
+      /* Only x and y are bounds-checked, there is no z extent in the info
+       * structure.
+       */
+      inbounds =
+         nir_iand(&b,
+                  nir_iand(&b,
+                           nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
+                           nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
+                  nir_iand(&b,
+                           nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
+                           nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
+      break;
+   default:
+      unreachable("Invalid texture dimension\n");
+   }
+
+   nir_push_if(&b, inbounds);
+
+   /* FIXME: doesn't work for tiled+compressed formats since blocks are 4x4
+    * blocks instead of 16x16 texels in that case, and there's nothing we can
+    * do to force the tile size to 4x4 in the render path.
+    * This being said, compressed textures are not compatible with AFBC, so we
+    * could use a compute shader arranging the blocks properly.
+    */
+   /* NOTE(review): the buffer offset is derived from the raw invocation ID,
+    * not from the distance to img.extent.min{x,y}. This looks off when the
+    * image offset is not a multiple of 16 -- TODO confirm.
+    */
+   nir_ssa_def *offset =
+      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
+   offset = nir_iadd(&b, offset,
+                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
+   offset = nir_iadd(&b, offset,
+                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
+   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));
+
+   /* Texels of 4 bytes or less are handled as byte components, bigger ones
+    * as components of up to 4 bytes.
+    */
+   unsigned imgcompsz = imgtexelsz <= 4 ?
+                        1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);
+   unsigned nimgcomps = imgtexelsz / imgcompsz;
+   assert(nimgcomps <= 4);
+
+   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
+   tex->op = nir_texop_txf;
+   tex->texture_index = 0;
+   tex->is_array = texisarray;
+   tex->dest_type = util_format_is_unorm(key.imgfmt) ?
+                    nir_type_float32 : nir_type_uint32;
+
+   switch (texdim) {
+   case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
+   case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
+   case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
+   default: unreachable("Invalid texture dimension");
+   }
+
+   tex->src[0].src_type = nir_tex_src_coord;
+   tex->src[0].src = nir_src_for_ssa(imgcoords);
+   tex->coord_components = texdim + texisarray;
+   nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
+                     nir_alu_type_get_type_size(tex->dest_type), NULL);
+   nir_builder_instr_insert(&b, &tex->instr);
+
+   nir_ssa_def *texel = &tex->dest.ssa;
+
+   /* Drop the components that are not part of the copy. */
+   unsigned fullmask = (1 << util_format_get_nr_components(key.imgfmt)) - 1;
+   unsigned nbufcomps = util_bitcount(fullmask);
+   if (key.mask != fullmask) {
+      nir_ssa_def *bufcomps[4];
+      nbufcomps = 0;
+      for (unsigned i = 0; i < nimgcomps; i++) {
+         if (key.mask & BITFIELD_BIT(i))
+            bufcomps[nbufcomps++] = nir_channel(&b, texel, i);
+      }
+
+      texel = nir_vec(&b, bufcomps, nbufcomps);
+   }
+
+   unsigned bufcompsz = buftexelsz / nbufcomps;
+
+   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
+      /* Scale the normalized channels back to 5/6/5-bit integers and pack
+       * them in a single 16-bit value.
+       */
+      texel = nir_fmul(&b, texel,
+                       nir_vec3(&b,
+                                nir_imm_float(&b, 31),
+                                nir_imm_float(&b, 63),
+                                nir_imm_float(&b, 31)));
+      texel = nir_f2u16(&b, texel);
+      texel = nir_ior(&b, nir_channel(&b, texel, 0),
+                      nir_ior(&b,
+                              nir_ishl(&b, nir_channel(&b, texel, 1), nir_imm_int(&b, 5)),
+                              nir_ishl(&b, nir_channel(&b, texel, 2), nir_imm_int(&b, 11))));
+      imgcompsz = 2;
+      bufcompsz = 2;
+      nbufcomps = 1;
+      nimgcomps = 1;
+   } else if (imgcompsz == 1) {
+      /* Byte components are packed in one scalar, component i landing in
+       * bits [8 * i, 8 * i + 7].
+       */
+      nir_ssa_def *packed = nir_channel(&b, texel, 0);
+      for (unsigned i = 1; i < nbufcomps; i++) {
+         packed = nir_ior(&b, packed,
+                          nir_ishl(&b, nir_iand_imm(&b, nir_channel(&b, texel, i), 0xff),
+                                   nir_imm_int(&b, i * 8)));
+      }
+      texel = packed;
+
+      bufcompsz = nbufcomps == 3 ? 4 : nbufcomps;
+      nbufcomps = 1;
+   }
+
+   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
+   assert(nbufcomps <= 4 && nimgcomps <= 4);
+   texel = nir_u2uN(&b, texel, bufcompsz * 8);
+
+   nir_store_global(&b, bufptr, bufcompsz, texel, (1 << nbufcomps) - 1);
+   nir_pop_if(&b, NULL);
+
+   struct panfrost_compile_inputs inputs = {
+      .gpu_id = pdev->gpu_id,
+      .is_blit = true,
+   };
+
+   struct util_dynarray binary;
+
+   util_dynarray_init(&binary, NULL);
+   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
+
+   /* Make sure UBO words have been upgraded to push constants and everything
+    * is at the right place.
+    */
+   assert(shader_info->ubo_count == 1);
+   assert(shader_info->push.count <= (sizeof(struct panvk_meta_copy_img2buf_info) / 4));
+
+   mali_ptr shader =
+      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
+                              PAN_ARCH >= 6 ? 128 : 64);
+
+   util_dynarray_fini(&binary);
+   ralloc_free(b.shader);
+
+   return shader;
+}
+
+/**
+ * Return the position of @key in panvk_meta_copy_img2buf_fmts[].
+ *
+ * Entries are compared bytewise against @key. A key that is not in the
+ * table is a driver bug and hits unreachable().
+ */
+static unsigned
+panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)
+{
+   const unsigned nfmts = ARRAY_SIZE(panvk_meta_copy_img2buf_fmts);
+   unsigned idx = 0;
+
+   while (idx < nfmts) {
+      if (memcmp(&key, &panvk_meta_copy_img2buf_fmts[idx], sizeof(key)) == 0)
+         return idx;
+
+      idx++;
+   }
+
+   unreachable("Invalid texel size\n");
+}
+
+/**
+ * Record the copy of one @region of @img to @buf.
+ *
+ * The copy is done by a compute job placed in a batch of its own: the batch
+ * currently open on @cmdbuf (if any) is closed first, and the batch holding
+ * the copy job is closed before returning.
+ */
+static void
+panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
+                        const struct panvk_buffer *buf,
+                        const struct panvk_image *img,
+                        const VkBufferImageCopy *region)
+{
+   struct panvk_physical_device *phys_dev = cmdbuf->device->physical_device;
+   struct panfrost_device *pdev = &phys_dev->pdev;
+   struct pan_pool *desc_pool = &cmdbuf->desc_pool.base;
+   bool is_1d = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D;
+
+   /* Select the pre-built shader matching the format and texture type. */
+   struct panvk_meta_copy_format_info key = {
+      .imgfmt = panvk_meta_copy_img2buf_format(img->pimage.layout.format),
+      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
+                                       region->imageSubresource.aspectMask),
+   };
+   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
+   unsigned texdimidx =
+      panvk_meta_copy_tex_type(img->pimage.layout.dim,
+                               img->pimage.layout.array_size > 1);
+   unsigned fmtidx = panvk_meta_copy_img2buf_format_idx(key);
+
+   mali_ptr rsd = phys_dev->meta.copy.img2buf[texdimidx][fmtidx].rsd;
+   const struct panfrost_ubo_push *pushmap =
+      &phys_dev->meta.copy.img2buf[texdimidx][fmtidx].pushmap;
+
+   /* A zero row length/image height means "tightly packed". */
+   unsigned linestride =
+      (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz;
+   unsigned surfstride =
+      (region->bufferImageHeight ? : region->imageExtent.height) * linestride;
+
+   struct panvk_meta_copy_img2buf_info info = {
+      .buf = {
+         .ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
+         .stride = {
+            .line = linestride,
+            .surf = surfstride,
+         },
+      },
+      .img = {
+         /* x/y start on a multiple of 16, the shader discards invocations
+          * falling outside the extent.
+          */
+         .offset = {
+            .x = MAX2(region->imageOffset.x & ~15, 0),
+            .y = MAX2(region->imageOffset.y & ~15, 0),
+            .z = MAX2(region->imageOffset.z, 0),
+         },
+         .extent = {
+            .minx = MAX2(region->imageOffset.x, 0),
+            .miny = MAX2(region->imageOffset.y, 0),
+            .maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0),
+            .maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0),
+         },
+      },
+   };
+
+   mali_ptr pushconsts =
+      panvk_meta_copy_emit_push_constants(pdev, pushmap, desc_pool,
+                                          &info, sizeof(info));
+   mali_ptr ubo =
+      panvk_meta_copy_emit_ubo(pdev, desc_pool, &info, sizeof(info));
+
+   /* Source view: single mip level, cube images are viewed as 2D. */
+   struct pan_image_view view = {
+      .format = key.imgfmt,
+      .dim = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
+             MALI_TEXTURE_DIMENSION_2D : img->pimage.layout.dim,
+      .image = &img->pimage,
+      .nr_samples = img->pimage.layout.nr_samples,
+      .first_level = region->imageSubresource.mipLevel,
+      .last_level = region->imageSubresource.mipLevel,
+      .first_layer = region->imageSubresource.baseArrayLayer,
+      .last_layer = region->imageSubresource.baseArrayLayer + region->imageSubresource.layerCount - 1,
+      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
+   };
+
+   mali_ptr texture = panvk_meta_copy_img_emit_texture(pdev, desc_pool, &view);
+   mali_ptr sampler = panvk_meta_copy_img_emit_sampler(pdev, desc_pool);
+
+   /* The copy job gets a batch of its own. */
+   if (cmdbuf->state.batch)
+      panvk_per_arch(cmd_close_batch)(cmdbuf);
+
+   panvk_cmd_open_batch(cmdbuf);
+
+   struct panvk_batch *batch = cmdbuf->state.batch;
+   struct pan_tls_info tlsinfo = { 0 };
+
+   batch->blit.src = img->pimage.data.bo;
+   batch->blit.dst = buf->bo;
+   batch->tls = pan_pool_alloc_desc(desc_pool, LOCAL_STORAGE);
+   GENX(pan_emit_tls)(&tlsinfo, batch->tls.cpu);
+
+   mali_ptr tsd = batch->tls.gpu;
+
+   /* 16x16 workgroups (16x1 for 1D images), one workgroup layer per image
+    * layer or depth slice.
+    */
+   struct pan_compute_dim wg_sz = {
+      16,
+      is_1d ? 1 : 16,
+      1,
+   };
+
+   struct pan_compute_dim num_wg = {
+     (ALIGN_POT(info.img.extent.maxx + 1, 16) - info.img.offset.x) / 16,
+     is_1d ?
+        1 : (ALIGN_POT(info.img.extent.maxy + 1, 16) - info.img.offset.y) / 16,
+     MAX2(region->imageSubresource.layerCount, region->imageExtent.depth),
+   };
+
+   struct panfrost_ptr job =
+      panvk_meta_copy_emit_compute_job(desc_pool, &batch->scoreboard,
+                                       &num_wg, &wg_sz,
+                                       texture, sampler,
+                                       ubo, pushconsts,
+                                       rsd, tsd);
+
+   util_dynarray_append(&batch->jobs, void *, job.cpu);
+
+   if (cmdbuf->state.batch)
+      panvk_per_arch(cmd_close_batch)(cmdbuf);
+}
+
+/**
+ * Build the img2buf shader and renderer state for one entry of
+ * dev->meta.copy.img2buf[][], selected by the format index @fmtidx and the
+ * texture type derived from @texdim and @texisarray.
+ */
+static void
+panvk_meta_copy_img2buf_init_variant(struct panvk_physical_device *dev,
+                                     unsigned fmtidx, unsigned texdim,
+                                     bool texisarray)
+{
+   unsigned texdimidx = panvk_meta_copy_tex_type(texdim, texisarray);
+   assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
+
+   /* Always hand a zeroed info structure to the compiler, so no stale or
+    * uninitialized data can end up in the saved push map.
+    */
+   struct pan_shader_info shader_info;
+   memset(&shader_info, 0, sizeof(shader_info));
+
+   mali_ptr shader =
+      panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
+                                     panvk_meta_copy_img2buf_fmts[fmtidx],
+                                     texdim, texisarray, &shader_info);
+
+   dev->meta.copy.img2buf[texdimidx][fmtidx].pushmap = shader_info.push;
+   dev->meta.copy.img2buf[texdimidx][fmtidx].rsd =
+      panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
+                                      &dev->meta.desc_pool.base,
+                                      shader, &shader_info, true);
+}
+
+/**
+ * Pre-build all image-to-buffer copy shaders: one per entry of
+ * panvk_meta_copy_img2buf_fmts[] and per texture type (1D, 2D, 3D, plus the
+ * array variants of 1D and 2D).
+ */
+static void
+panvk_meta_copy_img2buf_init(struct panvk_physical_device *dev)
+{
+   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) == PANVK_META_COPY_IMG2BUF_NUM_FORMATS);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
+      for (unsigned texdim = 1; texdim <= 3; texdim++) {
+         panvk_meta_copy_img2buf_init_variant(dev, i, texdim, false);
+
+         /* No array variant is built for 3D textures. */
+         if (texdim == 3)
+            continue;
+
+         panvk_meta_copy_img2buf_init_variant(dev, i, texdim, true);
+      }
+   }
+}
+
 void
 panvk_per_arch(CmdCopyImageToBuffer)(VkCommandBuffer commandBuffer,
                                      VkImage srcImage,
@@ -1243,7 +1742,13 @@ panvk_per_arch(CmdCopyImageToBuffer)(VkCommandBuffer commandBuffer,
                                      uint32_t regionCount,
                                      const VkBufferImageCopy *pRegions)
 {
-   panvk_stub();
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   VK_FROM_HANDLE(panvk_buffer, buf, destBuffer);
+   VK_FROM_HANDLE(panvk_image, img, srcImage);
+
+   for (unsigned i = 0; i < regionCount; i++) {
+      panvk_meta_copy_img2buf(cmdbuf, buf, img, &pRegions[i]);
+   }
 }
 
 void
@@ -1281,4 +1786,5 @@ panvk_per_arch(meta_copy_init)(struct panvk_physical_device *dev)
 {
    panvk_meta_copy_img2img_init(dev);
    panvk_meta_copy_buf2img_init(dev);
+   panvk_meta_copy_img2buf_init(dev);
 }