From: Marek Olšák <marek.olsak@amd.com>
Date: Tue, 19 Jul 2022 09:08:23 +0000 (-0400)
Subject: radeonsi: implement a non-scaled compute blit+resolve and use it on gfx11
X-Git-Tag: upstream/22.3.5~5253
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=49237c0eb3eaae799d9ee7703788d2573df21e13;p=platform%2Fupstream%2Fmesa.git

radeonsi: implement a non-scaled compute blit+resolve and use it on gfx11

This was written from scratch. Only a few pieces were taken from Indrajit's
code. This is also much simpler, and hopefully easier to review. For example,
out-of-bounds coordinates are handled trivially.

The new blit test proves that this is identical to u_blitter except for
a few precision differences (NaNs, sRGB) where the compute blit should be
more precise.

This is only enabled on gfx11 because it's slower than the gfx blit on gfx10.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17782>
---

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 1187377..8f461d4 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1219,6 +1219,9 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
       return;
    }
 
+   if (si_compute_blit(sctx, info))
+      return;
+
    si_gfx_blit(ctx, info);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index f31cf3a..e20d47f 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -35,7 +35,17 @@ static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format fo
    if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
       return false;
 
-   if (num_samples > 1)
+   /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
+    * expand pass doesn't work, but let's use the gfx blit, which should be faster because
+    * it doesn't require expanding the FMASK.
+    *
+    * TODO: Broken MSAA stores can cause app issues, though this issue might only affect
+    *       internal blits, not sure.
+    *
+    * EQAA image stores are also unimplemented, which should be rejected here after MSAA
+    * image stores are fixed.
+    */
+   if (num_samples > 1 && is_store)
       return false;
 
    if (util_format_is_depth_or_stencil(format))
@@ -1014,3 +1024,103 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
 
    ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb);
 }
+
+bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info)
+{
+   /* Compute blits require D16 right now (see the ISA).
+    *
+    * Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
+    * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
+    * for that because it's mostly just blits.
+    *
+    * TODO: benchmark the performance on gfx11
+    */
+   if (sctx->gfx_level < GFX11)
+      return false;
+
+   if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true,
+                                vi_dcc_enabled((struct si_texture*)info->dst.resource,
+                                               info->dst.level)) ||
+       !si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false,
+                                vi_dcc_enabled((struct si_texture*)info->src.resource,
+                                               info->src.level)))
+      return false;
+
+   if (info->alpha_blend ||
+       info->num_window_rectangles ||
+       info->scissor_enable ||
+       /* No scaling. */
+       info->dst.box.width != abs(info->src.box.width) ||
+       info->dst.box.height != abs(info->src.box.height) ||
+       info->dst.box.depth != abs(info->src.box.depth))
+      return false;
+
+   assert(info->src.box.depth >= 0);
+
+   /* Shader images. */
+   struct pipe_image_view image[2];
+   image[0].resource = info->src.resource;
+   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+   image[0].format = info->src.format;
+   image[0].u.tex.level = info->src.level;
+   image[0].u.tex.first_layer = 0;
+   image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level);
+
+   image[1].resource = info->dst.resource;
+   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+   image[1].format = info->dst.format;
+   image[1].u.tex.level = info->dst.level;
+   image[1].u.tex.first_layer = 0;
+   image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
+
+   /* Get the shader key. */
+   const struct util_format_description *dst_desc = util_format_description(info->dst.format);
+   unsigned i = util_format_get_first_non_void_channel(info->dst.format);
+   union si_compute_blit_shader_key options;
+   options.key = 0;
+
+   options.always_true = true;
+   options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
+                       info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
+   options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
+                       info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
+   options.src_is_msaa = info->src.resource->nr_samples > 1;
+   options.dst_is_msaa = info->dst.resource->nr_samples > 1;
+   /* Resolving integer formats only copies sample 0. log2_samples is then unused. */
+   options.sample0_only = options.src_is_msaa && !options.dst_is_msaa &&
+                          util_format_is_pure_integer(info->src.format);
+   unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples);
+   options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples);
+   options.flip_x = info->src.box.width < 0;
+   options.flip_y = info->src.box.height < 0;
+   options.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
+                          util_format_is_pure_uint(info->dst.format);
+   options.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
+                          util_format_is_pure_sint(info->dst.format);
+   options.dst_is_srgb = util_format_is_srgb(info->dst.format);
+   options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) &&
+                      (dst_desc->channel[i].size <= 10 ||
+                       (dst_desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT &&
+                        dst_desc->channel[i].size <= 16));
+
+   struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
+                                                      (void*)(uintptr_t)options.key);
+   void *shader = entry ? entry->data : NULL;
+   if (!shader) {
+      shader = si_create_blit_cs(sctx, &options);
+      _mesa_hash_table_insert(sctx->cs_blit_shaders,
+                              (void*)(uintptr_t)options.key, shader);
+   }
+
+   sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
+   sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
+   sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
+
+   struct pipe_grid_info grid = {0};
+   set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height, info->dst.box.depth);
+
+   si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
+                                  SI_OP_SYNC_BEFORE_AFTER |
+                                  (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+   return true;
+}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 39f6803..9d82907 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -356,6 +356,13 @@ static void si_destroy_context(struct pipe_context *context)
    if (!(sctx->context_flags & SI_CONTEXT_FLAG_AUX))
       p_atomic_dec(&context->screen->num_contexts);
 
+   if (sctx->cs_blit_shaders) {
+      hash_table_foreach(sctx->cs_blit_shaders, entry) {
+         context->delete_compute_state(context, entry->data);
+      }
+      _mesa_hash_table_destroy(sctx->cs_blit_shaders, NULL);
+   }
+
    FREE(sctx);
 }
 
@@ -827,6 +834,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    }
 
    sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw;
+
+   sctx->cs_blit_shaders = _mesa_hash_table_create_u32_keys(NULL);
+   if (!sctx->cs_blit_shaders)
+      goto fail;
+
    return &sctx->b;
 fail:
    fprintf(stderr, "radeonsi: Failed to create a context.\n");
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 691e125..2e5ec4c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -984,6 +984,7 @@ struct si_context {
    void *cs_clear_12bytes_buffer;
    void *cs_dcc_retile[32];
    void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
+   struct hash_table *cs_blit_shaders;
    struct si_screen *screen;
    struct util_debug_callback debug;
    struct ac_llvm_compiler compiler; /* only non-threaded compilation */
@@ -1436,6 +1437,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
 void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
                          unsigned flags, enum si_coherency coher);
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
+bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info);
 void si_init_compute_blit_functions(struct si_context *sctx);
 
 /* si_cp_dma.c */
@@ -1547,6 +1549,31 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
 void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
 void *si_create_passthrough_tcs(struct si_context *sctx);
 
+union si_compute_blit_shader_key {
+   struct {
+      /* The key saved in _mesa_hash_table_create_u32_keys() can't be 0. */
+      bool always_true:1;
+      /* Declaration modifiers. */
+      bool src_is_1d:1;
+      bool dst_is_1d:1;
+      bool src_is_msaa:1;
+      bool dst_is_msaa:1;
+      uint8_t log2_samples:4;
+      bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
+      /* Source coordinate modifiers. */
+      bool flip_x:1;
+      bool flip_y:1;
+      /* Output modifiers. */
+      bool sint_to_uint:1;
+      bool uint_to_sint:1;
+      bool dst_is_srgb:1;
+      bool fp16_rtz:1; /* only for equality with pixel shaders, not necessary otherwise */
+   };
+   uint32_t key;
+};
+
+void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
+
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index f92218b..2c426cb 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -71,6 +71,12 @@ static void unpack_2x16(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_s
    *y = nir_ushr(b, src, nir_imm_int(b, 16));
 }
 
+static void unpack_2x16_signed(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_ssa_def **y)
+{
+   *x = nir_i2i32(b, nir_u2u16(b, src));
+   *y = nir_ishr(b, src, nir_imm_int(b, 16));
+}
+
 static nir_ssa_def *
 deref_ssa(nir_builder *b, nir_variable *var)
 {
@@ -347,3 +353,242 @@ void *si_create_passthrough_tcs(struct si_context *sctx)
 
    return create_shader_state(sctx, b.shader);
 }
+
+static nir_ssa_def *convert_linear_to_srgb(nir_builder *b, nir_ssa_def *input)
+{
+   /* There are small precision differences compared to CB, so the gfx blit will return slightly
+    * different results.
+    */
+   nir_ssa_def *cmp[3];
+   for (unsigned i = 0; i < 3; i++)
+      cmp[i] = nir_flt(b, nir_channel(b, input, i), nir_imm_float(b, 0.0031308));
+
+   nir_ssa_def *ltvals[3];
+   for (unsigned i = 0; i < 3; i++)
+      ltvals[i] = nir_fmul(b, nir_channel(b, input, i), nir_imm_float(b, 12.92));
+
+   nir_ssa_def *gtvals[3];
+
+   for (unsigned i = 0; i < 3; i++) {
+      gtvals[i] = nir_fpow(b, nir_channel(b, input, i), nir_imm_float(b, 1.0/2.4));
+      gtvals[i] = nir_fmul(b, gtvals[i], nir_imm_float(b, 1.055));
+      gtvals[i] = nir_fsub(b, gtvals[i], nir_imm_float(b, 0.055));
+   }
+
+   nir_ssa_def *comp[4];
+   for (unsigned i = 0; i < 3; i++)
+      comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
+   comp[3] = nir_channel(b, input, 3);
+
+   return nir_vec(b, comp, 4);
+}
+
+static nir_ssa_def *image_resolve_msaa(nir_builder *b, nir_variable *img, unsigned num_samples,
+                                       nir_ssa_def *coord, enum amd_gfx_level gfx_level)
+{
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_ssa_def *result = NULL;
+   nir_variable *var = NULL;
+
+   /* Gfx11 doesn't support samples_identical, so we can't use it. */
+   if (gfx_level < GFX11) {
+      /* We need a local variable to get the result out of conditional branches in SSA. */
+      var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);
+
+      /* If all samples are identical, load only sample 0. */
+      nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
+      result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
+      nir_store_var(b, var, result, 0xf);
+
+      nir_push_else(b, NULL);
+   }
+
+   /* Average all samples. (the only options on gfx11) */
+   result = NULL;
+   for (unsigned i = 0; i < num_samples; i++) {
+      nir_ssa_def *sample = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
+                                                 coord, nir_imm_int(b, i), zero);
+      result = result ? nir_fadd(b, result, sample) : sample;
+   }
+   result = nir_fmul_imm(b, result, 1.0 / num_samples); /* average the sum */
+
+   if (gfx_level < GFX11) {
+      /* Exit the conditional branch and get the result out of the branch. */
+      nir_store_var(b, var, result, 0xf);
+      nir_pop_if(b, NULL);
+      result = nir_load_var(b, var);
+   }
+
+   return result;
+}
+
+static nir_ssa_def *apply_blit_output_modifiers(nir_builder *b, nir_ssa_def *color,
+                                                const union si_compute_blit_shader_key *options)
+{
+   if (options->sint_to_uint)
+      color = nir_imax(b, color, nir_imm_int(b, 0));
+
+   if (options->uint_to_sint)
+      color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
+
+   if (options->dst_is_srgb)
+      color = convert_linear_to_srgb(b, color);
+
+   /* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify
+    * the behavior of the whole shader by comparing it to the gfx blit.
+    */
+   if (options->fp16_rtz)
+      color = nir_f2f16_rtz(b, color);
+
+   return color;
+}
+
+/* The compute blit shader.
+ *
+ * Differences compared to u_blitter (the gfx blit):
+ * - u_blitter doesn't preserve NaNs, but the compute blit does
+ * - u_blitter has lower linear->SRGB precision because the CB block doesn't
+ *   use FP32, but the compute blit does.
+ *
+ * Other than that, non-scaled blits are identical to u_blitter.
+ *
+ * Implementation details:
+ * - Out-of-bounds dst coordinates are not clamped at all. The hw drops
+ *   out-of-bounds stores for us.
+ * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
+ *   the image_size NIR intrinsic.
+ * - X/Y flipping just does this in the shader: -threadIDs - 1
+ * - MSAA copies are implemented but disabled because MSAA image stores don't
+ *   work.
+ */
+void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options)
+{
+   const nir_shader_compiler_options *nir_options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
+                                                  "blit_non_scaled_cs");
+   b.shader->info.num_images = 2;
+   if (options->src_is_msaa)
+      BITSET_SET(b.shader->info.msaa_images, 0);
+   if (options->dst_is_msaa)
+      BITSET_SET(b.shader->info.msaa_images, 1);
+   b.shader->info.workgroup_size[0] = 8;
+   b.shader->info.workgroup_size[1] = 8;
+   b.shader->info.workgroup_size[2] = 1;
+   b.shader->info.cs.user_data_components_amd = 3;
+
+   const struct glsl_type *img_type[2] = {
+      glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D :
+                      options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
+                      /*is_array*/ true, GLSL_TYPE_FLOAT),
+      glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
+                      options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
+                      /*is_array*/ true, GLSL_TYPE_FLOAT),
+   };
+
+   nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
+   img_src->data.binding = 0;
+
+   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
+   img_dst->data.binding = 1;
+
+   nir_ssa_def *zero = nir_imm_int(&b, 0);
+
+   /* Instructions. */
+   /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
+   nir_ssa_def *dst_xyz = get_global_ids(&b, 3);
+   nir_ssa_def *src_xyz = dst_xyz;
+
+   /* Flip src coordinates. */
+   for (unsigned i = 0; i < 2; i++) {
+      if (i ? options->flip_y : options->flip_x) {
+         /* x goes from 0 to (dim - 1).
+          * The flipped blit should load from -dim to -1.
+          * Therefore do: x = -x - 1;
+          */
+         nir_ssa_def *comp = nir_channel(&b, src_xyz, i);
+         comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
+         src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
+      }
+   }
+
+   /* Add box.xyz. */
+   nir_ssa_def *coord_src = NULL, *coord_dst = NULL;
+   unpack_2x16_signed(&b, nir_channels(&b, nir_load_user_data_amd(&b), 0x7),
+                      &coord_src, &coord_dst);
+   coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
+   coord_src = nir_iadd(&b, coord_src, src_xyz);
+
+   /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
+   unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3;
+   nir_ssa_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
+   dim = nir_channels(&b, dim, src_clamp_channels);
+
+   nir_ssa_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels);
+   coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0));
+   coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1));
+
+   for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++)
+      coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i);
+
+   /* Swizzle coordinates for 1D_ARRAY. */
+   static unsigned swizzle_xz[] = {0, 2, 0, 0};
+
+   if (options->src_is_1d)
+      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
+   if (options->dst_is_1d)
+      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);
+
+   /* Coordinates must have 4 channels in NIR. */
+   coord_src = nir_pad_vector(&b, coord_src, 4);
+   coord_dst = nir_pad_vector(&b, coord_dst, 4);
+
+   /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */
+
+   /* Execute the image loads and stores. */
+   unsigned num_samples = 1 << options->log2_samples;
+   nir_ssa_def *color;
+
+   if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
+      /* MSAA resolving (downsampling). */
+      assert(num_samples > 1);
+      color = image_resolve_msaa(&b, img_src, num_samples, coord_src, sctx->gfx_level);
+      color = apply_blit_output_modifiers(&b, color, options);
+      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+
+   } else if (options->src_is_msaa && options->dst_is_msaa) {
+      /* MSAA copy. */
+      nir_ssa_def *color[16];
+      assert(num_samples > 1);
+      /* Group loads together and then stores. */
+      for (unsigned i = 0; i < num_samples; i++) {
+         color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
+                                         nir_imm_int(&b, i), zero);
+      }
+      for (unsigned i = 0; i < num_samples; i++)
+         color[i] = apply_blit_output_modifiers(&b, color[i], options);
+      for (unsigned i = 0; i < num_samples; i++) {
+         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
+                               nir_imm_int(&b, i), color[i], zero);
+      }
+   } else if (!options->src_is_msaa && options->dst_is_msaa) {
+      /* MSAA upsampling. */
+      assert(num_samples > 1);
+      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+      color = apply_blit_output_modifiers(&b, color, options);
+      for (unsigned i = 0; i < num_samples; i++) {
+         nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
+                               nir_imm_int(&b, i), color, zero);
+      }
+   } else {
+      /* Non-MSAA copy or read sample 0 only. */
+      /* src2 = sample_index (zero), src3 = lod (zero) */
+      assert(num_samples == 1);
+      color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+      color = apply_blit_output_modifiers(&b, color, options);
+      nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+   }
+
+   return create_shader_state(sctx, b.shader);
+}
diff --git a/src/gallium/drivers/radeonsi/si_test_image_copy_region.c b/src/gallium/drivers/radeonsi/si_test_image_copy_region.c
index 2d9ca7f..94918f9 100644
--- a/src/gallium/drivers/radeonsi/si_test_image_copy_region.c
+++ b/src/gallium/drivers/radeonsi/si_test_image_copy_region.c
@@ -952,7 +952,7 @@ void si_test_blit(struct si_screen *sscreen, unsigned test_flags)
       if (only_cb_resolve)
          success = si_msaa_resolve_blit_via_CB(ctx, &info);
       else
-         success = false;
+         success = si_compute_blit(sctx, &info);
 
       if (success) {
          printf(" %-7s", only_cb_resolve ? "resolve" : "comp");