if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
return false;
- if (num_samples > 1)
+ /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
+ * expand pass doesn't work, but let's use the gfx blit, which should be faster because
+ * it doesn't require expanding the FMASK.
+ *
+ * TODO: Broken MSAA stores can cause app issues, though this issue might only affect
+ * internal blits, not sure.
+ *
+ * EQAA image stores are also unimplemented, which should be rejected here after MSAA
+ * image stores are fixed.
+ */
+ if (num_samples > 1 && is_store)
return false;
if (util_format_is_depth_or_stencil(format))
ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb);
}
+
+/* Try to perform the blit with a compute shader instead of the gfx pipeline.
+ * Returns false (without side effects) if the blit can't be done on the compute
+ * path, in which case the caller should fall back to the gfx blit (u_blitter).
+ */
+bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info)
+{
+ /* Compute blits require D16 right now (see the ISA).
+ *
+ * Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
+ * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
+ * for that because it's mostly just blits.
+ *
+ * TODO: benchmark the performance on gfx11
+ */
+ if (sctx->gfx_level < GFX11)
+ return false;
+
+ /* Reject src/dst format + sample-count + DCC combinations the compute path can't handle. */
+ if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true,
+ vi_dcc_enabled((struct si_texture*)info->dst.resource,
+ info->dst.level)) ||
+ !si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false,
+ vi_dcc_enabled((struct si_texture*)info->src.resource,
+ info->src.level)))
+ return false;
+
+ /* The compute shader implements neither blending, window rectangles, scissoring,
+ * nor scaling; negative src width/height only encode X/Y flips (handled in the shader).
+ */
+ if (info->alpha_blend ||
+ info->num_window_rectangles ||
+ info->scissor_enable ||
+ /* No scaling. */
+ info->dst.box.width != abs(info->src.box.width) ||
+ info->dst.box.height != abs(info->src.box.height) ||
+ info->dst.box.depth != abs(info->src.box.depth))
+ return false;
+
+ /* Only width/height can be flipped (negative); depth is never negative. */
+ assert(info->src.box.depth >= 0);
+
+ /* Shader images. image[0] = src (read, binding 0), image[1] = dst (write, binding 1),
+ * matching the binding order used by si_create_blit_cs.
+ */
+ struct pipe_image_view image[2];
+ image[0].resource = info->src.resource;
+ image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+ image[0].format = info->src.format;
+ image[0].u.tex.level = info->src.level;
+ image[0].u.tex.first_layer = 0;
+ image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level);
+
+ image[1].resource = info->dst.resource;
+ image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+ image[1].format = info->dst.format;
+ image[1].u.tex.level = info->dst.level;
+ image[1].u.tex.first_layer = 0;
+ image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
+
+ /* Get the shader key. */
+ const struct util_format_description *dst_desc = util_format_description(info->dst.format);
+ /* NOTE(review): assumes dst.format has a non-void channel (the helper returns -1
+ * otherwise) — presumably guaranteed by si_can_use_compute_blit above; confirm. */
+ unsigned i = util_format_get_first_non_void_channel(info->dst.format);
+ union si_compute_blit_shader_key options;
+ options.key = 0;
+
+ options.always_true = true;
+ options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
+ info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
+ options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
+ info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
+ options.src_is_msaa = info->src.resource->nr_samples > 1;
+ options.dst_is_msaa = info->dst.resource->nr_samples > 1;
+ /* Resolving integer formats only copies sample 0. log2_samples is then unused. */
+ options.sample0_only = options.src_is_msaa && !options.dst_is_msaa &&
+ util_format_is_pure_integer(info->src.format);
+ unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples);
+ options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples);
+ options.flip_x = info->src.box.width < 0;
+ options.flip_y = info->src.box.height < 0;
+ options.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
+ util_format_is_pure_uint(info->dst.format);
+ options.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
+ util_format_is_pure_sint(info->dst.format);
+ options.dst_is_srgb = util_format_is_srgb(info->dst.format);
+ /* Round to FP16 with RTZ for <=10-bit normalized and FP16 destinations, to match
+ * the pixel-shader (gfx blit) output behavior. */
+ options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) &&
+ (dst_desc->channel[i].size <= 10 ||
+ (dst_desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT &&
+ dst_desc->channel[i].size <= 16));
+
+ /* The key uniquely identifies a shader variant; compiled variants are cached by key. */
+ struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
+ (void*)(uintptr_t)options.key);
+ void *shader = entry ? entry->data : NULL;
+ if (!shader) {
+ shader = si_create_blit_cs(sctx, &options);
+ _mesa_hash_table_insert(sctx->cs_blit_shaders,
+ (void*)(uintptr_t)options.key, shader);
+ }
+
+ /* Pack the src/dst box origins as pairs of 16-bit values; the shader unpacks them
+ * with unpack_2x16_signed. NOTE(review): assumes box origins fit in 16 bits — confirm
+ * against max texture dimensions. */
+ sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
+ sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
+ sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
+
+ /* One thread per dst pixel, 8x8 workgroups. */
+ struct pipe_grid_info grid = {0};
+ set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height, info->dst.box.depth);
+
+ si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
+ SI_OP_SYNC_BEFORE_AFTER |
+ (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
+ return true;
+}
*y = nir_ushr(b, src, nir_imm_int(b, 16));
}
+/* Unpack one 32-bit value into two sign-extended 32-bit ints taken from its low and
+ * high 16-bit halves. Counterpart of the CPU-side packing of cs_user_data in
+ * si_compute_blit (signed because box origins/coords can be negative after flipping).
+ */
+static void unpack_2x16_signed(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_ssa_def **y)
+{
+ /* Low half: truncate to 16 bits, then sign-extend back to 32. */
+ *x = nir_i2i32(b, nir_u2u16(b, src));
+ /* High half: arithmetic right shift sign-extends in one step. */
+ *y = nir_ishr(b, src, nir_imm_int(b, 16));
+}
+
static nir_ssa_def *
deref_ssa(nir_builder *b, nir_variable *var)
{
return create_shader_state(sctx, b.shader);
}
+
+/* Encode a linear RGBA color into sRGB using the standard piecewise transfer function
+ * (linear segment below 0.0031308, 1.055*x^(1/2.4)-0.055 above). Applied to RGB only;
+ * alpha is passed through unchanged.
+ */
+static nir_ssa_def *convert_linear_to_srgb(nir_builder *b, nir_ssa_def *input)
+{
+ /* There are small precision differences compared to CB, so the gfx blit will return slightly
+ * different results.
+ */
+ /* Per-channel predicate: true selects the linear (low) segment. */
+ nir_ssa_def *cmp[3];
+ for (unsigned i = 0; i < 3; i++)
+ cmp[i] = nir_flt(b, nir_channel(b, input, i), nir_imm_float(b, 0.0031308));
+
+ /* Low segment: 12.92 * x. */
+ nir_ssa_def *ltvals[3];
+ for (unsigned i = 0; i < 3; i++)
+ ltvals[i] = nir_fmul(b, nir_channel(b, input, i), nir_imm_float(b, 12.92));
+
+ /* High segment: 1.055 * x^(1/2.4) - 0.055. */
+ nir_ssa_def *gtvals[3];
+
+ for (unsigned i = 0; i < 3; i++) {
+ gtvals[i] = nir_fpow(b, nir_channel(b, input, i), nir_imm_float(b, 1.0/2.4));
+ gtvals[i] = nir_fmul(b, gtvals[i], nir_imm_float(b, 1.055));
+ gtvals[i] = nir_fsub(b, gtvals[i], nir_imm_float(b, 0.055));
+ }
+
+ /* Select per channel; alpha is not sRGB-encoded. */
+ nir_ssa_def *comp[4];
+ for (unsigned i = 0; i < 3; i++)
+ comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
+ comp[3] = nir_channel(b, input, 3);
+
+ return nir_vec(b, comp, 4);
+}
+
+/* Resolve (downsample) one texel of an MSAA image by averaging all samples.
+ * On pre-gfx11 a fast path loads only sample 0 when samples_identical reports that
+ * all samples are equal (common for FMASK-compressed surfaces). Averaging is only
+ * meaningful for float/normalized formats; pure-integer resolves take the
+ * sample0_only path in the caller instead of reaching this function.
+ */
+static nir_ssa_def *image_resolve_msaa(nir_builder *b, nir_variable *img, unsigned num_samples,
+ nir_ssa_def *coord, enum amd_gfx_level gfx_level)
+{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_ssa_def *result = NULL;
+ nir_variable *var = NULL;
+
+ /* Gfx11 doesn't support samples_identical, so we can't use it. */
+ if (gfx_level < GFX11) {
+ /* We need a local variable to get the result out of conditional branches in SSA. */
+ var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);
+
+ /* If all samples are identical, load only sample 0. */
+ nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
+ result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
+ nir_store_var(b, var, result, 0xf);
+
+ nir_push_else(b, NULL);
+ }
+
+ /* Average all samples. (the only options on gfx11) */
+ result = NULL;
+ for (unsigned i = 0; i < num_samples; i++) {
+ nir_ssa_def *sample = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
+ coord, nir_imm_int(b, i), zero);
+ result = result ? nir_fadd(b, result, sample) : sample;
+ }
+ result = nir_fmul_imm(b, result, 1.0 / num_samples); /* average the sum */
+
+ if (gfx_level < GFX11) {
+ /* Exit the conditional branch and get the result out of the branch. */
+ nir_store_var(b, var, result, 0xf);
+ nir_pop_if(b, NULL);
+ result = nir_load_var(b, var);
+ }
+
+ return result;
+}
+
+/* Apply the per-pixel output conversions selected by the shader key before the color
+ * is stored to the destination image.
+ */
+static nir_ssa_def *apply_blit_output_modifiers(nir_builder *b, nir_ssa_def *color,
+ const union si_compute_blit_shader_key *options)
+{
+ /* Signed -> unsigned: clamp negative values to 0. */
+ if (options->sint_to_uint)
+ color = nir_imax(b, color, nir_imm_int(b, 0));
+
+ /* Unsigned -> signed: clamp to INT32_MAX (umin treats the value as unsigned, so
+ * anything above INT32_MAX is clamped rather than wrapped). */
+ if (options->uint_to_sint)
+ color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
+
+ if (options->dst_is_srgb)
+ color = convert_linear_to_srgb(b, color);
+
+ /* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify
+ * the behavior of the whole shader by comparing it to the gfx blit.
+ */
+ if (options->fp16_rtz)
+ color = nir_f2f16_rtz(b, color);
+
+ return color;
+}
+
+/* The compute blit shader.
+ *
+ * Differences compared to u_blitter (the gfx blit):
+ * - u_blitter doesn't preserve NaNs, but the compute blit does
+ * - u_blitter has lower linear->SRGB precision because the CB block doesn't
+ * use FP32, but the compute blit does.
+ *
+ * Other than that, non-scaled blits are identical to u_blitter.
+ *
+ * Implementation details:
+ * - Out-of-bounds dst coordinates are not clamped at all. The hw drops
+ * out-of-bounds stores for us.
+ * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
+ * the image_size NIR intrinsic.
+ * - X/Y flipping just does this in the shader: -threadIDs - 1
+ * - MSAA copies are implemented but disabled because MSAA image stores don't
+ * work.
+ */
+void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options)
+{
+ const nir_shader_compiler_options *nir_options =
+ sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
+ "blit_non_scaled_cs");
+ b.shader->info.num_images = 2;
+ if (options->src_is_msaa)
+ BITSET_SET(b.shader->info.msaa_images, 0);
+ if (options->dst_is_msaa)
+ BITSET_SET(b.shader->info.msaa_images, 1);
+ /* Must match the grid launched by si_compute_blit (8x8x1 workgroups). */
+ b.shader->info.workgroup_size[0] = 8;
+ b.shader->info.workgroup_size[1] = 8;
+ b.shader->info.workgroup_size[2] = 1;
+ /* 3 user-data dwords: the packed src/dst box origins set in si_compute_blit. */
+ b.shader->info.cs.user_data_components_amd = 3;
+
+ /* img0 = src (binding 0), img1 = dst (binding 1); order matches the image views
+ * bound in si_compute_blit. */
+ const struct glsl_type *img_type[2] = {
+ glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D :
+ options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
+ /*is_array*/ true, GLSL_TYPE_FLOAT),
+ glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
+ options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
+ /*is_array*/ true, GLSL_TYPE_FLOAT),
+ };
+
+ nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
+ img_src->data.binding = 0;
+
+ nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
+ img_dst->data.binding = 1;
+
+ nir_ssa_def *zero = nir_imm_int(&b, 0);
+
+ /* Instructions. */
+ /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
+ nir_ssa_def *dst_xyz = get_global_ids(&b, 3);
+ nir_ssa_def *src_xyz = dst_xyz;
+
+ /* Flip src coordinates. */
+ for (unsigned i = 0; i < 2; i++) {
+ if (i ? options->flip_y : options->flip_x) {
+ /* x goes from 0 to (dim - 1).
+ * The flipped blit should load from -dim to -1.
+ * Therefore do: x = -x - 1;
+ */
+ nir_ssa_def *comp = nir_channel(&b, src_xyz, i);
+ comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
+ src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
+ }
+ }
+
+ /* Add box.xyz. */
+ nir_ssa_def *coord_src = NULL, *coord_dst = NULL;
+ unpack_2x16_signed(&b, nir_channels(&b, nir_load_user_data_amd(&b), 0x7),
+ &coord_src, &coord_dst);
+ coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
+ coord_src = nir_iadd(&b, coord_src, src_xyz);
+
+ /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
+ /* 1D sources only clamp X; otherwise clamp both X and Y. */
+ unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3;
+ nir_ssa_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
+ dim = nir_channels(&b, dim, src_clamp_channels);
+
+ /* CLAMP_TO_EDGE: clamp each selected channel to [0, dim - 1]. */
+ nir_ssa_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels);
+ coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0));
+ coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1));
+
+ for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++)
+ coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i);
+
+ /* Swizzle coordinates for 1D_ARRAY: (x, y=unused, layer) -> (x, layer). */
+ static unsigned swizzle_xz[] = {0, 2, 0, 0};
+
+ if (options->src_is_1d)
+ coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
+ if (options->dst_is_1d)
+ coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);
+
+ /* Coordinates must have 4 channels in NIR. */
+ coord_src = nir_pad_vector(&b, coord_src, 4);
+ coord_dst = nir_pad_vector(&b, coord_dst, 4);
+
+ /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */
+
+ /* Execute the image loads and stores. */
+ unsigned num_samples = 1 << options->log2_samples;
+ nir_ssa_def *color;
+
+ if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
+ /* MSAA resolving (downsampling). */
+ assert(num_samples > 1);
+ color = image_resolve_msaa(&b, img_src, num_samples, coord_src, sctx->gfx_level);
+ color = apply_blit_output_modifiers(&b, color, options);
+ nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+
+ } else if (options->src_is_msaa && options->dst_is_msaa) {
+ /* MSAA copy. */
+ nir_ssa_def *color[16];
+ assert(num_samples > 1);
+ /* Group loads together and then stores. */
+ for (unsigned i = 0; i < num_samples; i++) {
+ color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
+ nir_imm_int(&b, i), zero);
+ }
+ for (unsigned i = 0; i < num_samples; i++)
+ color[i] = apply_blit_output_modifiers(&b, color[i], options);
+ for (unsigned i = 0; i < num_samples; i++) {
+ nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
+ nir_imm_int(&b, i), color[i], zero);
+ }
+ } else if (!options->src_is_msaa && options->dst_is_msaa) {
+ /* MSAA upsampling: replicate the single src texel into every dst sample. */
+ assert(num_samples > 1);
+ color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+ color = apply_blit_output_modifiers(&b, color, options);
+ for (unsigned i = 0; i < num_samples; i++) {
+ nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
+ nir_imm_int(&b, i), color, zero);
+ }
+ } else {
+ /* Non-MSAA copy or read sample 0 only. */
+ /* src2 = sample_index (zero), src3 = lod (zero) */
+ assert(num_samples == 1);
+ color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
+ color = apply_blit_output_modifiers(&b, color, options);
+ nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
+ }
+
+ return create_shader_state(sctx, b.shader);
+}