From: Marek Olšák Date: Tue, 19 Jul 2022 09:08:23 +0000 (-0400) Subject: radeonsi: implement a non-scaled compute blit+resolve and use it on gfx11 X-Git-Tag: upstream/22.3.5~5253 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=49237c0eb3eaae799d9ee7703788d2573df21e13;p=platform%2Fupstream%2Fmesa.git radeonsi: implement a non-scaled compute blit+resolve and use it on gfx11 This was written from scratch. Only a few pieces were taken from Indrajit's code. This is also much simpler, and hopefully easier to review. For example, out-of-bounds coordinates are handled trivially. The new blit test proves that this is identical to u_blitter except for a few precision differences (NaNs, sRGB) where the compute blit should be more precise. This is only enabled on gfx11 because it's slower than the gfx blit on gfx10. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 1187377..8f461d4 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -1219,6 +1219,9 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) return; } + if (si_compute_blit(sctx, info)) + return; + si_gfx_blit(ctx, info); } diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index f31cf3a..e20d47f 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -35,7 +35,17 @@ static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format fo if (format == PIPE_FORMAT_A8R8_UNORM && is_store) return false; - if (num_samples > 1) + /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK + * expand pass doesn't work, but let's use the gfx blit, which should be faster because + * it doesn't require expanding the FMASK. + * + * TODO: Broken MSAA stores can cause app issues, though this issue might only affect + * internal blits, not sure. + * + * EQAA image stores are also unimplemented, which should be rejected here after MSAA + * image stores are fixed. + */ + if (num_samples > 1 && is_store) return false; if (util_format_is_depth_or_stencil(format)) @@ -1014,3 +1024,103 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb); } + +bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info) +{ + /* Compute blits require D16 right now (see the ISA). + * + * Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit. + * The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test + * for that because it's mostly just blits. + * + * TODO: benchmark the performance on gfx11 + */ + if (sctx->gfx_level < GFX11) + return false; + + if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true, + vi_dcc_enabled((struct si_texture*)info->dst.resource, + info->dst.level)) || + !si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false, + vi_dcc_enabled((struct si_texture*)info->src.resource, + info->src.level))) + return false; + + if (info->alpha_blend || + info->num_window_rectangles || + info->scissor_enable || + /* No scaling. */ + info->dst.box.width != abs(info->src.box.width) || + info->dst.box.height != abs(info->src.box.height) || + info->dst.box.depth != abs(info->src.box.depth)) + return false; + + assert(info->src.box.depth >= 0); + + /* Shader images. */ + struct pipe_image_view image[2]; + image[0].resource = info->src.resource; + image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ; + image[0].format = info->src.format; + image[0].u.tex.level = info->src.level; + image[0].u.tex.first_layer = 0; + image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level); + + image[1].resource = info->dst.resource; + image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE; + image[1].format = info->dst.format; + image[1].u.tex.level = info->dst.level; + image[1].u.tex.first_layer = 0; + image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level); + + /* Get the shader key. */ + const struct util_format_description *dst_desc = util_format_description(info->dst.format); + unsigned i = util_format_get_first_non_void_channel(info->dst.format); + union si_compute_blit_shader_key options; + options.key = 0; + + options.always_true = true; + options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D || + info->src.resource->target == PIPE_TEXTURE_1D_ARRAY; + options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D || + info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY; + options.src_is_msaa = info->src.resource->nr_samples > 1; + options.dst_is_msaa = info->dst.resource->nr_samples > 1; + /* Resolving integer formats only copies sample 0. log2_samples is then unused. */ + options.sample0_only = options.src_is_msaa && !options.dst_is_msaa && + util_format_is_pure_integer(info->src.format); + unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples); + options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples); + options.flip_x = info->src.box.width < 0; + options.flip_y = info->src.box.height < 0; + options.sint_to_uint = util_format_is_pure_sint(info->src.format) && + util_format_is_pure_uint(info->dst.format); + options.uint_to_sint = util_format_is_pure_uint(info->src.format) && + util_format_is_pure_sint(info->dst.format); + options.dst_is_srgb = util_format_is_srgb(info->dst.format); + options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) && + (dst_desc->channel[i].size <= 10 || + (dst_desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT && + dst_desc->channel[i].size <= 16)); + + struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders, + (void*)(uintptr_t)options.key); + void *shader = entry ? entry->data : NULL; + if (!shader) { + shader = si_create_blit_cs(sctx, &options); + _mesa_hash_table_insert(sctx->cs_blit_shaders, + (void*)(uintptr_t)options.key, shader); + } + + sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16); + sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16); + sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16); + + struct pipe_grid_info grid = {0}; + set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height, info->dst.box.depth); + + si_launch_grid_internal_images(sctx, image, 2, &grid, shader, + SI_OP_SYNC_BEFORE_AFTER | + (info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0)); + return true; +} diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 39f6803..9d82907 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -356,6 +356,13 @@ static void si_destroy_context(struct pipe_context *context) if (!(sctx->context_flags & SI_CONTEXT_FLAG_AUX)) p_atomic_dec(&context->screen->num_contexts); + if (sctx->cs_blit_shaders) { + hash_table_foreach(sctx->cs_blit_shaders, entry) { + context->delete_compute_state(context, entry->data); + } + _mesa_hash_table_destroy(sctx->cs_blit_shaders, NULL); + } + FREE(sctx); } @@ -827,6 +834,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign } sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw; + + sctx->cs_blit_shaders = _mesa_hash_table_create_u32_keys(NULL); + if (!sctx->cs_blit_shaders) + goto fail; + return &sctx->b; fail: fprintf(stderr, "radeonsi: Failed to create a context.\n"); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 691e125..2e5ec4c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -984,6 +984,7 @@ struct si_context { void *cs_clear_12bytes_buffer; void *cs_dcc_retile[32]; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ + struct hash_table *cs_blit_shaders; struct si_screen *screen; struct util_debug_callback debug; struct ac_llvm_compiler compiler; /* only non-threaded compilation */ @@ -1436,6 +1437,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex); void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value, unsigned flags, enum si_coherency coher); void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex); +bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info); void si_init_compute_blit_functions(struct si_context *sctx); /* si_cp_dma.c */ @@ -1547,6 +1549,31 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); +union si_compute_blit_shader_key { + struct { + /* The key saved in _mesa_hash_table_create_u32_keys() can't be 0. */ + bool always_true:1; + /* Declaration modifiers. */ + bool src_is_1d:1; + bool dst_is_1d:1; + bool src_is_msaa:1; + bool dst_is_msaa:1; + uint8_t log2_samples:4; + bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */ + /* Source coordinate modifiers. */ + bool flip_x:1; + bool flip_y:1; + /* Output modifiers. */ + bool sint_to_uint:1; + bool uint_to_sint:1; + bool dst_is_srgb:1; + bool fp16_rtz:1; /* only for equality with pixel shaders, not necessary otherwise */ + }; + uint32_t key; +}; + +void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options); + /* si_shaderlib_tgsi.c */ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index f92218b..2c426cb 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -71,6 +71,12 @@ static void unpack_2x16(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_s *y = nir_ushr(b, src, nir_imm_int(b, 16)); } +static void unpack_2x16_signed(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_ssa_def **y) +{ + *x = nir_i2i32(b, nir_u2u16(b, src)); + *y = nir_ishr(b, src, nir_imm_int(b, 16)); +} + static nir_ssa_def * deref_ssa(nir_builder *b, nir_variable *var) { @@ -347,3 +353,242 @@ void *si_create_passthrough_tcs(struct si_context *sctx) return create_shader_state(sctx, b.shader); } + +static nir_ssa_def *convert_linear_to_srgb(nir_builder *b, nir_ssa_def *input) +{ + /* There are small precision differences compared to CB, so the gfx blit will return slightly + * different results. + */ + nir_ssa_def *cmp[3]; + for (unsigned i = 0; i < 3; i++) + cmp[i] = nir_flt(b, nir_channel(b, input, i), nir_imm_float(b, 0.0031308)); + + nir_ssa_def *ltvals[3]; + for (unsigned i = 0; i < 3; i++) + ltvals[i] = nir_fmul(b, nir_channel(b, input, i), nir_imm_float(b, 12.92)); + + nir_ssa_def *gtvals[3]; + + for (unsigned i = 0; i < 3; i++) { + gtvals[i] = nir_fpow(b, nir_channel(b, input, i), nir_imm_float(b, 1.0/2.4)); + gtvals[i] = nir_fmul(b, gtvals[i], nir_imm_float(b, 1.055)); + gtvals[i] = nir_fsub(b, gtvals[i], nir_imm_float(b, 0.055)); + } + + nir_ssa_def *comp[4]; + for (unsigned i = 0; i < 3; i++) + comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]); + comp[3] = nir_channel(b, input, 3); + + return nir_vec(b, comp, 4); +} + +static nir_ssa_def *image_resolve_msaa(nir_builder *b, nir_variable *img, unsigned num_samples, + nir_ssa_def *coord, enum amd_gfx_level gfx_level) +{ + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_ssa_def *result = NULL; + nir_variable *var = NULL; + + /* Gfx11 doesn't support samples_identical, so we can't use it. */ + if (gfx_level < GFX11) { + /* We need a local variable to get the result out of conditional branches in SSA. */ + var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL); + + /* If all samples are identical, load only sample 0. */ + nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord)); + result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero); + nir_store_var(b, var, result, 0xf); + + nir_push_else(b, NULL); + } + + /* Average all samples. (the only options on gfx11) */ + result = NULL; + for (unsigned i = 0; i < num_samples; i++) { + nir_ssa_def *sample = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), + coord, nir_imm_int(b, i), zero); + result = result ? nir_fadd(b, result, sample) : sample; + } + result = nir_fmul_imm(b, result, 1.0 / num_samples); /* average the sum */ + + if (gfx_level < GFX11) { + /* Exit the conditional branch and get the result out of the branch. */ + nir_store_var(b, var, result, 0xf); + nir_pop_if(b, NULL); + result = nir_load_var(b, var); + } + + return result; +} + +static nir_ssa_def *apply_blit_output_modifiers(nir_builder *b, nir_ssa_def *color, + const union si_compute_blit_shader_key *options) +{ + if (options->sint_to_uint) + color = nir_imax(b, color, nir_imm_int(b, 0)); + + if (options->uint_to_sint) + color = nir_umin(b, color, nir_imm_int(b, INT32_MAX)); + + if (options->dst_is_srgb) + color = convert_linear_to_srgb(b, color); + + /* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify + * the behavior of the whole shader by comparing it to the gfx blit. + */ + if (options->fp16_rtz) + color = nir_f2f16_rtz(b, color); + + return color; +} + +/* The compute blit shader. + * + * Differences compared to u_blitter (the gfx blit): + * - u_blitter doesn't preserve NaNs, but the compute blit does + * - u_blitter has lower linear->SRGB precision because the CB block doesn't + * use FP32, but the compute blit does. + * + * Other than that, non-scaled blits are identical to u_blitter. + * + * Implementation details: + * - Out-of-bounds dst coordinates are not clamped at all. The hw drops + * out-of-bounds stores for us. + * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using + * the image_size NIR intrinsic. + * - X/Y flipping just does this in the shader: -threadIDs - 1 + * - MSAA copies are implemented but disabled because MSAA image stores don't + * work. + */ +void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options) +{ + const nir_shader_compiler_options *nir_options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options, + "blit_non_scaled_cs"); + b.shader->info.num_images = 2; + if (options->src_is_msaa) + BITSET_SET(b.shader->info.msaa_images, 0); + if (options->dst_is_msaa) + BITSET_SET(b.shader->info.msaa_images, 1); + b.shader->info.workgroup_size[0] = 8; + b.shader->info.workgroup_size[1] = 8; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.cs.user_data_components_amd = 3; + + const struct glsl_type *img_type[2] = { + glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D : + options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, + /*is_array*/ true, GLSL_TYPE_FLOAT), + glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D : + options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, + /*is_array*/ true, GLSL_TYPE_FLOAT), + }; + + nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0"); + img_src->data.binding = 0; + + nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1"); + img_dst->data.binding = 1; + + nir_ssa_def *zero = nir_imm_int(&b, 0); + + /* Instructions. */ + /* Let's work with 0-based src and dst coordinates (thread IDs) first. */ + nir_ssa_def *dst_xyz = get_global_ids(&b, 3); + nir_ssa_def *src_xyz = dst_xyz; + + /* Flip src coordinates. */ + for (unsigned i = 0; i < 2; i++) { + if (i ? options->flip_y : options->flip_x) { + /* x goes from 0 to (dim - 1). + * The flipped blit should load from -dim to -1. + * Therefore do: x = -x - 1; + */ + nir_ssa_def *comp = nir_channel(&b, src_xyz, i); + comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1); + src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i); + } + } + + /* Add box.xyz. */ + nir_ssa_def *coord_src = NULL, *coord_dst = NULL; + unpack_2x16_signed(&b, nir_channels(&b, nir_load_user_data_amd(&b), 0x7), + &coord_src, &coord_dst); + coord_dst = nir_iadd(&b, coord_dst, dst_xyz); + coord_src = nir_iadd(&b, coord_src, src_xyz); + + /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */ + unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3; + nir_ssa_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero); + dim = nir_channels(&b, dim, src_clamp_channels); + + nir_ssa_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels); + coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0)); + coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1)); + + for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++) + coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i); + + /* Swizzle coordinates for 1D_ARRAY. */ + static unsigned swizzle_xz[] = {0, 2, 0, 0}; + + if (options->src_is_1d) + coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4); + if (options->dst_is_1d) + coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4); + + /* Coordinates must have 4 channels in NIR. */ + coord_src = nir_pad_vector(&b, coord_src, 4); + coord_dst = nir_pad_vector(&b, coord_dst, 4); + + /* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */ + + /* Execute the image loads and stores. */ + unsigned num_samples = 1 << options->log2_samples; + nir_ssa_def *color; + + if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) { + /* MSAA resolving (downsampling). */ + assert(num_samples > 1); + color = image_resolve_msaa(&b, img_src, num_samples, coord_src, sctx->gfx_level); + color = apply_blit_output_modifiers(&b, color, options); + nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero); + + } else if (options->src_is_msaa && options->dst_is_msaa) { + /* MSAA copy. */ + nir_ssa_def *color[16]; + assert(num_samples > 1); + /* Group loads together and then stores. */ + for (unsigned i = 0; i < num_samples; i++) { + color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, + nir_imm_int(&b, i), zero); + } + for (unsigned i = 0; i < num_samples; i++) + color[i] = apply_blit_output_modifiers(&b, color[i], options); + for (unsigned i = 0; i < num_samples; i++) { + nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, + nir_imm_int(&b, i), color[i], zero); + } + } else if (!options->src_is_msaa && options->dst_is_msaa) { + /* MSAA upsampling. */ + assert(num_samples > 1); + color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero); + color = apply_blit_output_modifiers(&b, color, options); + for (unsigned i = 0; i < num_samples; i++) { + nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, + nir_imm_int(&b, i), color, zero); + } + } else { + /* Non-MSAA copy or read sample 0 only. */ + /* src2 = sample_index (zero), src3 = lod (zero) */ + assert(num_samples == 1); + color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero); + color = apply_blit_output_modifiers(&b, color, options); + nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero); + } + + return create_shader_state(sctx, b.shader); +} diff --git a/src/gallium/drivers/radeonsi/si_test_image_copy_region.c b/src/gallium/drivers/radeonsi/si_test_image_copy_region.c index 2d9ca7f..94918f9 100644 --- a/src/gallium/drivers/radeonsi/si_test_image_copy_region.c +++ b/src/gallium/drivers/radeonsi/si_test_image_copy_region.c @@ -952,7 +952,7 @@ void si_test_blit(struct si_screen *sscreen, unsigned test_flags) if (only_cb_resolve) success = si_msaa_resolve_blit_via_CB(ctx, &info); else - success = false; + success = si_compute_blit(sctx, &info); if (success) { printf(" %-7s", only_cb_resolve ? "resolve" : "comp");