From e5fb9dca2a15c57d8216e0fc2260a200df3049e9 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Mon, 7 Sep 2020 09:58:36 +0200 Subject: [PATCH] amd/common: switch to 3-spaces style MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Follow-up of !4319 using the same clang-format config. Acked-by: Samuel Pitoiset Acked-by: Marek Olšák Part-of: --- src/amd/common/.editorconfig | 3 - src/amd/common/ac_binary.c | 215 +- src/amd/common/ac_binary.h | 36 +- src/amd/common/ac_debug.c | 1311 ++++++----- src/amd/common/ac_debug.h | 54 +- src/amd/common/ac_exp_param.h | 21 +- src/amd/common/ac_gpu_info.c | 2339 ++++++++++--------- src/amd/common/ac_gpu_info.h | 342 ++- src/amd/common/ac_rtld.c | 1298 ++++++----- src/amd/common/ac_rtld.h | 111 +- src/amd/common/ac_shader_args.c | 45 +- src/amd/common/ac_shader_args.h | 139 +- src/amd/common/ac_shader_util.c | 494 +++-- src/amd/common/ac_shader_util.h | 81 +- src/amd/common/ac_shadowed_regs.c | 3374 ++++++++++++++-------------- src/amd/common/ac_shadowed_regs.h | 10 +- src/amd/common/ac_surface.c | 4196 +++++++++++++++++------------------ src/amd/common/ac_surface.h | 446 ++-- src/amd/common/amd_family.h | 217 +- src/amd/common/amd_kernel_code_t.h | 589 ++--- src/amd/common/gfx10_format_table.h | 13 +- src/amd/common/sid.h | 475 ++-- 22 files changed, 7814 insertions(+), 7995 deletions(-) delete mode 100644 src/amd/common/.editorconfig diff --git a/src/amd/common/.editorconfig b/src/amd/common/.editorconfig deleted file mode 100644 index 21a3c7d..0000000 --- a/src/amd/common/.editorconfig +++ /dev/null @@ -1,3 +0,0 @@ -[*.{c,h}] -indent_style = tab -indent_size = tab diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c index 4651c06..9306869 100644 --- a/src/amd/common/ac_binary.c +++ b/src/amd/common/ac_binary.c @@ -21,132 +21,129 @@ * SOFTWARE. */ -#include "ac_gpu_info.h" #include "ac_binary.h" +#include "ac_gpu_info.h" #include "util/u_math.h" #include "util/u_memory.h" #include #include -#include - #include +#include -#define SPILLED_SGPRS 0x4 -#define SPILLED_VGPRS 0x8 +#define SPILLED_SGPRS 0x4 +#define SPILLED_VGPRS 0x8 /* Parse configuration data in .AMDGPU.config section format. 
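Each entry is an 8-byte pair: a register offset dword followed by a value dword.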
*/ -void ac_parse_shader_binary_config(const char *data, size_t nbytes, - unsigned wave_size, - bool really_needs_scratch, - const struct radeon_info *info, - struct ac_shader_config *conf) +void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size, + bool really_needs_scratch, const struct radeon_info *info, + struct ac_shader_config *conf) { - uint32_t scratch_size = 0; + uint32_t scratch_size = 0; - for (size_t i = 0; i < nbytes; i += 8) { - unsigned reg = util_le32_to_cpu(*(uint32_t*)(data + i)); - unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + 4)); - switch (reg) { - case R_00B028_SPI_SHADER_PGM_RSRC1_PS: - case R_00B128_SPI_SHADER_PGM_RSRC1_VS: - case R_00B228_SPI_SHADER_PGM_RSRC1_GS: - case R_00B848_COMPUTE_PGM_RSRC1: - case R_00B428_SPI_SHADER_PGM_RSRC1_HS: - if (wave_size == 32) - conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8); - else - conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); + for (size_t i = 0; i < nbytes; i += 8) { + unsigned reg = util_le32_to_cpu(*(uint32_t *)(data + i)); + unsigned value = util_le32_to_cpu(*(uint32_t *)(data + i + 4)); + switch (reg) { + case R_00B028_SPI_SHADER_PGM_RSRC1_PS: + case R_00B128_SPI_SHADER_PGM_RSRC1_VS: + case R_00B228_SPI_SHADER_PGM_RSRC1_GS: + case R_00B848_COMPUTE_PGM_RSRC1: + case R_00B428_SPI_SHADER_PGM_RSRC1_HS: + if (wave_size == 32) + conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8); + else + conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); - conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); - /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */ - conf->float_mode = G_00B028_FLOAT_MODE(value); - conf->rsrc1 = value; - break; - case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: - conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); - /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */ - conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value); - conf->rsrc2 = value; - break; - case R_00B12C_SPI_SHADER_PGM_RSRC2_VS: - conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value); - conf->rsrc2 = value; - break; - case R_00B22C_SPI_SHADER_PGM_RSRC2_GS: - conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value); - conf->rsrc2 = value; - break; - case R_00B42C_SPI_SHADER_PGM_RSRC2_HS: - conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value); - conf->rsrc2 = value; - break; - case R_00B84C_COMPUTE_PGM_RSRC2: - conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); - conf->rsrc2 = value; - break; - case R_00B8A0_COMPUTE_PGM_RSRC3: - conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value); - conf->rsrc3 = value; - break; - case R_0286CC_SPI_PS_INPUT_ENA: - conf->spi_ps_input_ena = value; - break; - case R_0286D0_SPI_PS_INPUT_ADDR: - conf->spi_ps_input_addr = value; - break; - case R_0286E8_SPI_TMPRING_SIZE: - case R_00B860_COMPUTE_TMPRING_SIZE: - /* WAVESIZE is in units of 256 dwords. 
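The per-wave scratch allocation is therefore WAVESIZE * 256 dwords, i.e. WAVESIZE * 256 * 4 bytes, as computed for scratch_bytes_per_wave below.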
*/ - scratch_size = value; - break; - case SPILLED_SGPRS: - conf->spilled_sgprs = value; - break; - case SPILLED_VGPRS: - conf->spilled_vgprs = value; - break; - default: - { - static bool printed; + conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); + /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */ + conf->float_mode = G_00B028_FLOAT_MODE(value); + conf->rsrc1 = value; + break; + case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: + conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); + /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */ + conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B12C_SPI_SHADER_PGM_RSRC2_VS: + conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B22C_SPI_SHADER_PGM_RSRC2_GS: + conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B42C_SPI_SHADER_PGM_RSRC2_HS: + conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value); + conf->rsrc2 = value; + break; + case R_00B84C_COMPUTE_PGM_RSRC2: + conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value)); + conf->rsrc2 = value; + break; + case R_00B8A0_COMPUTE_PGM_RSRC3: + conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value); + conf->rsrc3 = value; + break; + case R_0286CC_SPI_PS_INPUT_ENA: + conf->spi_ps_input_ena = value; + break; + case R_0286D0_SPI_PS_INPUT_ADDR: + conf->spi_ps_input_addr = value; + break; + case R_0286E8_SPI_TMPRING_SIZE: + case R_00B860_COMPUTE_TMPRING_SIZE: + /* WAVESIZE is in units of 256 dwords. */ + scratch_size = value; + break; + case SPILLED_SGPRS: + conf->spilled_sgprs = value; + break; + case SPILLED_VGPRS: + conf->spilled_vgprs = value; + break; + default: { + static bool printed; - if (!printed) { - fprintf(stderr, "Warning: LLVM emitted unknown " - "config register: 0x%x\n", reg); - printed = true; - } - } - break; - } - } + if (!printed) { + fprintf(stderr, + "Warning: LLVM emitted unknown " + "config register: 0x%x\n", + reg); + printed = true; + } + } break; + } + } - if (!conf->spi_ps_input_addr) - conf->spi_ps_input_addr = conf->spi_ps_input_ena; + if (!conf->spi_ps_input_addr) + conf->spi_ps_input_addr = conf->spi_ps_input_ena; - if (really_needs_scratch) { - /* sgprs spills aren't spilling */ - conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4; - } + if (really_needs_scratch) { + /* sgprs spills aren't spilling */ + conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4; + } - /* GFX 10.3 internally: - * - aligns VGPRS to 16 for Wave32 and 8 for Wave64 - * - aligns LDS to 1024 - * - * For shader-db stats, set num_vgprs that the hw actually uses. - */ - if (info->chip_class >= GFX10_3) { - conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8); - } + /* GFX 10.3 internally: + * - aligns VGPRS to 16 for Wave32 and 8 for Wave64 + * - aligns LDS to 1024 + * + * For shader-db stats, set num_vgprs that the hw actually uses. + */ + if (info->chip_class >= GFX10_3) { + conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8); + } - /* Enable 64-bit and 16-bit denormals, because there is no performance - * cost. 
- * - * Don't enable denormals for 32-bit floats, because: - * - denormals disable output modifiers - * - denormals break v_mad_f32 - * - GFX6 & GFX7 would be very slow - */ - conf->float_mode &= ~V_00B028_FP_ALL_DENORMS; - conf->float_mode |= V_00B028_FP_64_DENORMS; + /* Enable 64-bit and 16-bit denormals, because there is no performance + * cost. + * + * Don't enable denormals for 32-bit floats, because: + * - denormals disable output modifiers + * - denormals break v_mad_f32 + * - GFX6 & GFX7 would be very slow + */ + conf->float_mode &= ~V_00B028_FP_ALL_DENORMS; + conf->float_mode |= V_00B028_FP_64_DENORMS; } diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h index 0d98142..5eae2d5 100644 --- a/src/amd/common/ac_binary.h +++ b/src/amd/common/ac_binary.h @@ -24,9 +24,9 @@ #ifndef AC_BINARY_H #define AC_BINARY_H +#include #include #include -#include #ifdef __cplusplus extern "C" { @@ -35,26 +35,24 @@ extern "C" { struct radeon_info; struct ac_shader_config { - unsigned num_sgprs; - unsigned num_vgprs; - unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */ - unsigned spilled_sgprs; - unsigned spilled_vgprs; - unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */ - unsigned spi_ps_input_ena; - unsigned spi_ps_input_addr; - unsigned float_mode; - unsigned scratch_bytes_per_wave; - unsigned rsrc1; - unsigned rsrc2; - unsigned rsrc3; + unsigned num_sgprs; + unsigned num_vgprs; + unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */ + unsigned spilled_sgprs; + unsigned spilled_vgprs; + unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */ + unsigned spi_ps_input_ena; + unsigned spi_ps_input_addr; + unsigned float_mode; + unsigned scratch_bytes_per_wave; + unsigned rsrc1; + unsigned rsrc2; + unsigned rsrc3; }; -void ac_parse_shader_binary_config(const char *data, size_t nbytes, - unsigned wave_size, - bool really_needs_scratch, - const struct radeon_info *info, - struct ac_shader_config *conf); +void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size, + bool really_needs_scratch, const struct radeon_info *info, + struct ac_shader_config *conf); #ifdef __cplusplus } diff --git a/src/amd/common/ac_debug.c b/src/amd/common/ac_debug.c index f095b0b..bbaed82 100644 --- a/src/amd/common/ac_debug.c +++ b/src/amd/common/ac_debug.c @@ -24,15 +24,13 @@ #include "ac_debug.h" #ifdef HAVE_VALGRIND -#include #include +#include #define VG(x) x #else #define VG(x) ((void)0) #endif -#include - #include "sid.h" #include "sid_tables.h" #include "util/u_math.h" @@ -40,476 +38,455 @@ #include "util/u_string.h" #include +#include /* Parsed IBs are difficult to read without colors. Use "less -R file" to * read them, or use "aha -b -f file" to convert them to html. 
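The \035 bytes embedded in the output stream are internal markers that format_ib_output() later uses to compute nesting depth and indentation.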
*/ -#define COLOR_RESET "\033[0m" -#define COLOR_RED "\033[31m" -#define COLOR_GREEN "\033[1;32m" -#define COLOR_YELLOW "\033[1;33m" -#define COLOR_CYAN "\033[1;36m" +#define COLOR_RESET "\033[0m" +#define COLOR_RED "\033[31m" +#define COLOR_GREEN "\033[1;32m" +#define COLOR_YELLOW "\033[1;33m" +#define COLOR_CYAN "\033[1;36m" #define INDENT_PKT 8 struct ac_ib_parser { - FILE *f; - uint32_t *ib; - unsigned num_dw; - const int *trace_ids; - unsigned trace_id_count; - enum chip_class chip_class; - ac_debug_addr_callback addr_callback; - void *addr_callback_data; - - unsigned cur_dw; + FILE *f; + uint32_t *ib; + unsigned num_dw; + const int *trace_ids; + unsigned trace_id_count; + enum chip_class chip_class; + ac_debug_addr_callback addr_callback; + void *addr_callback_data; + + unsigned cur_dw; }; static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib); static void print_spaces(FILE *f, unsigned num) { - fprintf(f, "%*s", num, ""); + fprintf(f, "%*s", num, ""); } static void print_value(FILE *file, uint32_t value, int bits) { - /* Guess if it's int or float */ - if (value <= (1 << 15)) { - if (value <= 9) - fprintf(file, "%u\n", value); - else - fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value); - } else { - float f = uif(value); - - if (fabs(f) < 100000 && f*10 == floor(f*10)) - fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value); - else - /* Don't print more leading zeros than there are bits. */ - fprintf(file, "0x%0*x\n", bits / 4, value); - } + /* Guess if it's int or float */ + if (value <= (1 << 15)) { + if (value <= 9) + fprintf(file, "%u\n", value); + else + fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value); + } else { + float f = uif(value); + + if (fabs(f) < 100000 && f * 10 == floor(f * 10)) + fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value); + else + /* Don't print more leading zeros than there are bits. 
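The zero-padded width is bits / 4 hex digits, matching the field size.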
*/ + fprintf(file, "0x%0*x\n", bits / 4, value); + } } -static void print_named_value(FILE *file, const char *name, uint32_t value, - int bits) +static void print_named_value(FILE *file, const char *name, uint32_t value, int bits) { - print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name); - print_value(file, value, bits); + print_spaces(file, INDENT_PKT); + fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name); + print_value(file, value, bits); } static const struct si_reg *find_register(enum chip_class chip_class, unsigned offset) { - const struct si_reg *table; - unsigned table_size; - - switch (chip_class) { - case GFX10_3: - case GFX10: - table = gfx10_reg_table; - table_size = ARRAY_SIZE(gfx10_reg_table); - break; - case GFX9: - table = gfx9_reg_table; - table_size = ARRAY_SIZE(gfx9_reg_table); - break; - case GFX8: - table = gfx8_reg_table; - table_size = ARRAY_SIZE(gfx8_reg_table); - break; - case GFX7: - table = gfx7_reg_table; - table_size = ARRAY_SIZE(gfx7_reg_table); - break; - case GFX6: - table = gfx6_reg_table; - table_size = ARRAY_SIZE(gfx6_reg_table); - break; - default: - return NULL; - } - - for (unsigned i = 0; i < table_size; i++) { - const struct si_reg *reg = &table[i]; - - if (reg->offset == offset) - return reg; - } - - return NULL; + const struct si_reg *table; + unsigned table_size; + + switch (chip_class) { + case GFX10_3: + case GFX10: + table = gfx10_reg_table; + table_size = ARRAY_SIZE(gfx10_reg_table); + break; + case GFX9: + table = gfx9_reg_table; + table_size = ARRAY_SIZE(gfx9_reg_table); + break; + case GFX8: + table = gfx8_reg_table; + table_size = ARRAY_SIZE(gfx8_reg_table); + break; + case GFX7: + table = gfx7_reg_table; + table_size = ARRAY_SIZE(gfx7_reg_table); + break; + case GFX6: + table = gfx6_reg_table; + table_size = ARRAY_SIZE(gfx6_reg_table); + break; + default: + return NULL; + } + + for (unsigned i = 0; i < table_size; i++) { + const struct si_reg *reg = &table[i]; + + if (reg->offset == offset) + return reg; + } + + return NULL; } const char *ac_get_register_name(enum chip_class chip_class, unsigned offset) { - const struct si_reg *reg = find_register(chip_class, offset); + const struct si_reg *reg = find_register(chip_class, offset); - return reg ? sid_strings + reg->name_offset : "(no name)"; + return reg ? sid_strings + reg->name_offset : "(no name)"; } -void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, - uint32_t value, uint32_t field_mask) +void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value, + uint32_t field_mask) { - const struct si_reg *reg = find_register(chip_class, offset); - - if (reg) { - const char *reg_name = sid_strings + reg->name_offset; - bool first_field = true; - - print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", - reg_name); - - if (!reg->num_fields) { - print_value(file, value, 32); - return; - } - - for (unsigned f = 0; f < reg->num_fields; f++) { - const struct si_field *field = sid_fields_table + reg->fields_offset + f; - const int *values_offsets = sid_strings_offsets + field->values_offset; - uint32_t val = (value & field->mask) >> - (ffs(field->mask) - 1); - - if (!(field->mask & field_mask)) - continue; - - /* Indent the field. */ - if (!first_field) - print_spaces(file, - INDENT_PKT + strlen(reg_name) + 4); - - /* Print the field. 
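Enumerated values are printed by their symbolic name when one is known.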
*/ - fprintf(file, "%s = ", sid_strings + field->name_offset); - - if (val < field->num_values && values_offsets[val] >= 0) - fprintf(file, "%s\n", sid_strings + values_offsets[val]); - else - print_value(file, val, - util_bitcount(field->mask)); - - first_field = false; - } - return; - } - - print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value); + const struct si_reg *reg = find_register(chip_class, offset); + + if (reg) { + const char *reg_name = sid_strings + reg->name_offset; + bool first_field = true; + + print_spaces(file, INDENT_PKT); + fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", reg_name); + + if (!reg->num_fields) { + print_value(file, value, 32); + return; + } + + for (unsigned f = 0; f < reg->num_fields; f++) { + const struct si_field *field = sid_fields_table + reg->fields_offset + f; + const int *values_offsets = sid_strings_offsets + field->values_offset; + uint32_t val = (value & field->mask) >> (ffs(field->mask) - 1); + + if (!(field->mask & field_mask)) + continue; + + /* Indent the field. */ + if (!first_field) + print_spaces(file, INDENT_PKT + strlen(reg_name) + 4); + + /* Print the field. */ + fprintf(file, "%s = ", sid_strings + field->name_offset); + + if (val < field->num_values && values_offsets[val] >= 0) + fprintf(file, "%s\n", sid_strings + values_offsets[val]); + else + print_value(file, val, util_bitcount(field->mask)); + + first_field = false; + } + return; + } + + print_spaces(file, INDENT_PKT); + fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value); } static uint32_t ac_ib_get(struct ac_ib_parser *ib) { - uint32_t v = 0; + uint32_t v = 0; - if (ib->cur_dw < ib->num_dw) { - v = ib->ib[ib->cur_dw]; + if (ib->cur_dw < ib->num_dw) { + v = ib->ib[ib->cur_dw]; #ifdef HAVE_VALGRIND - /* Help figure out where garbage data is written to IBs. - * - * Arguably we should do this already when the IBs are written, - * see RADEON_VALGRIND. The problem is that client-requests to - * Valgrind have an overhead even when Valgrind isn't running, - * and radeon_emit is performance sensitive... - */ - if (VALGRIND_CHECK_VALUE_IS_DEFINED(v)) - fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage" - COLOR_RESET "\n"); + /* Help figure out where garbage data is written to IBs. + * + * Arguably we should do this already when the IBs are written, + * see RADEON_VALGRIND. The problem is that client-requests to + * Valgrind have an overhead even when Valgrind isn't running, + * and radeon_emit is performance sensitive... + */ + if (VALGRIND_CHECK_VALUE_IS_DEFINED(v)) + fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage" COLOR_RESET "\n"); #endif - fprintf(ib->f, "\n\035#%08x ", v); - } else { - fprintf(ib->f, "\n\035#???????? "); - } + fprintf(ib->f, "\n\035#%08x ", v); + } else { + fprintf(ib->f, "\n\035#???????? 
"); + } - ib->cur_dw++; - return v; + ib->cur_dw++; + return v; } static void ac_parse_set_reg_packet(FILE *f, unsigned count, unsigned reg_offset, - struct ac_ib_parser *ib) + struct ac_ib_parser *ib) { - unsigned reg_dw = ac_ib_get(ib); - unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset; - unsigned index = reg_dw >> 28; - int i; - - if (index != 0) { - print_spaces(f, INDENT_PKT); - fprintf(f, "INDEX = %u\n", index); - } - - for (i = 0; i < count; i++) - ac_dump_reg(f, ib->chip_class, reg + i*4, ac_ib_get(ib), ~0); + unsigned reg_dw = ac_ib_get(ib); + unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset; + unsigned index = reg_dw >> 28; + int i; + + if (index != 0) { + print_spaces(f, INDENT_PKT); + fprintf(f, "INDEX = %u\n", index); + } + + for (i = 0; i < count; i++) + ac_dump_reg(f, ib->chip_class, reg + i * 4, ac_ib_get(ib), ~0); } static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, int *current_trace_id) { - unsigned first_dw = ib->cur_dw; - int count = PKT_COUNT_G(header); - unsigned op = PKT3_IT_OPCODE_G(header); - const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : ""; - int i; - - /* Print the name first. */ - for (i = 0; i < ARRAY_SIZE(packet3_table); i++) - if (packet3_table[i].op == op) - break; - - if (i < ARRAY_SIZE(packet3_table)) { - const char *name = sid_strings + packet3_table[i].name_offset; - - if (op == PKT3_SET_CONTEXT_REG || - op == PKT3_SET_CONFIG_REG || - op == PKT3_SET_UCONFIG_REG || - op == PKT3_SET_UCONFIG_REG_INDEX || - op == PKT3_SET_SH_REG) - fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n", - name, predicate); - else - fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n", - name, predicate); - } else - fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n", - op, predicate); - - /* Print the contents. 
*/ - switch (op) { - case PKT3_SET_CONTEXT_REG: - ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib); - break; - case PKT3_SET_CONFIG_REG: - ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib); - break; - case PKT3_SET_UCONFIG_REG: - case PKT3_SET_UCONFIG_REG_INDEX: - ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib); - break; - case PKT3_SET_SH_REG: - ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib); - break; - case PKT3_ACQUIRE_MEM: - ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0); - print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); - if (ib->chip_class >= GFX10) - ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0); - break; - case PKT3_SURFACE_SYNC: - if (ib->chip_class >= GFX7) { - ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0); - } else { - ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0); - } - print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); - break; - case PKT3_EVENT_WRITE: { - uint32_t event_dw = ac_ib_get(ib); - ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, - S_028A90_EVENT_TYPE(~0)); - print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); - print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1); - if (count > 0) { - print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); - print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16); - } - break; - } - case PKT3_EVENT_WRITE_EOP: { - uint32_t event_dw = ac_ib_get(ib); - ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, - S_028A90_EVENT_TYPE(~0)); - print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); - print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1); - print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1); - print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1); - print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1); - print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1); - print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); - uint32_t addr_hi_dw = ac_ib_get(ib); - print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16); - print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2); - print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3); - print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3); - print_named_value(f, "DATA_LO", ac_ib_get(ib), 32); - print_named_value(f, "DATA_HI", ac_ib_get(ib), 32); - break; - } - case PKT3_RELEASE_MEM: { - uint32_t event_dw = ac_ib_get(ib); - if (ib->chip_class >= GFX10) { - ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u); - } else { - ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, - S_028A90_EVENT_TYPE(~0)); - print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); - print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1); - 
print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1); - print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1); - print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1); - print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1); - print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1); - print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1); - print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1); - } - uint32_t sel_dw = ac_ib_get(ib); - print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2); - print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3); - print_named_value(f, "DATA_SEL", sel_dw >> 29, 3); - print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); - print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32); - print_named_value(f, "DATA_LO", ac_ib_get(ib), 32); - print_named_value(f, "DATA_HI", ac_ib_get(ib), 32); - print_named_value(f, "CTXID", ac_ib_get(ib), 32); - break; - } - case PKT3_WAIT_REG_MEM: - print_named_value(f, "OP", ac_ib_get(ib), 32); - print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); - print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32); - print_named_value(f, "REF", ac_ib_get(ib), 32); - print_named_value(f, "MASK", ac_ib_get(ib), 32); - print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); - break; - case PKT3_DRAW_INDEX_AUTO: - ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0); - break; - case PKT3_DRAW_INDEX_2: - ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0); - break; - case PKT3_INDEX_TYPE: - ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0); - break; - case PKT3_NUM_INSTANCES: - ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0); - break; - case PKT3_WRITE_DATA: - ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0); - /* The payload is written automatically */ - break; - case PKT3_CP_DMA: - ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0); - break; - case PKT3_DMA_DATA: - ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0); - ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0); - break; - case PKT3_INDIRECT_BUFFER_SI: - case PKT3_INDIRECT_BUFFER_CONST: - case PKT3_INDIRECT_BUFFER_CIK: { - uint32_t base_lo_dw = ac_ib_get(ib); - ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, 
base_lo_dw, ~0); - uint32_t base_hi_dw = ac_ib_get(ib); - ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0); - uint32_t control_dw = ac_ib_get(ib); - ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0); - - if (!ib->addr_callback) - break; - - uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw; - void *data = ib->addr_callback(ib->addr_callback_data, addr); - if (!data) - break; - - if (G_3F2_CHAIN(control_dw)) { - ib->ib = data; - ib->num_dw = G_3F2_IB_SIZE(control_dw); - ib->cur_dw = 0; - return; - } - - struct ac_ib_parser ib_recurse; - memcpy(&ib_recurse, ib, sizeof(ib_recurse)); - ib_recurse.ib = data; - ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw); - ib_recurse.cur_dw = 0; - if(ib_recurse.trace_id_count) { - if (*current_trace_id == *ib->trace_ids) { - ++ib_recurse.trace_ids; - --ib_recurse.trace_id_count; - } else { - ib_recurse.trace_id_count = 0; - } - } - - fprintf(f, "\n\035>------------------ nested begin ------------------\n"); - ac_do_parse_ib(f, &ib_recurse); - fprintf(f, "\n\035<------------------- nested end -------------------\n"); - break; - } - case PKT3_CLEAR_STATE: - case PKT3_INCREMENT_DE_COUNTER: - case PKT3_PFP_SYNC_ME: - break; - case PKT3_NOP: - if (header == PKT3_NOP_PAD) { - count = -1; /* One dword NOP. */ - } else if (count == 0 && ib->cur_dw < ib->num_dw && - AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) { - unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]); - - print_spaces(f, INDENT_PKT); - fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id); - - if (!ib->trace_id_count) - break; /* tracing was disabled */ - - *current_trace_id = packet_id; - - print_spaces(f, INDENT_PKT); - if (packet_id < *ib->trace_ids) - fprintf(f, COLOR_RED - "This trace point was reached by the CP." - COLOR_RESET "\n"); - else if (packet_id == *ib->trace_ids) - fprintf(f, COLOR_RED - "!!!!! This is the last trace point that " - "was reached by the CP !!!!!" - COLOR_RESET "\n"); - else if (packet_id+1 == *ib->trace_ids) - fprintf(f, COLOR_RED - "!!!!! This is the first trace point that " - "was NOT been reached by the CP !!!!!" - COLOR_RESET "\n"); - else - fprintf(f, COLOR_RED - "!!!!! This trace point was NOT reached " - "by the CP !!!!!" - COLOR_RESET "\n"); - break; - } - break; - } - - /* print additional dwords */ - while (ib->cur_dw <= first_dw + count) - ac_ib_get(ib); - - if (ib->cur_dw > first_dw + count + 1) - fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!" - COLOR_RESET "\n"); + unsigned first_dw = ib->cur_dw; + int count = PKT_COUNT_G(header); + unsigned op = PKT3_IT_OPCODE_G(header); + const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : ""; + int i; + + /* Print the name first. */ + for (i = 0; i < ARRAY_SIZE(packet3_table); i++) + if (packet3_table[i].op == op) + break; + + if (i < ARRAY_SIZE(packet3_table)) { + const char *name = sid_strings + packet3_table[i].name_offset; + + if (op == PKT3_SET_CONTEXT_REG || op == PKT3_SET_CONFIG_REG || op == PKT3_SET_UCONFIG_REG || + op == PKT3_SET_UCONFIG_REG_INDEX || op == PKT3_SET_SH_REG) + fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n", name, predicate); + else + fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n", name, predicate); + } else + fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n", op, predicate); + + /* Print the contents. 
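Each known PKT3 opcode has its payload dwords decoded individually; any dwords not consumed here are dumped generically at the end of the function.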
*/ + switch (op) { + case PKT3_SET_CONTEXT_REG: + ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib); + break; + case PKT3_SET_CONFIG_REG: + ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib); + break; + case PKT3_SET_UCONFIG_REG: + case PKT3_SET_UCONFIG_REG_INDEX: + ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib); + break; + case PKT3_SET_SH_REG: + ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib); + break; + case PKT3_ACQUIRE_MEM: + ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0); + print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); + if (ib->chip_class >= GFX10) + ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0); + break; + case PKT3_SURFACE_SYNC: + if (ib->chip_class >= GFX7) { + ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0); + } else { + ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0); + } + print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); + break; + case PKT3_EVENT_WRITE: { + uint32_t event_dw = ac_ib_get(ib); + ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, + S_028A90_EVENT_TYPE(~0)); + print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); + print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1); + if (count > 0) { + print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); + print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16); + } + break; + } + case PKT3_EVENT_WRITE_EOP: { + uint32_t event_dw = ac_ib_get(ib); + ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, + S_028A90_EVENT_TYPE(~0)); + print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); + print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1); + print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1); + print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1); + print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1); + print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1); + print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); + uint32_t addr_hi_dw = ac_ib_get(ib); + print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16); + print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2); + print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3); + print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3); + print_named_value(f, "DATA_LO", ac_ib_get(ib), 32); + print_named_value(f, "DATA_HI", ac_ib_get(ib), 32); + break; + } + case PKT3_RELEASE_MEM: { + uint32_t event_dw = ac_ib_get(ib); + if (ib->chip_class >= GFX10) { + ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u); + } else { + ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw, + S_028A90_EVENT_TYPE(~0)); + print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4); + print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1); + 
print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1); + print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1); + print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1); + print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1); + print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1); + print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1); + print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1); + } + uint32_t sel_dw = ac_ib_get(ib); + print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2); + print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3); + print_named_value(f, "DATA_SEL", sel_dw >> 29, 3); + print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); + print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32); + print_named_value(f, "DATA_LO", ac_ib_get(ib), 32); + print_named_value(f, "DATA_HI", ac_ib_get(ib), 32); + print_named_value(f, "CTXID", ac_ib_get(ib), 32); + break; + } + case PKT3_WAIT_REG_MEM: + print_named_value(f, "OP", ac_ib_get(ib), 32); + print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32); + print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32); + print_named_value(f, "REF", ac_ib_get(ib), 32); + print_named_value(f, "MASK", ac_ib_get(ib), 32); + print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16); + break; + case PKT3_DRAW_INDEX_AUTO: + ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0); + break; + case PKT3_DRAW_INDEX_2: + ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0); + break; + case PKT3_INDEX_TYPE: + ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0); + break; + case PKT3_NUM_INSTANCES: + ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0); + break; + case PKT3_WRITE_DATA: + ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0); + /* The payload is written automatically */ + break; + case PKT3_CP_DMA: + ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0); + break; + case PKT3_DMA_DATA: + ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0); + ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0); + break; + case PKT3_INDIRECT_BUFFER_SI: + case PKT3_INDIRECT_BUFFER_CONST: + case PKT3_INDIRECT_BUFFER_CIK: { + uint32_t base_lo_dw = ac_ib_get(ib); + ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, 
base_lo_dw, ~0); + uint32_t base_hi_dw = ac_ib_get(ib); + ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0); + uint32_t control_dw = ac_ib_get(ib); + ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0); + + if (!ib->addr_callback) + break; + + uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw; + void *data = ib->addr_callback(ib->addr_callback_data, addr); + if (!data) + break; + + if (G_3F2_CHAIN(control_dw)) { + ib->ib = data; + ib->num_dw = G_3F2_IB_SIZE(control_dw); + ib->cur_dw = 0; + return; + } + + struct ac_ib_parser ib_recurse; + memcpy(&ib_recurse, ib, sizeof(ib_recurse)); + ib_recurse.ib = data; + ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw); + ib_recurse.cur_dw = 0; + if (ib_recurse.trace_id_count) { + if (*current_trace_id == *ib->trace_ids) { + ++ib_recurse.trace_ids; + --ib_recurse.trace_id_count; + } else { + ib_recurse.trace_id_count = 0; + } + } + + fprintf(f, "\n\035>------------------ nested begin ------------------\n"); + ac_do_parse_ib(f, &ib_recurse); + fprintf(f, "\n\035<------------------- nested end -------------------\n"); + break; + } + case PKT3_CLEAR_STATE: + case PKT3_INCREMENT_DE_COUNTER: + case PKT3_PFP_SYNC_ME: + break; + case PKT3_NOP: + if (header == PKT3_NOP_PAD) { + count = -1; /* One dword NOP. */ + } else if (count == 0 && ib->cur_dw < ib->num_dw && AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) { + unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]); + + print_spaces(f, INDENT_PKT); + fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id); + + if (!ib->trace_id_count) + break; /* tracing was disabled */ + + *current_trace_id = packet_id; + + print_spaces(f, INDENT_PKT); + if (packet_id < *ib->trace_ids) + fprintf(f, COLOR_RED "This trace point was reached by the CP." COLOR_RESET "\n"); + else if (packet_id == *ib->trace_ids) + fprintf(f, COLOR_RED "!!!!! This is the last trace point that " + "was reached by the CP !!!!!" COLOR_RESET "\n"); + else if (packet_id + 1 == *ib->trace_ids) + fprintf(f, COLOR_RED "!!!!! This is the first trace point that " + "was NOT been reached by the CP !!!!!" COLOR_RESET "\n"); + else + fprintf(f, COLOR_RED "!!!!! This trace point was NOT reached " + "by the CP !!!!!" COLOR_RESET "\n"); + break; + } + break; + } + + /* print additional dwords */ + while (ib->cur_dw <= first_dw + count) + ac_ib_get(ib); + + if (ib->cur_dw > first_dw + count + 1) + fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!" 
COLOR_RESET "\n"); } /** @@ -517,65 +494,65 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, */ static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib) { - int current_trace_id = -1; - - while (ib->cur_dw < ib->num_dw) { - uint32_t header = ac_ib_get(ib); - unsigned type = PKT_TYPE_G(header); - - switch (type) { - case 3: - ac_parse_packet3(f, header, ib, ¤t_trace_id); - break; - case 2: - /* type-2 nop */ - if (header == 0x80000000) { - fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n"); - break; - } - /* fall through */ - default: - fprintf(f, "Unknown packet type %i\n", type); - break; - } - } + int current_trace_id = -1; + + while (ib->cur_dw < ib->num_dw) { + uint32_t header = ac_ib_get(ib); + unsigned type = PKT_TYPE_G(header); + + switch (type) { + case 3: + ac_parse_packet3(f, header, ib, ¤t_trace_id); + break; + case 2: + /* type-2 nop */ + if (header == 0x80000000) { + fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n"); + break; + } + /* fall through */ + default: + fprintf(f, "Unknown packet type %i\n", type); + break; + } + } } static void format_ib_output(FILE *f, char *out) { - unsigned depth = 0; + unsigned depth = 0; - for (;;) { - char op = 0; + for (;;) { + char op = 0; - if (out[0] == '\n' && out[1] == '\035') - out++; - if (out[0] == '\035') { - op = out[1]; - out += 2; - } + if (out[0] == '\n' && out[1] == '\035') + out++; + if (out[0] == '\035') { + op = out[1]; + out += 2; + } - if (op == '<') - depth--; + if (op == '<') + depth--; - unsigned indent = 4 * depth; - if (op != '#') - indent += 9; + unsigned indent = 4 * depth; + if (op != '#') + indent += 9; - if (indent) - print_spaces(f, indent); + if (indent) + print_spaces(f, indent); - char *end = strchrnul(out, '\n'); - fwrite(out, end - out, 1, f); - fputc('\n', f); /* always end with a new line */ - if (!*end) - break; + char *end = strchrnul(out, '\n'); + fwrite(out, end - out, 1, f); + fputc('\n', f); /* always end with a new line */ + if (!*end) + break; - out = end + 1; + out = end + 1; - if (op == '>') - depth++; - } + if (op == '>') + depth++; + } } /** @@ -593,34 +570,34 @@ static void format_ib_output(FILE *f, char *out) * \param addr_callback_data user data for addr_callback */ void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_ids, - unsigned trace_id_count, enum chip_class chip_class, + unsigned trace_id_count, enum chip_class chip_class, ac_debug_addr_callback addr_callback, void *addr_callback_data) { - struct ac_ib_parser ib = {}; - ib.ib = ib_ptr; - ib.num_dw = num_dw; - ib.trace_ids = trace_ids; - ib.trace_id_count = trace_id_count; - ib.chip_class = chip_class; - ib.addr_callback = addr_callback; - ib.addr_callback_data = addr_callback_data; - - char *out; - size_t outsize; - FILE *memf = open_memstream(&out, &outsize); - ib.f = memf; - ac_do_parse_ib(memf, &ib); - fclose(memf); - - if (out) { - format_ib_output(f, out); - free(out); - } - - if (ib.cur_dw > ib.num_dw) { - printf("\nPacket ends after the end of IB.\n"); - exit(1); - } + struct ac_ib_parser ib = {}; + ib.ib = ib_ptr; + ib.num_dw = num_dw; + ib.trace_ids = trace_ids; + ib.trace_id_count = trace_id_count; + ib.chip_class = chip_class; + ib.addr_callback = addr_callback; + ib.addr_callback_data = addr_callback_data; + + char *out; + size_t outsize; + FILE *memf = open_memstream(&out, &outsize); + ib.f = memf; + ac_do_parse_ib(memf, &ib); + fclose(memf); + + if (out) { + format_ib_output(f, out); + free(out); + } + + if (ib.cur_dw > ib.num_dw) { + 
printf("\nPacket ends after the end of IB.\n"); + exit(1); + } } /** @@ -637,17 +614,16 @@ void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_i * be NULL. * \param addr_callback_data user data for addr_callback */ -void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, - unsigned trace_id_count, const char *name, - enum chip_class chip_class, ac_debug_addr_callback addr_callback, - void *addr_callback_data) +void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count, + const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback, + void *addr_callback_data) { - fprintf(f, "------------------ %s begin ------------------\n", name); + fprintf(f, "------------------ %s begin ------------------\n", name); - ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count, - chip_class, addr_callback, addr_callback_data); + ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count, chip_class, addr_callback, + addr_callback_data); - fprintf(f, "------------------- %s end -------------------\n\n", name); + fprintf(f, "------------------- %s end -------------------\n\n", name); } /** @@ -657,179 +633,176 @@ void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, * \param old_dmesg_timestamp previous dmesg timestamp parsed at init time * \param out_addr detected VM fault addr */ -bool ac_vm_fault_occured(enum chip_class chip_class, - uint64_t *old_dmesg_timestamp, uint64_t *out_addr) +bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp, + uint64_t *out_addr) { - char line[2000]; - unsigned sec, usec; - int progress = 0; - uint64_t dmesg_timestamp = 0; - bool fault = false; - - FILE *p = popen("dmesg", "r"); - if (!p) - return false; - - while (fgets(line, sizeof(line), p)) { - char *msg, len; - - if (!line[0] || line[0] == '\n') - continue; - - /* Get the timestamp. */ - if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) { - static bool hit = false; - if (!hit) { - fprintf(stderr, "%s: failed to parse line '%s'\n", - __func__, line); - hit = true; - } - continue; - } - dmesg_timestamp = sec * 1000000ull + usec; - - /* If just updating the timestamp. */ - if (!out_addr) - continue; - - /* Process messages only if the timestamp is newer. */ - if (dmesg_timestamp <= *old_dmesg_timestamp) - continue; - - /* Only process the first VM fault. */ - if (fault) - continue; - - /* Remove trailing \n */ - len = strlen(line); - if (len && line[len-1] == '\n') - line[len-1] = 0; - - /* Get the message part. 
*/ - msg = strchr(line, ']'); - if (!msg) - continue; - msg++; - - const char *header_line, *addr_line_prefix, *addr_line_format; - - if (chip_class >= GFX9) { - /* Match this: - * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0) - * ..: at page 0x0000000219f8f000 from 27 - * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C - */ - header_line = "VMC page fault"; - addr_line_prefix = " at page"; - addr_line_format = "%"PRIx64; - } else { - header_line = "GPU fault detected:"; - addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR"; - addr_line_format = "%"PRIX64; - } - - switch (progress) { - case 0: - if (strstr(msg, header_line)) - progress = 1; - break; - case 1: - msg = strstr(msg, addr_line_prefix); - if (msg) { - msg = strstr(msg, "0x"); - if (msg) { - msg += 2; - if (sscanf(msg, addr_line_format, out_addr) == 1) - fault = true; - } - } - progress = 0; - break; - default: - progress = 0; - } - } - pclose(p); - - if (dmesg_timestamp > *old_dmesg_timestamp) - *old_dmesg_timestamp = dmesg_timestamp; - - return fault; + char line[2000]; + unsigned sec, usec; + int progress = 0; + uint64_t dmesg_timestamp = 0; + bool fault = false; + + FILE *p = popen("dmesg", "r"); + if (!p) + return false; + + while (fgets(line, sizeof(line), p)) { + char *msg, len; + + if (!line[0] || line[0] == '\n') + continue; + + /* Get the timestamp. */ + if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) { + static bool hit = false; + if (!hit) { + fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line); + hit = true; + } + continue; + } + dmesg_timestamp = sec * 1000000ull + usec; + + /* If just updating the timestamp. */ + if (!out_addr) + continue; + + /* Process messages only if the timestamp is newer. */ + if (dmesg_timestamp <= *old_dmesg_timestamp) + continue; + + /* Only process the first VM fault. */ + if (fault) + continue; + + /* Remove trailing \n */ + len = strlen(line); + if (len && line[len - 1] == '\n') + line[len - 1] = 0; + + /* Get the message part. */ + msg = strchr(line, ']'); + if (!msg) + continue; + msg++; + + const char *header_line, *addr_line_prefix, *addr_line_format; + + if (chip_class >= GFX9) { + /* Match this: + * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0) + * ..: at page 0x0000000219f8f000 from 27 + * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C + */ + header_line = "VMC page fault"; + addr_line_prefix = " at page"; + addr_line_format = "%" PRIx64; + } else { + header_line = "GPU fault detected:"; + addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR"; + addr_line_format = "%" PRIX64; + } + + switch (progress) { + case 0: + if (strstr(msg, header_line)) + progress = 1; + break; + case 1: + msg = strstr(msg, addr_line_prefix); + if (msg) { + msg = strstr(msg, "0x"); + if (msg) { + msg += 2; + if (sscanf(msg, addr_line_format, out_addr) == 1) + fault = true; + } + } + progress = 0; + break; + default: + progress = 0; + } + } + pclose(p); + + if (dmesg_timestamp > *old_dmesg_timestamp) + *old_dmesg_timestamp = dmesg_timestamp; + + return fault; } static int compare_wave(const void *p1, const void *p2) { - struct ac_wave_info *w1 = (struct ac_wave_info *)p1; - struct ac_wave_info *w2 = (struct ac_wave_info *)p2; - - /* Sort waves according to PC and then SE, SH, CU, etc. 
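Sorting by PC first groups together waves that are executing the same shader.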
*/ - if (w1->pc < w2->pc) - return -1; - if (w1->pc > w2->pc) - return 1; - if (w1->se < w2->se) - return -1; - if (w1->se > w2->se) - return 1; - if (w1->sh < w2->sh) - return -1; - if (w1->sh > w2->sh) - return 1; - if (w1->cu < w2->cu) - return -1; - if (w1->cu > w2->cu) - return 1; - if (w1->simd < w2->simd) - return -1; - if (w1->simd > w2->simd) - return 1; - if (w1->wave < w2->wave) - return -1; - if (w1->wave > w2->wave) - return 1; - - return 0; + struct ac_wave_info *w1 = (struct ac_wave_info *)p1; + struct ac_wave_info *w2 = (struct ac_wave_info *)p2; + + /* Sort waves according to PC and then SE, SH, CU, etc. */ + if (w1->pc < w2->pc) + return -1; + if (w1->pc > w2->pc) + return 1; + if (w1->se < w2->se) + return -1; + if (w1->se > w2->se) + return 1; + if (w1->sh < w2->sh) + return -1; + if (w1->sh > w2->sh) + return 1; + if (w1->cu < w2->cu) + return -1; + if (w1->cu > w2->cu) + return 1; + if (w1->simd < w2->simd) + return -1; + if (w1->simd > w2->simd) + return 1; + if (w1->wave < w2->wave) + return -1; + if (w1->wave > w2->wave) + return 1; + + return 0; } /* Return wave information. "waves" should be a large enough array. */ unsigned ac_get_wave_info(enum chip_class chip_class, - struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]) + struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]) { - char line[2000], cmd[128]; - unsigned num_waves = 0; - - sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? "gfx_0.0.0" : "gfx"); - - FILE *p = popen(cmd, "r"); - if (!p) - return 0; - - if (!fgets(line, sizeof(line), p) || - strncmp(line, "SE", 2) != 0) { - pclose(p); - return 0; - } - - while (fgets(line, sizeof(line), p)) { - struct ac_wave_info *w; - uint32_t pc_hi, pc_lo, exec_hi, exec_lo; - - assert(num_waves < AC_MAX_WAVES_PER_CHIP); - w = &waves[num_waves]; - - if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", - &w->se, &w->sh, &w->cu, &w->simd, &w->wave, - &w->status, &pc_hi, &pc_lo, &w->inst_dw0, - &w->inst_dw1, &exec_hi, &exec_lo) == 12) { - w->pc = ((uint64_t)pc_hi << 32) | pc_lo; - w->exec = ((uint64_t)exec_hi << 32) | exec_lo; - w->matched = false; - num_waves++; - } - } - - qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave); - - pclose(p); - return num_waves; + char line[2000], cmd[128]; + unsigned num_waves = 0; + + sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? 
"gfx_0.0.0" : "gfx"); + + FILE *p = popen(cmd, "r"); + if (!p) + return 0; + + if (!fgets(line, sizeof(line), p) || strncmp(line, "SE", 2) != 0) { + pclose(p); + return 0; + } + + while (fgets(line, sizeof(line), p)) { + struct ac_wave_info *w; + uint32_t pc_hi, pc_lo, exec_hi, exec_lo; + + assert(num_waves < AC_MAX_WAVES_PER_CHIP); + w = &waves[num_waves]; + + if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", &w->se, &w->sh, &w->cu, &w->simd, + &w->wave, &w->status, &pc_hi, &pc_lo, &w->inst_dw0, &w->inst_dw1, &exec_hi, + &exec_lo) == 12) { + w->pc = ((uint64_t)pc_hi << 32) | pc_lo; + w->exec = ((uint64_t)exec_hi << 32) | exec_lo; + w->matched = false; + num_waves++; + } + } + + qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave); + + pclose(p); + return num_waves; } diff --git a/src/amd/common/ac_debug.h b/src/amd/common/ac_debug.h index e66abb9..72441f7 100644 --- a/src/amd/common/ac_debug.h +++ b/src/amd/common/ac_debug.h @@ -24,15 +24,15 @@ #ifndef AC_DEBUG_H #define AC_DEBUG_H +#include "amd_family.h" + +#include #include #include -#include - -#include "amd_family.h" -#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id) & 0xffff)) -#define AC_IS_TRACE_POINT(x) (((x) & 0xcafe0000) == 0xcafe0000) -#define AC_GET_TRACE_POINT_ID(x) ((x) & 0xffff) +#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id)&0xffff)) +#define AC_IS_TRACE_POINT(x) (((x)&0xcafe0000) == 0xcafe0000) +#define AC_GET_TRACE_POINT_ID(x) ((x)&0xffff) #define AC_MAX_WAVES_PER_CHIP (64 * 40) @@ -41,36 +41,36 @@ extern "C" { #endif struct ac_wave_info { - unsigned se; /* shader engine */ - unsigned sh; /* shader array */ - unsigned cu; /* compute unit */ - unsigned simd; - unsigned wave; - uint32_t status; - uint64_t pc; /* program counter */ - uint32_t inst_dw0; - uint32_t inst_dw1; - uint64_t exec; - bool matched; /* whether the wave is used by a currently-bound shader */ + unsigned se; /* shader engine */ + unsigned sh; /* shader array */ + unsigned cu; /* compute unit */ + unsigned simd; + unsigned wave; + uint32_t status; + uint64_t pc; /* program counter */ + uint32_t inst_dw0; + uint32_t inst_dw1; + uint64_t exec; + bool matched; /* whether the wave is used by a currently-bound shader */ }; typedef void *(*ac_debug_addr_callback)(void *data, uint64_t addr); const char *ac_get_register_name(enum chip_class chip_class, unsigned offset); -void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, - uint32_t value, uint32_t field_mask); +void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value, + uint32_t field_mask); void ac_parse_ib_chunk(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, - unsigned trace_id_count, enum chip_class chip_class, - ac_debug_addr_callback addr_callback, void *addr_callback_data); -void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, - unsigned trace_id_count, const char *name, enum chip_class chip_class, - ac_debug_addr_callback addr_callback, void *addr_callback_data); + unsigned trace_id_count, enum chip_class chip_class, + ac_debug_addr_callback addr_callback, void *addr_callback_data); +void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count, + const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback, + void *addr_callback_data); -bool ac_vm_fault_occured(enum chip_class chip_class, - uint64_t *old_dmesg_timestamp, uint64_t *out_addr); +bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp, + 
uint64_t *out_addr); unsigned ac_get_wave_info(enum chip_class chip_class, - struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]); + struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]); #ifdef __cplusplus } diff --git a/src/amd/common/ac_exp_param.h b/src/amd/common/ac_exp_param.h index b97ce81..ac8018c 100644 --- a/src/amd/common/ac_exp_param.h +++ b/src/amd/common/ac_exp_param.h @@ -25,16 +25,17 @@ #ifndef AC_EXP_PARAM_H #define AC_EXP_PARAM_H -enum { - /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */ - AC_EXP_PARAM_OFFSET_0 = 0, - AC_EXP_PARAM_OFFSET_31 = 31, - /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */ - AC_EXP_PARAM_DEFAULT_VAL_0000 = 64, - AC_EXP_PARAM_DEFAULT_VAL_0001, - AC_EXP_PARAM_DEFAULT_VAL_1110, - AC_EXP_PARAM_DEFAULT_VAL_1111, - AC_EXP_PARAM_UNDEFINED = 255, +enum +{ + /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */ + AC_EXP_PARAM_OFFSET_0 = 0, + AC_EXP_PARAM_OFFSET_31 = 31, + /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */ + AC_EXP_PARAM_DEFAULT_VAL_0000 = 64, + AC_EXP_PARAM_DEFAULT_VAL_0001, + AC_EXP_PARAM_DEFAULT_VAL_1110, + AC_EXP_PARAM_DEFAULT_VAL_1111, + AC_EXP_PARAM_UNDEFINED = 255, }; #endif diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index e6ed816..770737a 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -24,36 +24,34 @@ */ #include "ac_gpu_info.h" + #include "addrlib/src/amdgpu_asic_addr.h" +#include "drm-uapi/amdgpu_drm.h" #include "sid.h" - #include "util/macros.h" #include "util/u_math.h" +#include #include - #include -#include "drm-uapi/amdgpu_drm.h" -#include - -#define CIK_TILE_MODE_COLOR_2D 14 - -#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) -#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 -#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 -#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 +#define CIK_TILE_MODE_COLOR_2D 14 + +#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) +#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) { @@ -61,12 +59,12 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { case CIK__PIPE_CONFIG__ADDR_SURF_P2: - return 2; + return 2; case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16: case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16: case 
CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-		return 4;
+      return 4;
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
@@ -74,1239 +72,1162 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-		return 8;
+      return 8;
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-		return 16;
+      return 16;
    default:
-		fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
-		assert(!"this should never occur");
-		return 2;
+      fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
+      assert(!"this should never occur");
+      return 2;
    }
 }
 
 static bool has_syncobj(int fd)
 {
-	uint64_t value;
-	if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
-		return false;
-	return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
+      return false;
+   return value ? true : false;
 }
 
 static bool has_timeline_syncobj(int fd)
 {
-	uint64_t value;
-	if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
-		return false;
-	return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
+      return false;
+   return value ? true : false;
 }
 
 static uint64_t fix_vram_size(uint64_t size)
 {
-	/* The VRAM size is underreported, so we need to fix it, because
-	 * it's used to compute the number of memory modules for harvesting.
-	 */
-	return align64(size, 256*1024*1024);
+   /* The VRAM size is underreported, so we need to fix it, because
+    * it's used to compute the number of memory modules for harvesting.
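align64() rounds the reported size up to the next 256 MiB boundary, so e.g. a board reporting 8112 MiB of VRAM is treated as 8192 MiB; the harvesting heuristic later in this file, which divides VRAM by the TCC count, depends on that rounding. An illustrative check (with <assert.h>):

   /* 8112 MiB is not a 256 MiB multiple; align64() rounds it up to 8192 MiB. */
   assert(align64(8112ull * 1024 * 1024, 256 * 1024 * 1024) == 8192ull * 1024 * 1024);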
+ */ + return align64(size, 256 * 1024 * 1024); } -static uint32_t -get_l2_cache_size(enum radeon_family family) +static uint32_t get_l2_cache_size(enum radeon_family family) { - switch (family) { - case CHIP_KABINI: - case CHIP_STONEY: - return 128 * 1024; - case CHIP_OLAND: - case CHIP_HAINAN: - case CHIP_ICELAND: - return 256 * 1024; - case CHIP_PITCAIRN: - case CHIP_VERDE: - case CHIP_BONAIRE: - case CHIP_KAVERI: - case CHIP_POLARIS12: - case CHIP_CARRIZO: - return 512 * 1024; - case CHIP_TAHITI: - case CHIP_TONGA: - return 768 * 1024; - break; - case CHIP_HAWAII: - case CHIP_POLARIS11: - return 1024 * 1024; - case CHIP_FIJI: - case CHIP_POLARIS10: - return 2048 * 1024; - break; - default: - return 4096 * 1024; - } + switch (family) { + case CHIP_KABINI: + case CHIP_STONEY: + return 128 * 1024; + case CHIP_OLAND: + case CHIP_HAINAN: + case CHIP_ICELAND: + return 256 * 1024; + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_BONAIRE: + case CHIP_KAVERI: + case CHIP_POLARIS12: + case CHIP_CARRIZO: + return 512 * 1024; + case CHIP_TAHITI: + case CHIP_TONGA: + return 768 * 1024; + break; + case CHIP_HAWAII: + case CHIP_POLARIS11: + return 1024 * 1024; + case CHIP_FIJI: + case CHIP_POLARIS10: + return 2048 * 1024; + break; + default: + return 4096 * 1024; + } } -bool ac_query_gpu_info(int fd, void *dev_p, - struct radeon_info *info, - struct amdgpu_gpu_info *amdinfo) +bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, + struct amdgpu_gpu_info *amdinfo) { - struct drm_amdgpu_info_device device_info = {}; - struct amdgpu_buffer_size_alignments alignment_info = {}; - struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {}; - struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {}; - struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {}; - struct amdgpu_gds_resource_info gds = {}; - uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0; - int r, i, j; - amdgpu_device_handle dev = dev_p; - drmDevicePtr devinfo; - - /* Get PCI info. */ - r = drmGetDevice2(fd, 0, &devinfo); - if (r) { - fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n"); - return false; - } - info->pci_domain = devinfo->businfo.pci->domain; - info->pci_bus = devinfo->businfo.pci->bus; - info->pci_dev = devinfo->businfo.pci->dev; - info->pci_func = devinfo->businfo.pci->func; - drmFreeDevice(&devinfo); - - assert(info->drm_major == 3); - info->is_amdgpu = true; - - /* Query hardware and driver information. 
*/ - r = amdgpu_query_gpu_info(dev, amdinfo); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n"); - return false; - } - - r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), - &device_info); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n"); - return false; - } - - r = amdgpu_query_buffer_size_alignment(dev, &alignment_info); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n"); - return false; - } - - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n"); - return false; - } - - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n"); - return false; - } - - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n"); - return false; - } - - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n"); - return false; - } - - if (info->drm_minor >= 17) { - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n"); - return false; - } - } - - if (info->drm_minor >= 17) { - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n"); - return false; - } - } - - if (info->drm_minor >= 17) { - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n"); - return false; - } - } - - if (info->drm_minor >= 27) { - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n"); - return false; - } - } - - r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, - &info->me_fw_version, - &info->me_fw_feature); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n"); - return false; - } - - r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, - &info->pfp_fw_version, - &info->pfp_fw_feature); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n"); - return false; - } - - r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, - &info->ce_fw_version, - &info->ce_fw_feature); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n"); - return false; - } - - r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, - &uvd_version, &uvd_feature); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n"); - return false; - } - - r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n"); - return false; - } - - r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, - &vce_version, &vce_feature); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n"); - return false; - } - - r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n"); - return false; - } - - r = amdgpu_query_gds_info(dev, &gds); - if (r) { - fprintf(stderr, "amdgpu: 
amdgpu_query_gds_info failed.\n"); - return false; - } - - if (info->drm_minor >= 9) { - struct drm_amdgpu_memory_info meminfo = {}; - - r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n"); - return false; - } - - /* Note: usable_heap_size values can be random and can't be relied on. */ - info->gart_size = meminfo.gtt.total_heap_size; - info->vram_size = fix_vram_size(meminfo.vram.total_heap_size); - info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size; - } else { - /* This is a deprecated interface, which reports usable sizes - * (total minus pinned), but the pinned size computation is - * buggy, so the values returned from these functions can be - * random. - */ - struct amdgpu_heap_info vram, vram_vis, gtt; - - r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n"); - return false; - } - - r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, - AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, - &vram_vis); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n"); - return false; - } - - r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, >t); - if (r) { - fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n"); - return false; - } - - info->gart_size = gtt.heap_size; - info->vram_size = fix_vram_size(vram.heap_size); - info->vram_vis_size = vram_vis.heap_size; - } - - /* Set chip identification. */ - info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */ - info->pci_rev_id = amdinfo->pci_rev_id; - info->vce_harvest_config = amdinfo->vce_harvest_config; - -#define identify_chip2(asic, chipname) \ - if (ASICREV_IS(amdinfo->chip_external_rev, asic)) { \ - info->family = CHIP_##chipname; \ - info->name = #chipname; \ - } + struct drm_amdgpu_info_device device_info = {}; + struct amdgpu_buffer_size_alignments alignment_info = {}; + struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {}; + struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {}; + struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {}; + struct amdgpu_gds_resource_info gds = {}; + uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0; + int r, i, j; + amdgpu_device_handle dev = dev_p; + drmDevicePtr devinfo; + + /* Get PCI info. */ + r = drmGetDevice2(fd, 0, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n"); + return false; + } + info->pci_domain = devinfo->businfo.pci->domain; + info->pci_bus = devinfo->businfo.pci->bus; + info->pci_dev = devinfo->businfo.pci->dev; + info->pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); + + assert(info->drm_major == 3); + info->is_amdgpu = true; + + /* Query hardware and driver information. 
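Every query in the rewritten function body that follows keeps the same fail-fast shape: issue the libdrm wrapper, and on a nonzero return log which source failed and bail out with false. A hypothetical helper macro, not part of the patch, that would capture the pattern:

   #define QUERY_OR_FAIL(expr, what)                            \
      do {                                                      \
         if ((expr)) {                                          \
            fprintf(stderr, "amdgpu: %s failed.\n", (what));    \
            return false;                                       \
         }                                                      \
      } while (0)

   /* usage sketch: QUERY_OR_FAIL(amdgpu_query_gpu_info(dev, amdinfo), "amdgpu_query_gpu_info"); */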
*/ + r = amdgpu_query_gpu_info(dev, amdinfo); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n"); + return false; + } + + r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n"); + return false; + } + + r = amdgpu_query_buffer_size_alignment(dev, &alignment_info); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n"); + return false; + } + + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n"); + return false; + } + + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n"); + return false; + } + + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n"); + return false; + } + + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n"); + return false; + } + + if (info->drm_minor >= 17) { + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n"); + return false; + } + } + + if (info->drm_minor >= 17) { + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n"); + return false; + } + } + + if (info->drm_minor >= 17) { + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n"); + return false; + } + } + + if (info->drm_minor >= 27) { + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n"); + return false; + } + } + + r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version, + &info->me_fw_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n"); + return false; + } + + r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version, + &info->pfp_fw_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n"); + return false; + } + + r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, &info->ce_fw_version, + &info->ce_fw_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n"); + return false; + } + + r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &uvd_version, &uvd_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n"); + return false; + } + + r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n"); + return false; + } + + r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vce_version, &vce_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n"); + return false; + } + + r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n"); + return false; + } + + r = amdgpu_query_gds_info(dev, &gds); + if (r) { + fprintf(stderr, "amdgpu: 
amdgpu_query_gds_info failed.\n"); + return false; + } + + if (info->drm_minor >= 9) { + struct drm_amdgpu_memory_info meminfo = {}; + + r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n"); + return false; + } + + /* Note: usable_heap_size values can be random and can't be relied on. */ + info->gart_size = meminfo.gtt.total_heap_size; + info->vram_size = fix_vram_size(meminfo.vram.total_heap_size); + info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size; + } else { + /* This is a deprecated interface, which reports usable sizes + * (total minus pinned), but the pinned size computation is + * buggy, so the values returned from these functions can be + * random. + */ + struct amdgpu_heap_info vram, vram_vis, gtt; + + r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n"); + return false; + } + + r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, + &vram_vis); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n"); + return false; + } + + r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, >t); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n"); + return false; + } + + info->gart_size = gtt.heap_size; + info->vram_size = fix_vram_size(vram.heap_size); + info->vram_vis_size = vram_vis.heap_size; + } + + /* Set chip identification. */ + info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */ + info->pci_rev_id = amdinfo->pci_rev_id; + info->vce_harvest_config = amdinfo->vce_harvest_config; + +#define identify_chip2(asic, chipname) \ + if (ASICREV_IS(amdinfo->chip_external_rev, asic)) { \ + info->family = CHIP_##chipname; \ + info->name = #chipname; \ + } #define identify_chip(chipname) identify_chip2(chipname, chipname) - switch (amdinfo->family_id) { - case FAMILY_SI: - identify_chip(TAHITI); - identify_chip(PITCAIRN); - identify_chip2(CAPEVERDE, VERDE); - identify_chip(OLAND); - identify_chip(HAINAN); - break; - case FAMILY_CI: - identify_chip(BONAIRE); - identify_chip(HAWAII); - break; - case FAMILY_KV: - identify_chip2(SPECTRE, KAVERI); - identify_chip2(SPOOKY, KAVERI); - identify_chip2(KALINDI, KABINI); - identify_chip2(GODAVARI, KABINI); - break; - case FAMILY_VI: - identify_chip(ICELAND); - identify_chip(TONGA); - identify_chip(FIJI); - identify_chip(POLARIS10); - identify_chip(POLARIS11); - identify_chip(POLARIS12); - identify_chip(VEGAM); - break; - case FAMILY_CZ: - identify_chip(CARRIZO); - identify_chip(STONEY); - break; - case FAMILY_AI: - identify_chip(VEGA10); - identify_chip(VEGA12); - identify_chip(VEGA20); - identify_chip(ARCTURUS); - break; - case FAMILY_RV: - identify_chip(RAVEN); - identify_chip(RAVEN2); - identify_chip(RENOIR); - break; - case FAMILY_NV: - identify_chip(NAVI10); - identify_chip(NAVI12); - identify_chip(NAVI14); - identify_chip(SIENNA_CICHLID); - identify_chip(NAVY_FLOUNDER); - break; - } - - if (!info->name) { - fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n", - amdinfo->family_id, amdinfo->chip_external_rev); - return false; - } - - if (info->family >= CHIP_SIENNA_CICHLID) - info->chip_class = GFX10_3; - else if (info->family >= CHIP_NAVI10) - info->chip_class = GFX10; - else if (info->family >= CHIP_VEGA10) - info->chip_class = GFX9; - else if (info->family >= CHIP_TONGA) - info->chip_class = GFX8; - else if 
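identify_chip2() relies on token pasting and stringification: ASICREV_IS() checks the external chip revision range, and a match fills in both the family enum and the printable name. For example, identify_chip2(CAPEVERDE, VERDE) expands to roughly:

   if (ASICREV_IS(amdinfo->chip_external_rev, CAPEVERDE)) {
      info->family = CHIP_VERDE;
      info->name = "VERDE";
   }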
(info->family >= CHIP_BONAIRE) - info->chip_class = GFX7; - else if (info->family >= CHIP_TAHITI) - info->chip_class = GFX6; - else { - fprintf(stderr, "amdgpu: Unknown family.\n"); - return false; - } - - info->family_id = amdinfo->family_id; - info->chip_external_rev = amdinfo->chip_external_rev; - info->marketing_name = amdgpu_get_marketing_name(dev); - info->is_pro_graphics = info->marketing_name && - (!strcmp(info->marketing_name, "Pro") || - !strcmp(info->marketing_name, "PRO") || - !strcmp(info->marketing_name, "Frontier")); - - /* Set which chips have dedicated VRAM. */ - info->has_dedicated_vram = - !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION); - - /* The kernel can split large buffers in VRAM but not in GTT, so large - * allocations can fail or cause buffer movement failures in the kernel. - */ - if (info->has_dedicated_vram) - info->max_alloc_size = info->vram_size * 0.8; - else - info->max_alloc_size = info->gart_size * 0.7; - - info->vram_type = amdinfo->vram_type; - info->vram_bit_width = amdinfo->vram_bit_width; - info->ce_ram_size = amdinfo->ce_ram_size; - - info->l2_cache_size = get_l2_cache_size(info->family); - info->l1_cache_size = 16384; - - /* Set which chips have uncached device memory. */ - info->has_l2_uncached = info->chip_class >= GFX9; - - /* Set hardware information. */ - info->gds_size = gds.gds_total_size; - info->gds_gfx_partition_size = gds.gds_gfx_partition_size; - /* convert the shader/memory clocks from KHz to MHz */ - info->max_shader_clock = amdinfo->max_engine_clk / 1000; - info->max_memory_clock = amdinfo->max_memory_clk / 1000; - info->num_tcc_blocks = device_info.num_tcc_blocks; - info->max_se = amdinfo->num_shader_engines; - info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine; - info->has_hw_decode = - (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) || - (vcn_jpeg.available_rings != 0); - info->uvd_fw_version = - uvd.available_rings ? uvd_version : 0; - info->vce_fw_version = - vce.available_rings ? vce_version : 0; - info->uvd_enc_supported = - uvd_enc.available_rings ? true : false; - info->has_userptr = true; - info->has_syncobj = has_syncobj(fd); - info->has_timeline_syncobj = has_timeline_syncobj(fd); - info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20; - info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21; - info->has_ctx_priority = info->drm_minor >= 22; - info->has_local_buffers = info->drm_minor >= 20; - info->kernel_flushes_hdp_before_ib = true; - info->htile_cmask_support_1d_tiling = true; - info->si_TA_CS_BC_BASE_ADDR_allowed = true; - info->has_bo_metadata = true; - info->has_gpu_reset_status_query = true; - info->has_eqaa_surface_allocator = true; - info->has_format_bc1_through_bc7 = true; - /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */ - info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 || - info->drm_minor >= 2; - info->has_indirect_compute_dispatch = true; - /* GFX6 doesn't support unaligned loads. */ - info->has_unaligned_shader_loads = info->chip_class != GFX6; - /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once - * these faults are mitigated in software. 
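Concretely, with the factors above a dGPU reporting 8 GiB of dedicated VRAM gets max_alloc_size of roughly 6.4 GiB (0.8 * vram_size), while an APU with a 16 GiB GTT heap gets roughly 11.2 GiB (0.7 * gart_size). The headroom exists because, as the comment says, the kernel can split large buffers in VRAM but not in GTT, so allocations close to the full heap size can fail or trigger buffer-movement failures.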
- */ - info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13; - info->has_2d_tiling = true; - info->has_read_registers_query = true; - info->has_scheduled_fence_dependency = info->drm_minor >= 28; - info->mid_command_buffer_preemption_enabled = - amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION; - - info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override; - info->num_render_backends = amdinfo->rb_pipes; - /* The value returned by the kernel driver was wrong. */ - if (info->family == CHIP_KAVERI) - info->num_render_backends = 2; - - info->clock_crystal_freq = amdinfo->gpu_counter_freq; - if (!info->clock_crystal_freq) { - fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n"); - info->clock_crystal_freq = 1; - } - if (info->chip_class >= GFX10) { - info->tcc_cache_line_size = 128; - - if (info->drm_minor >= 35) { - info->tcc_harvested = device_info.tcc_disabled_mask != 0; - } else { - /* This is a hack, but it's all we can do without a kernel upgrade. */ - info->tcc_harvested = - (info->vram_size / info->num_tcc_blocks) != 512*1024*1024; - } - } else { - info->tcc_cache_line_size = 64; - } - info->gb_addr_config = amdinfo->gb_addr_cfg; - if (info->chip_class >= GFX9) { - info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg); - info->pipe_interleave_bytes = - 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg); - } else { - info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo); - info->pipe_interleave_bytes = - 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg); - } - info->r600_has_virtual_memory = true; - - /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above - * 16KB makes some SIMDs unoccupied). - * - * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. - */ - info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024; - info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4; - - assert(util_is_power_of_two_or_zero(dma.available_rings + 1)); - assert(util_is_power_of_two_or_zero(compute.available_rings + 1)); - - info->has_graphics = gfx.available_rings > 0; - info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings); - info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings); - info->num_rings[RING_DMA] = util_bitcount(dma.available_rings); - info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings); - info->num_rings[RING_VCE] = util_bitcount(vce.available_rings); - info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings); - info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings); - info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings); - info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings); - - /* This is "align_mask" copied from the kernel, maximums of all IP versions. */ - info->ib_pad_dw_mask[RING_GFX] = 0xff; - info->ib_pad_dw_mask[RING_COMPUTE] = 0xff; - info->ib_pad_dw_mask[RING_DMA] = 0xf; - info->ib_pad_dw_mask[RING_UVD] = 0xf; - info->ib_pad_dw_mask[RING_VCE] = 0x3f; - info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f; - info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf; - info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f; - info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf; - - /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs - * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc. - * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. 
- */ - info->has_clear_state = info->chip_class >= GFX7; - - info->has_distributed_tess = info->chip_class >= GFX10 || - (info->chip_class >= GFX8 && info->max_se >= 2); - - info->has_dcc_constant_encode = info->family == CHIP_RAVEN2 || - info->family == CHIP_RENOIR || - info->chip_class >= GFX10; - - info->has_rbplus = info->family == CHIP_STONEY || - info->chip_class >= GFX9; - - /* Some chips have RB+ registers, but don't support RB+. Those must - * always disable it. - */ - info->rbplus_allowed = info->has_rbplus && - (info->family == CHIP_STONEY || - info->family == CHIP_VEGA12 || - info->family == CHIP_RAVEN || - info->family == CHIP_RAVEN2 || - info->family == CHIP_RENOIR || - info->chip_class >= GFX10_3); - - info->has_out_of_order_rast = info->chip_class >= GFX8 && - info->chip_class <= GFX9 && - info->max_se >= 2; - - /* Whether chips support double rate packed math instructions. */ - info->has_packed_math_16bit = info->chip_class >= GFX9; - - /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ - info->has_load_ctx_reg_pkt = info->chip_class >= GFX9 || - (info->chip_class >= GFX8 && - info->me_fw_feature >= 41); - - info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8; - - info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || - info->family == CHIP_RAVEN; - - info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 && - info->chip_class <= GFX9; - - info->has_msaa_sample_loc_bug = (info->family >= CHIP_POLARIS10 && - info->family <= CHIP_POLARIS12) || - info->family == CHIP_VEGA10 || - info->family == CHIP_RAVEN; - - info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || - info->family == CHIP_RAVEN; - - /* Get the number of good compute units. */ - info->num_good_compute_units = 0; - for (i = 0; i < info->max_se; i++) { - for (j = 0; j < info->max_sh_per_se; j++) { - /* - * The cu bitmap in amd gpu info structure is - * 4x4 size array, and it's usually suitable for Vega - * ASICs which has 4*2 SE/SH layout. - * But for Arcturus, SE/SH layout is changed to 8*1. - * To mostly reduce the impact, we make it compatible - * with current bitmap array as below: - * SE4,SH0 --> cu_bitmap[0][1] - * SE5,SH0 --> cu_bitmap[1][1] - * SE6,SH0 --> cu_bitmap[2][1] - * SE7,SH0 --> cu_bitmap[3][1] - */ - info->cu_mask[i%4][j+i/4] = amdinfo->cu_bitmap[i%4][j+i/4]; - info->num_good_compute_units += - util_bitcount(info->cu_mask[i][j]); - } - } - - /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled, - * and max - min <= 2. - */ - unsigned cu_group = info->chip_class >= GFX10 ? 
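A worked example for the WGP-granular rounding in this hunk, assuming a hypothetical GFX10 part with 36 enabled CUs, 2 SEs, and 2 SAs per SE (so cu_group = 2): max_good_cu_per_sa = DIV_ROUND_UP(36, 2 * 2 * 2) * 2 = 5 * 2 = 10, and min_good_cu_per_sa = (36 / (2 * 2 * 2)) * 2 = 4 * 2 = 8, which respects the stated max - min <= 2 bound.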
2 : 1; - info->max_good_cu_per_sa = DIV_ROUND_UP(info->num_good_compute_units, - (info->max_se * info->max_sh_per_se * cu_group)) * cu_group; - info->min_good_cu_per_sa = (info->num_good_compute_units / - (info->max_se * info->max_sh_per_se * cu_group)) * cu_group; - - memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode, - sizeof(amdinfo->gb_tile_mode)); - info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask; - - memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode, - sizeof(amdinfo->gb_macro_tile_mode)); - - info->pte_fragment_size = alignment_info.size_local; - info->gart_page_size = alignment_info.size_remote; - - if (info->chip_class == GFX6) - info->gfx_ib_pad_with_type2 = true; - - unsigned ib_align = 0; - ib_align = MAX2(ib_align, gfx.ib_start_alignment); - ib_align = MAX2(ib_align, gfx.ib_size_alignment); - ib_align = MAX2(ib_align, compute.ib_start_alignment); - ib_align = MAX2(ib_align, compute.ib_size_alignment); - ib_align = MAX2(ib_align, dma.ib_start_alignment); - ib_align = MAX2(ib_align, dma.ib_size_alignment); - ib_align = MAX2(ib_align, uvd.ib_start_alignment); - ib_align = MAX2(ib_align, uvd.ib_size_alignment); - ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment); - ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment); - ib_align = MAX2(ib_align, vce.ib_start_alignment); - ib_align = MAX2(ib_align, vce.ib_size_alignment); - ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment); - ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment); - ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment); - ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment); - ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment); - ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment); - /* GFX10 and maybe GFX9 need this alignment for cache coherency. */ - if (info->chip_class >= GFX9) - ib_align = MAX2(ib_align, info->tcc_cache_line_size); - /* The kernel pads gfx and compute IBs to 256 dwords since: - * 66f3b2d527154bd258a57c8815004b5964aa1cf5 - * Do the same. - */ - ib_align = MAX2(ib_align, 1024); - info->ib_alignment = ib_align; - - if ((info->drm_minor >= 31 && - (info->family == CHIP_RAVEN || - info->family == CHIP_RAVEN2 || - info->family == CHIP_RENOIR)) || - (info->drm_minor >= 34 && - (info->family == CHIP_NAVI12 || - info->family == CHIP_NAVI14)) || - info->chip_class >= GFX10_3) { - if (info->num_render_backends == 1) - info->use_display_dcc_unaligned = true; - else - info->use_display_dcc_with_retile_blit = true; - } - - info->has_gds_ordered_append = info->chip_class >= GFX7 && - info->drm_minor >= 29; - - if (info->chip_class >= GFX9) { - unsigned pc_lines = 0; - - switch (info->family) { - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - pc_lines = 2048; - break; - case CHIP_RAVEN: - case CHIP_RAVEN2: - case CHIP_RENOIR: - case CHIP_NAVI10: - case CHIP_NAVI12: - case CHIP_SIENNA_CICHLID: - case CHIP_NAVY_FLOUNDER: - pc_lines = 1024; - break; - case CHIP_NAVI14: - pc_lines = 512; - break; - case CHIP_ARCTURUS: - break; - default: - assert(0); - } - - info->pc_lines = pc_lines; - - if (info->chip_class >= GFX10) { - info->pbb_max_alloc_count = pc_lines / 3; - } else { - info->pbb_max_alloc_count = - MIN2(128, pc_lines / (4 * info->max_se)); - } - } - - /* The number of SDPs is the same as the number of TCCs for now. 
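Plugging numbers into the binning setup above: a Navi10-class GFX10 part with pc_lines = 1024 gets pbb_max_alloc_count = 1024 / 3 = 341, while Vega10 (pc_lines = 2048, 4 SEs) takes the GFX9 path, MIN2(128, 2048 / (4 * 4)) = MIN2(128, 128) = 128.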
*/ - if (info->chip_class >= GFX10) - info->num_sdp_interfaces = device_info.num_tcc_blocks; - - if (info->chip_class >= GFX10_3) - info->max_wave64_per_simd = 16; - else if (info->chip_class == GFX10) - info->max_wave64_per_simd = 20; - else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM) - info->max_wave64_per_simd = 8; - else - info->max_wave64_per_simd = 10; - - if (info->chip_class >= GFX10) { - info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd; - info->min_sgpr_alloc = 128; - info->sgpr_alloc_granularity = 128; - /* Don't use late alloc on small chips. */ - info->use_late_alloc = info->num_render_backends > 4; - } else if (info->chip_class >= GFX8) { - info->num_physical_sgprs_per_simd = 800; - info->min_sgpr_alloc = 16; - info->sgpr_alloc_granularity = 16; - info->use_late_alloc = true; - } else { - info->num_physical_sgprs_per_simd = 512; - info->min_sgpr_alloc = 8; - info->sgpr_alloc_granularity = 8; - /* Potential hang on Kabini: */ - info->use_late_alloc = info->family != CHIP_KABINI; - } - - info->max_sgpr_alloc = info->family == CHIP_TONGA || - info->family == CHIP_ICELAND ? 96 : 104; - - info->min_wave64_vgpr_alloc = 4; - info->max_vgpr_alloc = 256; - info->wave64_vgpr_alloc_granularity = 4; - - info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256; - info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4; - - return true; + switch (amdinfo->family_id) { + case FAMILY_SI: + identify_chip(TAHITI); + identify_chip(PITCAIRN); + identify_chip2(CAPEVERDE, VERDE); + identify_chip(OLAND); + identify_chip(HAINAN); + break; + case FAMILY_CI: + identify_chip(BONAIRE); + identify_chip(HAWAII); + break; + case FAMILY_KV: + identify_chip2(SPECTRE, KAVERI); + identify_chip2(SPOOKY, KAVERI); + identify_chip2(KALINDI, KABINI); + identify_chip2(GODAVARI, KABINI); + break; + case FAMILY_VI: + identify_chip(ICELAND); + identify_chip(TONGA); + identify_chip(FIJI); + identify_chip(POLARIS10); + identify_chip(POLARIS11); + identify_chip(POLARIS12); + identify_chip(VEGAM); + break; + case FAMILY_CZ: + identify_chip(CARRIZO); + identify_chip(STONEY); + break; + case FAMILY_AI: + identify_chip(VEGA10); + identify_chip(VEGA12); + identify_chip(VEGA20); + identify_chip(ARCTURUS); + break; + case FAMILY_RV: + identify_chip(RAVEN); + identify_chip(RAVEN2); + identify_chip(RENOIR); + break; + case FAMILY_NV: + identify_chip(NAVI10); + identify_chip(NAVI12); + identify_chip(NAVI14); + identify_chip(SIENNA_CICHLID); + identify_chip(NAVY_FLOUNDER); + break; + } + + if (!info->name) { + fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n", + amdinfo->family_id, amdinfo->chip_external_rev); + return false; + } + + if (info->family >= CHIP_SIENNA_CICHLID) + info->chip_class = GFX10_3; + else if (info->family >= CHIP_NAVI10) + info->chip_class = GFX10; + else if (info->family >= CHIP_VEGA10) + info->chip_class = GFX9; + else if (info->family >= CHIP_TONGA) + info->chip_class = GFX8; + else if (info->family >= CHIP_BONAIRE) + info->chip_class = GFX7; + else if (info->family >= CHIP_TAHITI) + info->chip_class = GFX6; + else { + fprintf(stderr, "amdgpu: Unknown family.\n"); + return false; + } + + info->family_id = amdinfo->family_id; + info->chip_external_rev = amdinfo->chip_external_rev; + info->marketing_name = amdgpu_get_marketing_name(dev); + info->is_pro_graphics = info->marketing_name && (!strcmp(info->marketing_name, "Pro") || + !strcmp(info->marketing_name, "PRO") || + !strcmp(info->marketing_name, 
"Frontier")); + + /* Set which chips have dedicated VRAM. */ + info->has_dedicated_vram = !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION); + + /* The kernel can split large buffers in VRAM but not in GTT, so large + * allocations can fail or cause buffer movement failures in the kernel. + */ + if (info->has_dedicated_vram) + info->max_alloc_size = info->vram_size * 0.8; + else + info->max_alloc_size = info->gart_size * 0.7; + + info->vram_type = amdinfo->vram_type; + info->vram_bit_width = amdinfo->vram_bit_width; + info->ce_ram_size = amdinfo->ce_ram_size; + + info->l2_cache_size = get_l2_cache_size(info->family); + info->l1_cache_size = 16384; + + /* Set which chips have uncached device memory. */ + info->has_l2_uncached = info->chip_class >= GFX9; + + /* Set hardware information. */ + info->gds_size = gds.gds_total_size; + info->gds_gfx_partition_size = gds.gds_gfx_partition_size; + /* convert the shader/memory clocks from KHz to MHz */ + info->max_shader_clock = amdinfo->max_engine_clk / 1000; + info->max_memory_clock = amdinfo->max_memory_clk / 1000; + info->num_tcc_blocks = device_info.num_tcc_blocks; + info->max_se = amdinfo->num_shader_engines; + info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine; + info->has_hw_decode = (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) || + (vcn_jpeg.available_rings != 0); + info->uvd_fw_version = uvd.available_rings ? uvd_version : 0; + info->vce_fw_version = vce.available_rings ? vce_version : 0; + info->uvd_enc_supported = uvd_enc.available_rings ? true : false; + info->has_userptr = true; + info->has_syncobj = has_syncobj(fd); + info->has_timeline_syncobj = has_timeline_syncobj(fd); + info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20; + info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21; + info->has_ctx_priority = info->drm_minor >= 22; + info->has_local_buffers = info->drm_minor >= 20; + info->kernel_flushes_hdp_before_ib = true; + info->htile_cmask_support_1d_tiling = true; + info->si_TA_CS_BC_BASE_ADDR_allowed = true; + info->has_bo_metadata = true; + info->has_gpu_reset_status_query = true; + info->has_eqaa_surface_allocator = true; + info->has_format_bc1_through_bc7 = true; + /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */ + info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 || info->drm_minor >= 2; + info->has_indirect_compute_dispatch = true; + /* GFX6 doesn't support unaligned loads. */ + info->has_unaligned_shader_loads = info->chip_class != GFX6; + /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once + * these faults are mitigated in software. + */ + info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13; + info->has_2d_tiling = true; + info->has_read_registers_query = true; + info->has_scheduled_fence_dependency = info->drm_minor >= 28; + info->mid_command_buffer_preemption_enabled = amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION; + + info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override; + info->num_render_backends = amdinfo->rb_pipes; + /* The value returned by the kernel driver was wrong. 
*/ + if (info->family == CHIP_KAVERI) + info->num_render_backends = 2; + + info->clock_crystal_freq = amdinfo->gpu_counter_freq; + if (!info->clock_crystal_freq) { + fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n"); + info->clock_crystal_freq = 1; + } + if (info->chip_class >= GFX10) { + info->tcc_cache_line_size = 128; + + if (info->drm_minor >= 35) { + info->tcc_harvested = device_info.tcc_disabled_mask != 0; + } else { + /* This is a hack, but it's all we can do without a kernel upgrade. */ + info->tcc_harvested = (info->vram_size / info->num_tcc_blocks) != 512 * 1024 * 1024; + } + } else { + info->tcc_cache_line_size = 64; + } + info->gb_addr_config = amdinfo->gb_addr_cfg; + if (info->chip_class >= GFX9) { + info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg); + info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg); + } else { + info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo); + info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg); + } + info->r600_has_virtual_memory = true; + + /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above + * 16KB makes some SIMDs unoccupied). + * + * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used. + */ + info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024; + info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4; + + assert(util_is_power_of_two_or_zero(dma.available_rings + 1)); + assert(util_is_power_of_two_or_zero(compute.available_rings + 1)); + + info->has_graphics = gfx.available_rings > 0; + info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings); + info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings); + info->num_rings[RING_DMA] = util_bitcount(dma.available_rings); + info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings); + info->num_rings[RING_VCE] = util_bitcount(vce.available_rings); + info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings); + info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings); + info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings); + info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings); + + /* This is "align_mask" copied from the kernel, maximums of all IP versions. */ + info->ib_pad_dw_mask[RING_GFX] = 0xff; + info->ib_pad_dw_mask[RING_COMPUTE] = 0xff; + info->ib_pad_dw_mask[RING_DMA] = 0xf; + info->ib_pad_dw_mask[RING_UVD] = 0xf; + info->ib_pad_dw_mask[RING_VCE] = 0x3f; + info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f; + info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf; + info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f; + info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf; + + /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs + * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc. + * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. + */ + info->has_clear_state = info->chip_class >= GFX7; + + info->has_distributed_tess = + info->chip_class >= GFX10 || (info->chip_class >= GFX8 && info->max_se >= 2); + + info->has_dcc_constant_encode = + info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10; + + info->has_rbplus = info->family == CHIP_STONEY || info->chip_class >= GFX9; + + /* Some chips have RB+ registers, but don't support RB+. Those must + * always disable it. 
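The ib_pad_dw_mask values are dword alignment masks in the kernel's "align_mask" sense: 0xff pads gfx/compute IBs to a multiple of 256 dwords, 0x3f to 64 dwords, 0xf to 16. So, for example, a 500-dword gfx IB would be padded out to (500 + 0xff) & ~0xff = 512 dwords before submission.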
+ */ + info->rbplus_allowed = + info->has_rbplus && + (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN || + info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10_3); + + info->has_out_of_order_rast = + info->chip_class >= GFX8 && info->chip_class <= GFX9 && info->max_se >= 2; + + /* Whether chips support double rate packed math instructions. */ + info->has_packed_math_16bit = info->chip_class >= GFX9; + + /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ + info->has_load_ctx_reg_pkt = + info->chip_class >= GFX9 || (info->chip_class >= GFX8 && info->me_fw_feature >= 41); + + info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8; + + info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; + + info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 && info->chip_class <= GFX9; + + info->has_msaa_sample_loc_bug = + (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) || + info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; + + info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN; + + /* Get the number of good compute units. */ + info->num_good_compute_units = 0; + for (i = 0; i < info->max_se; i++) { + for (j = 0; j < info->max_sh_per_se; j++) { + /* + * The cu bitmap in amd gpu info structure is + * 4x4 size array, and it's usually suitable for Vega + * ASICs which has 4*2 SE/SH layout. + * But for Arcturus, SE/SH layout is changed to 8*1. + * To mostly reduce the impact, we make it compatible + * with current bitmap array as below: + * SE4,SH0 --> cu_bitmap[0][1] + * SE5,SH0 --> cu_bitmap[1][1] + * SE6,SH0 --> cu_bitmap[2][1] + * SE7,SH0 --> cu_bitmap[3][1] + */ + info->cu_mask[i % 4][j + i / 4] = amdinfo->cu_bitmap[i % 4][j + i / 4]; + info->num_good_compute_units += util_bitcount(info->cu_mask[i][j]); + } + } + + /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled, + * and max - min <= 2. + */ + unsigned cu_group = info->chip_class >= GFX10 ? 
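The [i % 4][j + i / 4] indexing implements exactly the remap table in the comment: for Arcturus' 8x1 SE/SA layout, SE5/SH0 means i = 5, j = 0, which lands in cu_mask[5 % 4][0 + 5 / 4] = cu_mask[1][1].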
2 : 1; + info->max_good_cu_per_sa = + DIV_ROUND_UP(info->num_good_compute_units, (info->max_se * info->max_sh_per_se * cu_group)) * + cu_group; + info->min_good_cu_per_sa = + (info->num_good_compute_units / (info->max_se * info->max_sh_per_se * cu_group)) * cu_group; + + memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode, sizeof(amdinfo->gb_tile_mode)); + info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask; + + memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode, + sizeof(amdinfo->gb_macro_tile_mode)); + + info->pte_fragment_size = alignment_info.size_local; + info->gart_page_size = alignment_info.size_remote; + + if (info->chip_class == GFX6) + info->gfx_ib_pad_with_type2 = true; + + unsigned ib_align = 0; + ib_align = MAX2(ib_align, gfx.ib_start_alignment); + ib_align = MAX2(ib_align, gfx.ib_size_alignment); + ib_align = MAX2(ib_align, compute.ib_start_alignment); + ib_align = MAX2(ib_align, compute.ib_size_alignment); + ib_align = MAX2(ib_align, dma.ib_start_alignment); + ib_align = MAX2(ib_align, dma.ib_size_alignment); + ib_align = MAX2(ib_align, uvd.ib_start_alignment); + ib_align = MAX2(ib_align, uvd.ib_size_alignment); + ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment); + ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment); + ib_align = MAX2(ib_align, vce.ib_start_alignment); + ib_align = MAX2(ib_align, vce.ib_size_alignment); + ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment); + ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment); + ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment); + ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment); + ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment); + ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment); + /* GFX10 and maybe GFX9 need this alignment for cache coherency. */ + if (info->chip_class >= GFX9) + ib_align = MAX2(ib_align, info->tcc_cache_line_size); + /* The kernel pads gfx and compute IBs to 256 dwords since: + * 66f3b2d527154bd258a57c8815004b5964aa1cf5 + * Do the same. + */ + ib_align = MAX2(ib_align, 1024); + info->ib_alignment = ib_align; + + if ((info->drm_minor >= 31 && (info->family == CHIP_RAVEN || info->family == CHIP_RAVEN2 || + info->family == CHIP_RENOIR)) || + (info->drm_minor >= 34 && (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14)) || + info->chip_class >= GFX10_3) { + if (info->num_render_backends == 1) + info->use_display_dcc_unaligned = true; + else + info->use_display_dcc_with_retile_blit = true; + } + + info->has_gds_ordered_append = info->chip_class >= GFX7 && info->drm_minor >= 29; + + if (info->chip_class >= GFX9) { + unsigned pc_lines = 0; + + switch (info->family) { + case CHIP_VEGA10: + case CHIP_VEGA12: + case CHIP_VEGA20: + pc_lines = 2048; + break; + case CHIP_RAVEN: + case CHIP_RAVEN2: + case CHIP_RENOIR: + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: + pc_lines = 1024; + break; + case CHIP_NAVI14: + pc_lines = 512; + break; + case CHIP_ARCTURUS: + break; + default: + assert(0); + } + + info->pc_lines = pc_lines; + + if (info->chip_class >= GFX10) { + info->pbb_max_alloc_count = pc_lines / 3; + } else { + info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se)); + } + } + + /* The number of SDPs is the same as the number of TCCs for now. 
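Net effect of the MAX2() chain: take the strictest IB start/size alignment any IP engine reports, raise it to the TCC cache-line size on GFX9+, then apply the 1024-byte (256-dword) kernel-padding floor. Since the per-engine values are typically well below 1 KiB, ib_alignment usually comes out as exactly 1024.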
*/ + if (info->chip_class >= GFX10) + info->num_sdp_interfaces = device_info.num_tcc_blocks; + + if (info->chip_class >= GFX10_3) + info->max_wave64_per_simd = 16; + else if (info->chip_class == GFX10) + info->max_wave64_per_simd = 20; + else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM) + info->max_wave64_per_simd = 8; + else + info->max_wave64_per_simd = 10; + + if (info->chip_class >= GFX10) { + info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd; + info->min_sgpr_alloc = 128; + info->sgpr_alloc_granularity = 128; + /* Don't use late alloc on small chips. */ + info->use_late_alloc = info->num_render_backends > 4; + } else if (info->chip_class >= GFX8) { + info->num_physical_sgprs_per_simd = 800; + info->min_sgpr_alloc = 16; + info->sgpr_alloc_granularity = 16; + info->use_late_alloc = true; + } else { + info->num_physical_sgprs_per_simd = 512; + info->min_sgpr_alloc = 8; + info->sgpr_alloc_granularity = 8; + /* Potential hang on Kabini: */ + info->use_late_alloc = info->family != CHIP_KABINI; + } + + info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104; + + info->min_wave64_vgpr_alloc = 4; + info->max_vgpr_alloc = 256; + info->wave64_vgpr_alloc_granularity = 4; + + info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256; + info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4; + + return true; } void ac_compute_driver_uuid(char *uuid, size_t size) { - char amd_uuid[] = "AMD-MESA-DRV"; + char amd_uuid[] = "AMD-MESA-DRV"; - assert(size >= sizeof(amd_uuid)); + assert(size >= sizeof(amd_uuid)); - memset(uuid, 0, size); - strncpy(uuid, amd_uuid, size); + memset(uuid, 0, size); + strncpy(uuid, amd_uuid, size); } void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size) { - uint32_t *uint_uuid = (uint32_t*)uuid; - - assert(size >= sizeof(uint32_t)*4); - - /** - * Use the device info directly instead of using a sha1. GL/VK UUIDs - * are 16 byte vs 20 byte for sha1, and the truncation that would be - * required would get rid of part of the little entropy we have. - * */ - memset(uuid, 0, size); - uint_uuid[0] = info->pci_domain; - uint_uuid[1] = info->pci_bus; - uint_uuid[2] = info->pci_dev; - uint_uuid[3] = info->pci_func; + uint32_t *uint_uuid = (uint32_t *)uuid; + + assert(size >= sizeof(uint32_t) * 4); + + /** + * Use the device info directly instead of using a sha1. GL/VK UUIDs + * are 16 byte vs 20 byte for sha1, and the truncation that would be + * required would get rid of part of the little entropy we have. 
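The device UUID is literally the PCI BDF packed into four dwords: for a GPU at 0000:03:00.0, the 16 bytes are {0, 3, 0, 0} as host-order uint32 values, which preserves all of the (limited) entropy that a sha1-then-truncate scheme would partly discard.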
+ * */ + memset(uuid, 0, size); + uint_uuid[0] = info->pci_domain; + uint_uuid[1] = info->pci_bus; + uint_uuid[2] = info->pci_dev; + uint_uuid[3] = info->pci_func; } void ac_print_gpu_info(struct radeon_info *info) { - printf("Device info:\n"); - printf(" pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", - info->pci_domain, info->pci_bus, - info->pci_dev, info->pci_func); - - printf(" name = %s\n", info->name); - printf(" marketing_name = %s\n", info->marketing_name); - printf(" is_pro_graphics = %u\n", info->is_pro_graphics); - printf(" pci_id = 0x%x\n", info->pci_id); - printf(" pci_rev_id = 0x%x\n", info->pci_rev_id); - printf(" family = %i\n", info->family); - printf(" chip_class = %i\n", info->chip_class); - printf(" family_id = %i\n", info->family_id); - printf(" chip_external_rev = %i\n", info->chip_external_rev); - printf(" clock_crystal_freq = %i\n", info->clock_crystal_freq); - - printf("Features:\n"); - printf(" has_graphics = %i\n", info->has_graphics); - printf(" num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]); - printf(" num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]); - printf(" num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]); - printf(" num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]); - printf(" num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]); - printf(" num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]); - printf(" num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]); - printf(" num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]); - printf(" num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]); - printf(" has_clear_state = %u\n", info->has_clear_state); - printf(" has_distributed_tess = %u\n", info->has_distributed_tess); - printf(" has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode); - printf(" has_rbplus = %u\n", info->has_rbplus); - printf(" rbplus_allowed = %u\n", info->rbplus_allowed); - printf(" has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt); - printf(" has_out_of_order_rast = %u\n", info->has_out_of_order_rast); - printf(" cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory); - printf(" has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug); - printf(" has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug); - printf(" has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug); - printf(" has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug); - - printf("Display features:\n"); - printf(" use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned); - printf(" use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit); - - printf("Memory info:\n"); - printf(" pte_fragment_size = %u\n", info->pte_fragment_size); - printf(" gart_page_size = %u\n", info->gart_page_size); - printf(" gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024*1024)); - printf(" vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024*1024)); - printf(" vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024*1024)); - printf(" vram_type = %i\n", info->vram_type); - printf(" vram_bit_width = %i\n", info->vram_bit_width); - printf(" gds_size = %u kB\n", info->gds_size / 1024); - printf(" gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024); - printf(" max_alloc_size = %i MB\n", - (int)DIV_ROUND_UP(info->max_alloc_size, 1024*1024)); - printf(" min_alloc_size = %u\n", info->min_alloc_size); - printf(" address32_hi = %u\n", info->address32_hi); - printf(" 
has_dedicated_vram = %u\n", info->has_dedicated_vram); - printf(" num_sdp_interfaces = %u\n", info->num_sdp_interfaces); - printf(" num_tcc_blocks = %i\n", info->num_tcc_blocks); - printf(" tcc_cache_line_size = %u\n", info->tcc_cache_line_size); - printf(" tcc_harvested = %u\n", info->tcc_harvested); - printf(" pc_lines = %u\n", info->pc_lines); - printf(" lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup); - printf(" lds_granularity = %i\n", info->lds_granularity); - printf(" max_memory_clock = %i\n", info->max_memory_clock); - printf(" ce_ram_size = %i\n", info->ce_ram_size); - printf(" l1_cache_size = %i\n", info->l1_cache_size); - printf(" l2_cache_size = %i\n", info->l2_cache_size); - - printf("CP info:\n"); - printf(" gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2); - printf(" ib_alignment = %u\n", info->ib_alignment); - printf(" me_fw_version = %i\n", info->me_fw_version); - printf(" me_fw_feature = %i\n", info->me_fw_feature); - printf(" pfp_fw_version = %i\n", info->pfp_fw_version); - printf(" pfp_fw_feature = %i\n", info->pfp_fw_feature); - printf(" ce_fw_version = %i\n", info->ce_fw_version); - printf(" ce_fw_feature = %i\n", info->ce_fw_feature); - - printf("Multimedia info:\n"); - printf(" has_hw_decode = %u\n", info->has_hw_decode); - printf(" uvd_enc_supported = %u\n", info->uvd_enc_supported); - printf(" uvd_fw_version = %u\n", info->uvd_fw_version); - printf(" vce_fw_version = %u\n", info->vce_fw_version); - printf(" vce_harvest_config = %i\n", info->vce_harvest_config); - - printf("Kernel & winsys capabilities:\n"); - printf(" drm = %i.%i.%i\n", info->drm_major, - info->drm_minor, info->drm_patchlevel); - printf(" has_userptr = %i\n", info->has_userptr); - printf(" has_syncobj = %u\n", info->has_syncobj); - printf(" has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit); - printf(" has_timeline_syncobj = %u\n", info->has_timeline_syncobj); - printf(" has_fence_to_handle = %u\n", info->has_fence_to_handle); - printf(" has_ctx_priority = %u\n", info->has_ctx_priority); - printf(" has_local_buffers = %u\n", info->has_local_buffers); - printf(" kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib); - printf(" htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling); - printf(" si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed); - printf(" has_bo_metadata = %u\n", info->has_bo_metadata); - printf(" has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query); - printf(" has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator); - printf(" has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7); - printf(" kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib); - printf(" has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch); - printf(" has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads); - printf(" has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings); - printf(" has_2d_tiling = %u\n", info->has_2d_tiling); - printf(" has_read_registers_query = %u\n", info->has_read_registers_query); - printf(" has_gds_ordered_append = %u\n", info->has_gds_ordered_append); - printf(" has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency); - printf(" mid_command_buffer_preemption_enabled = %u\n", info->mid_command_buffer_preemption_enabled); - - printf("Shader core info:\n"); - printf(" max_shader_clock = %i\n", info->max_shader_clock); - printf(" 
num_good_compute_units = %i\n", info->num_good_compute_units); - printf(" max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa); - printf(" min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa); - printf(" max_se = %i\n", info->max_se); - printf(" max_sh_per_se = %i\n", info->max_sh_per_se); - printf(" max_wave64_per_simd = %i\n", info->max_wave64_per_simd); - printf(" num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd); - printf(" num_physical_wave64_vgprs_per_simd = %i\n", info->num_physical_wave64_vgprs_per_simd); - printf(" num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit); - printf(" min_sgpr_alloc = %i\n", info->min_sgpr_alloc); - printf(" max_sgpr_alloc = %i\n", info->max_sgpr_alloc); - printf(" sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity); - printf(" min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc); - printf(" max_vgpr_alloc = %i\n", info->max_vgpr_alloc); - printf(" wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity); - - printf("Render backend info:\n"); - printf(" pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); - printf(" num_render_backends = %i\n", info->num_render_backends); - printf(" num_tile_pipes = %i\n", info->num_tile_pipes); - printf(" pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes); - printf(" enabled_rb_mask = 0x%x\n", info->enabled_rb_mask); - printf(" max_alignment = %u\n", (unsigned)info->max_alignment); - printf(" pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count); - - printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config); - if (info->chip_class >= GFX10) { - printf(" num_pipes = %u\n", - 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); - printf(" pipe_interleave_size = %u\n", - 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); - printf(" max_compressed_frags = %u\n", - 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); - } else if (info->chip_class == GFX9) { - printf(" num_pipes = %u\n", - 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); - printf(" pipe_interleave_size = %u\n", - 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); - printf(" max_compressed_frags = %u\n", - 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); - printf(" bank_interleave_size = %u\n", - 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); - printf(" num_banks = %u\n", - 1 << G_0098F8_NUM_BANKS(info->gb_addr_config)); - printf(" shader_engine_tile_size = %u\n", - 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); - printf(" num_shader_engines = %u\n", - 1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config)); - printf(" num_gpus = %u (raw)\n", - G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config)); - printf(" multi_gpu_tile_size = %u (raw)\n", - G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); - printf(" num_rb_per_se = %u\n", - 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config)); - printf(" row_size = %u\n", - 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); - printf(" num_lower_pipes = %u (raw)\n", - G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); - printf(" se_enable = %u (raw)\n", - G_0098F8_SE_ENABLE(info->gb_addr_config)); - } else { - printf(" num_pipes = %u\n", - 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); - printf(" pipe_interleave_size = %u\n", - 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config)); - printf(" bank_interleave_size = %u\n", - 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); - printf(" num_shader_engines = %u\n", - 1 << 
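All of the GB_ADDR_CONFIG decodes in this function follow one pattern: the raw G_0098F8_*() field value n is scaled by the field's unit, 1 << n for counts (pipes, banks, shader engines, RBs, compressed frags), 256 << n for the pipe interleave, 1024 << n for the row size, and 16 << n for the shader-engine tile size. So a NUM_PIPES field of 4 prints as 16 pipes, and a PIPE_INTERLEAVE_SIZE field of 0 as 256 bytes.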
G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config)); - printf(" shader_engine_tile_size = %u\n", - 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); - printf(" num_gpus = %u (raw)\n", - G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config)); - printf(" multi_gpu_tile_size = %u (raw)\n", - G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); - printf(" row_size = %u\n", - 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); - printf(" num_lower_pipes = %u (raw)\n", - G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); - } + printf("Device info:\n"); + printf(" pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci_domain, info->pci_bus, + info->pci_dev, info->pci_func); + + printf(" name = %s\n", info->name); + printf(" marketing_name = %s\n", info->marketing_name); + printf(" is_pro_graphics = %u\n", info->is_pro_graphics); + printf(" pci_id = 0x%x\n", info->pci_id); + printf(" pci_rev_id = 0x%x\n", info->pci_rev_id); + printf(" family = %i\n", info->family); + printf(" chip_class = %i\n", info->chip_class); + printf(" family_id = %i\n", info->family_id); + printf(" chip_external_rev = %i\n", info->chip_external_rev); + printf(" clock_crystal_freq = %i\n", info->clock_crystal_freq); + + printf("Features:\n"); + printf(" has_graphics = %i\n", info->has_graphics); + printf(" num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]); + printf(" num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]); + printf(" num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]); + printf(" num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]); + printf(" num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]); + printf(" num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]); + printf(" num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]); + printf(" num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]); + printf(" num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]); + printf(" has_clear_state = %u\n", info->has_clear_state); + printf(" has_distributed_tess = %u\n", info->has_distributed_tess); + printf(" has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode); + printf(" has_rbplus = %u\n", info->has_rbplus); + printf(" rbplus_allowed = %u\n", info->rbplus_allowed); + printf(" has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt); + printf(" has_out_of_order_rast = %u\n", info->has_out_of_order_rast); + printf(" cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory); + printf(" has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug); + printf(" has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug); + printf(" has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug); + printf(" has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug); + + printf("Display features:\n"); + printf(" use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned); + printf(" use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit); + + printf("Memory info:\n"); + printf(" pte_fragment_size = %u\n", info->pte_fragment_size); + printf(" gart_page_size = %u\n", info->gart_page_size); + printf(" gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024 * 1024)); + printf(" vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024 * 1024)); + printf(" vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024 * 1024)); + printf(" vram_type = %i\n", info->vram_type); + printf(" vram_bit_width = %i\n", info->vram_bit_width); + printf(" gds_size = %u kB\n", info->gds_size / 
1024); + printf(" gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024); + printf(" max_alloc_size = %i MB\n", (int)DIV_ROUND_UP(info->max_alloc_size, 1024 * 1024)); + printf(" min_alloc_size = %u\n", info->min_alloc_size); + printf(" address32_hi = %u\n", info->address32_hi); + printf(" has_dedicated_vram = %u\n", info->has_dedicated_vram); + printf(" num_sdp_interfaces = %u\n", info->num_sdp_interfaces); + printf(" num_tcc_blocks = %i\n", info->num_tcc_blocks); + printf(" tcc_cache_line_size = %u\n", info->tcc_cache_line_size); + printf(" tcc_harvested = %u\n", info->tcc_harvested); + printf(" pc_lines = %u\n", info->pc_lines); + printf(" lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup); + printf(" lds_granularity = %i\n", info->lds_granularity); + printf(" max_memory_clock = %i\n", info->max_memory_clock); + printf(" ce_ram_size = %i\n", info->ce_ram_size); + printf(" l1_cache_size = %i\n", info->l1_cache_size); + printf(" l2_cache_size = %i\n", info->l2_cache_size); + + printf("CP info:\n"); + printf(" gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2); + printf(" ib_alignment = %u\n", info->ib_alignment); + printf(" me_fw_version = %i\n", info->me_fw_version); + printf(" me_fw_feature = %i\n", info->me_fw_feature); + printf(" pfp_fw_version = %i\n", info->pfp_fw_version); + printf(" pfp_fw_feature = %i\n", info->pfp_fw_feature); + printf(" ce_fw_version = %i\n", info->ce_fw_version); + printf(" ce_fw_feature = %i\n", info->ce_fw_feature); + + printf("Multimedia info:\n"); + printf(" has_hw_decode = %u\n", info->has_hw_decode); + printf(" uvd_enc_supported = %u\n", info->uvd_enc_supported); + printf(" uvd_fw_version = %u\n", info->uvd_fw_version); + printf(" vce_fw_version = %u\n", info->vce_fw_version); + printf(" vce_harvest_config = %i\n", info->vce_harvest_config); + + printf("Kernel & winsys capabilities:\n"); + printf(" drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel); + printf(" has_userptr = %i\n", info->has_userptr); + printf(" has_syncobj = %u\n", info->has_syncobj); + printf(" has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit); + printf(" has_timeline_syncobj = %u\n", info->has_timeline_syncobj); + printf(" has_fence_to_handle = %u\n", info->has_fence_to_handle); + printf(" has_ctx_priority = %u\n", info->has_ctx_priority); + printf(" has_local_buffers = %u\n", info->has_local_buffers); + printf(" kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib); + printf(" htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling); + printf(" si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed); + printf(" has_bo_metadata = %u\n", info->has_bo_metadata); + printf(" has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query); + printf(" has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator); + printf(" has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7); + printf(" kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib); + printf(" has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch); + printf(" has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads); + printf(" has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings); + printf(" has_2d_tiling = %u\n", info->has_2d_tiling); + printf(" has_read_registers_query = %u\n", info->has_read_registers_query); + printf(" has_gds_ordered_append = %u\n", info->has_gds_ordered_append); + printf(" 
has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency); + printf(" mid_command_buffer_preemption_enabled = %u\n", + info->mid_command_buffer_preemption_enabled); + + printf("Shader core info:\n"); + printf(" max_shader_clock = %i\n", info->max_shader_clock); + printf(" num_good_compute_units = %i\n", info->num_good_compute_units); + printf(" max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa); + printf(" min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa); + printf(" max_se = %i\n", info->max_se); + printf(" max_sh_per_se = %i\n", info->max_sh_per_se); + printf(" max_wave64_per_simd = %i\n", info->max_wave64_per_simd); + printf(" num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd); + printf(" num_physical_wave64_vgprs_per_simd = %i\n", + info->num_physical_wave64_vgprs_per_simd); + printf(" num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit); + printf(" min_sgpr_alloc = %i\n", info->min_sgpr_alloc); + printf(" max_sgpr_alloc = %i\n", info->max_sgpr_alloc); + printf(" sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity); + printf(" min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc); + printf(" max_vgpr_alloc = %i\n", info->max_vgpr_alloc); + printf(" wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity); + + printf("Render backend info:\n"); + printf(" pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override); + printf(" num_render_backends = %i\n", info->num_render_backends); + printf(" num_tile_pipes = %i\n", info->num_tile_pipes); + printf(" pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes); + printf(" enabled_rb_mask = 0x%x\n", info->enabled_rb_mask); + printf(" max_alignment = %u\n", (unsigned)info->max_alignment); + printf(" pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count); + + printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config); + if (info->chip_class >= GFX10) { + printf(" num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); + printf(" pipe_interleave_size = %u\n", + 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); + printf(" max_compressed_frags = %u\n", + 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); + } else if (info->chip_class == GFX9) { + printf(" num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config)); + printf(" pipe_interleave_size = %u\n", + 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config)); + printf(" max_compressed_frags = %u\n", + 1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config)); + printf(" bank_interleave_size = %u\n", + 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); + printf(" num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config)); + printf(" shader_engine_tile_size = %u\n", + 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); + printf(" num_shader_engines = %u\n", + 1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config)); + printf(" num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config)); + printf(" multi_gpu_tile_size = %u (raw)\n", + G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); + printf(" num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config)); + printf(" row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); + printf(" num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); + printf(" se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config)); + } else { + printf(" num_pipes = %u\n", 1 << 
G_0098F8_NUM_PIPES(info->gb_addr_config)); + printf(" pipe_interleave_size = %u\n", + 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config)); + printf(" bank_interleave_size = %u\n", + 1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config)); + printf(" num_shader_engines = %u\n", + 1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config)); + printf(" shader_engine_tile_size = %u\n", + 16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config)); + printf(" num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config)); + printf(" multi_gpu_tile_size = %u (raw)\n", + G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config)); + printf(" row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config)); + printf(" num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config)); + } } -int -ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family) +int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family) { - if (chip_class >= GFX9) - return -1; - - switch (family) { - case CHIP_OLAND: - case CHIP_HAINAN: - case CHIP_KAVERI: - case CHIP_KABINI: - case CHIP_ICELAND: - case CHIP_CARRIZO: - case CHIP_STONEY: - return 16; - case CHIP_TAHITI: - case CHIP_PITCAIRN: - case CHIP_VERDE: - case CHIP_BONAIRE: - case CHIP_HAWAII: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - return 32; - default: - unreachable("Unknown GPU"); - } + if (chip_class >= GFX9) + return -1; + + switch (family) { + case CHIP_OLAND: + case CHIP_HAINAN: + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_ICELAND: + case CHIP_CARRIZO: + case CHIP_STONEY: + return 16; + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + return 32; + default: + unreachable("Unknown GPU"); + } } -void -ac_get_raster_config(struct radeon_info *info, - uint32_t *raster_config_p, - uint32_t *raster_config_1_p, - uint32_t *se_tile_repeat_p) +void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p, + uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p) { - unsigned raster_config, raster_config_1, se_tile_repeat; - - switch (info->family) { - /* 1 SE / 1 RB */ - case CHIP_HAINAN: - case CHIP_KABINI: - case CHIP_STONEY: - raster_config = 0x00000000; - raster_config_1 = 0x00000000; - break; - /* 1 SE / 4 RBs */ - case CHIP_VERDE: - raster_config = 0x0000124a; - raster_config_1 = 0x00000000; - break; - /* 1 SE / 2 RBs (Oland is special) */ - case CHIP_OLAND: - raster_config = 0x00000082; - raster_config_1 = 0x00000000; - break; - /* 1 SE / 2 RBs */ - case CHIP_KAVERI: - case CHIP_ICELAND: - case CHIP_CARRIZO: - raster_config = 0x00000002; - raster_config_1 = 0x00000000; - break; - /* 2 SEs / 4 RBs */ - case CHIP_BONAIRE: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - raster_config = 0x16000012; - raster_config_1 = 0x00000000; - break; - /* 2 SEs / 8 RBs */ - case CHIP_TAHITI: - case CHIP_PITCAIRN: - raster_config = 0x2a00126a; - raster_config_1 = 0x00000000; - break; - /* 4 SEs / 8 RBs */ - case CHIP_TONGA: - case CHIP_POLARIS10: - raster_config = 0x16000012; - raster_config_1 = 0x0000002a; - break; - /* 4 SEs / 16 RBs */ - case CHIP_HAWAII: - case CHIP_FIJI: - case CHIP_VEGAM: - raster_config = 0x3a00161a; - raster_config_1 = 0x0000002e; - break; - default: - fprintf(stderr, - "ac: Unknown GPU, using 0 for 
raster_config\n"); - raster_config = 0x00000000; - raster_config_1 = 0x00000000; - break; - } - - /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it. - * This decreases performance by up to 50% when the RB is the bottleneck. - */ - if (info->family == CHIP_KAVERI && !info->is_amdgpu) - raster_config = 0x00000000; - - /* Fiji: Old kernels have incorrect tiling config. This decreases - * RB performance by 25%. (it disables 1 RB in the second packer) - */ - if (info->family == CHIP_FIJI && - info->cik_macrotile_mode_array[0] == 0x000000e8) { - raster_config = 0x16000012; - raster_config_1 = 0x0000002a; - } - - unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config); - unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config); - - /* I don't know how to calculate this, though this is probably a good guess. */ - se_tile_repeat = MAX2(se_width, se_height) * info->max_se; - - *raster_config_p = raster_config; - *raster_config_1_p = raster_config_1; - if (se_tile_repeat_p) - *se_tile_repeat_p = se_tile_repeat; + unsigned raster_config, raster_config_1, se_tile_repeat; + + switch (info->family) { + /* 1 SE / 1 RB */ + case CHIP_HAINAN: + case CHIP_KABINI: + case CHIP_STONEY: + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + /* 1 SE / 4 RBs */ + case CHIP_VERDE: + raster_config = 0x0000124a; + raster_config_1 = 0x00000000; + break; + /* 1 SE / 2 RBs (Oland is special) */ + case CHIP_OLAND: + raster_config = 0x00000082; + raster_config_1 = 0x00000000; + break; + /* 1 SE / 2 RBs */ + case CHIP_KAVERI: + case CHIP_ICELAND: + case CHIP_CARRIZO: + raster_config = 0x00000002; + raster_config_1 = 0x00000000; + break; + /* 2 SEs / 4 RBs */ + case CHIP_BONAIRE: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; + /* 2 SEs / 8 RBs */ + case CHIP_TAHITI: + case CHIP_PITCAIRN: + raster_config = 0x2a00126a; + raster_config_1 = 0x00000000; + break; + /* 4 SEs / 8 RBs */ + case CHIP_TONGA: + case CHIP_POLARIS10: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + /* 4 SEs / 16 RBs */ + case CHIP_HAWAII: + case CHIP_FIJI: + case CHIP_VEGAM: + raster_config = 0x3a00161a; + raster_config_1 = 0x0000002e; + break; + default: + fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n"); + raster_config = 0x00000000; + raster_config_1 = 0x00000000; + break; + } + + /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it. + * This decreases performance by up to 50% when the RB is the bottleneck. + */ + if (info->family == CHIP_KAVERI && !info->is_amdgpu) + raster_config = 0x00000000; + + /* Fiji: Old kernels have incorrect tiling config. This decreases + * RB performance by 25%. (it disables 1 RB in the second packer) + */ + if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) { + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + } + + unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config); + unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config); + + /* I don't know how to calculate this, though this is probably a good guess. 
*/ + se_tile_repeat = MAX2(se_width, se_height) * info->max_se; + + *raster_config_p = raster_config; + *raster_config_1_p = raster_config_1; + if (se_tile_repeat_p) + *se_tile_repeat_p = se_tile_repeat; } -void -ac_get_harvested_configs(struct radeon_info *info, - unsigned raster_config, - unsigned *cik_raster_config_1_p, - unsigned *raster_config_se) +void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config, + unsigned *cik_raster_config_1_p, unsigned *raster_config_se) { - unsigned sh_per_se = MAX2(info->max_sh_per_se, 1); - unsigned num_se = MAX2(info->max_se, 1); - unsigned rb_mask = info->enabled_rb_mask; - unsigned num_rb = MIN2(info->num_render_backends, 16); - unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); - unsigned rb_per_se = num_rb / num_se; - unsigned se_mask[4]; - unsigned se; - - se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; - se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; - se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; - se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; - - assert(num_se == 1 || num_se == 2 || num_se == 4); - assert(sh_per_se == 1 || sh_per_se == 2); - assert(rb_per_pkr == 1 || rb_per_pkr == 2); - - - if (info->chip_class >= GFX7) { - unsigned raster_config_1 = *cik_raster_config_1_p; - if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || - (!se_mask[2] && !se_mask[3]))) { - raster_config_1 &= C_028354_SE_PAIR_MAP; - - if (!se_mask[0] && !se_mask[1]) { - raster_config_1 |= - S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); - } else { - raster_config_1 |= - S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); - } - *cik_raster_config_1_p = raster_config_1; - } - } - - for (se = 0; se < num_se; se++) { - unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); - unsigned pkr1_mask = pkr0_mask << rb_per_pkr; - int idx = (se / 2) * 2; - - raster_config_se[se] = raster_config; - if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { - raster_config_se[se] &= C_028350_SE_MAP; - - if (!se_mask[idx]) { - raster_config_se[se] |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); - } else { - raster_config_se[se] |= - S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); - } - } - - pkr0_mask &= rb_mask; - pkr1_mask &= rb_mask; - if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { - raster_config_se[se] &= C_028350_PKR_MAP; - - if (!pkr0_mask) { - raster_config_se[se] |= - S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3); - } else { - raster_config_se[se] |= - S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0); - } - } - - if (rb_per_se >= 2) { - unsigned rb0_mask = 1 << (se * rb_per_se); - unsigned rb1_mask = rb0_mask << 1; - - rb0_mask &= rb_mask; - rb1_mask &= rb_mask; - if (!rb0_mask || !rb1_mask) { - raster_config_se[se] &= C_028350_RB_MAP_PKR0; - - if (!rb0_mask) { - raster_config_se[se] |= - S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3); - } else { - raster_config_se[se] |= - S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0); - } - } - - if (rb_per_se > 2) { - rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); - rb1_mask = rb0_mask << 1; - rb0_mask &= rb_mask; - rb1_mask &= rb_mask; - if (!rb0_mask || !rb1_mask) { - raster_config_se[se] &= C_028350_RB_MAP_PKR1; - - if (!rb0_mask) { - raster_config_se[se] |= - S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3); - } else { - raster_config_se[se] |= - S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0); - } - } - } - } - } + unsigned sh_per_se = MAX2(info->max_sh_per_se, 1); + unsigned num_se = MAX2(info->max_se, 1); + unsigned 
rb_mask = info->enabled_rb_mask; + unsigned num_rb = MIN2(info->num_render_backends, 16); + unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2); + unsigned rb_per_se = num_rb / num_se; + unsigned se_mask[4]; + unsigned se; + + se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask; + se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask; + se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask; + se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask; + + assert(num_se == 1 || num_se == 2 || num_se == 4); + assert(sh_per_se == 1 || sh_per_se == 2); + assert(rb_per_pkr == 1 || rb_per_pkr == 2); + + if (info->chip_class >= GFX7) { + unsigned raster_config_1 = *cik_raster_config_1_p; + if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) { + raster_config_1 &= C_028354_SE_PAIR_MAP; + + if (!se_mask[0] && !se_mask[1]) { + raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3); + } else { + raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0); + } + *cik_raster_config_1_p = raster_config_1; + } + } + + for (se = 0; se < num_se; se++) { + unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se); + unsigned pkr1_mask = pkr0_mask << rb_per_pkr; + int idx = (se / 2) * 2; + + raster_config_se[se] = raster_config; + if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) { + raster_config_se[se] &= C_028350_SE_MAP; + + if (!se_mask[idx]) { + raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3); + } else { + raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0); + } + } + + pkr0_mask &= rb_mask; + pkr1_mask &= rb_mask; + if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) { + raster_config_se[se] &= C_028350_PKR_MAP; + + if (!pkr0_mask) { + raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3); + } else { + raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0); + } + } + + if (rb_per_se >= 2) { + unsigned rb0_mask = 1 << (se * rb_per_se); + unsigned rb1_mask = rb0_mask << 1; + + rb0_mask &= rb_mask; + rb1_mask &= rb_mask; + if (!rb0_mask || !rb1_mask) { + raster_config_se[se] &= C_028350_RB_MAP_PKR0; + + if (!rb0_mask) { + raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3); + } else { + raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0); + } + } + + if (rb_per_se > 2) { + rb0_mask = 1 << (se * rb_per_se + rb_per_pkr); + rb1_mask = rb0_mask << 1; + rb0_mask &= rb_mask; + rb1_mask &= rb_mask; + if (!rb0_mask || !rb1_mask) { + raster_config_se[se] &= C_028350_RB_MAP_PKR1; + + if (!rb0_mask) { + raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3); + } else { + raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0); + } + } + } + } + } } -unsigned ac_get_compute_resource_limits(struct radeon_info *info, - unsigned waves_per_threadgroup, - unsigned max_waves_per_sh, - unsigned threadgroups_per_cu) +unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup, + unsigned max_waves_per_sh, unsigned threadgroups_per_cu) { - unsigned compute_resource_limits = - S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); - - if (info->chip_class >= GFX7) { - unsigned num_cu_per_se = info->num_good_compute_units / - info->max_se; - - /* Force even distribution on all SIMDs in CU if the workgroup - * size is 64. This has shown some good improvements if # of CUs - * per SE is not a multiple of 4. 
- */ - if (num_cu_per_se % 4 && waves_per_threadgroup == 1) - compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); - - assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); - compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) | - S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); - } else { - /* GFX6 */ - if (max_waves_per_sh) { - unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); - compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16); - } - } - return compute_resource_limits; + unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); + + if (info->chip_class >= GFX7) { + unsigned num_cu_per_se = info->num_good_compute_units / info->max_se; + + /* Force even distribution on all SIMDs in CU if the workgroup + * size is 64. This has shown some good improvements if # of CUs + * per SE is not a multiple of 4. + */ + if (num_cu_per_se % 4 && waves_per_threadgroup == 1) + compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); + + assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); + compute_resource_limits |= + S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); + } else { + /* GFX6 */ + if (max_waves_per_sh) { + unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); + compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16); + } + } + return compute_resource_limits; } diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 70e53f1..f6d4e62 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -26,10 +26,11 @@ #ifndef AC_GPU_INFO_H #define AC_GPU_INFO_H +#include "amd_family.h" + +#include #include #include -#include -#include "amd_family.h" #ifdef __cplusplus extern "C" { @@ -38,186 +39,179 @@ extern "C" { struct amdgpu_gpu_info; struct radeon_info { - /* PCI info: domain:bus:dev:func */ - uint32_t pci_domain; - uint32_t pci_bus; - uint32_t pci_dev; - uint32_t pci_func; - - /* Device info. */ - const char *name; - const char *marketing_name; - bool is_pro_graphics; - uint32_t pci_id; - uint32_t pci_rev_id; - enum radeon_family family; - enum chip_class chip_class; - uint32_t family_id; - uint32_t chip_external_rev; - uint32_t clock_crystal_freq; - - /* Features. */ - bool has_graphics; /* false if the chip is compute-only */ - uint32_t num_rings[NUM_RING_TYPES]; - uint32_t ib_pad_dw_mask[NUM_RING_TYPES]; - bool has_clear_state; - bool has_distributed_tess; - bool has_dcc_constant_encode; - bool has_rbplus; /* if RB+ registers exist */ - bool rbplus_allowed; /* if RB+ is allowed */ - bool has_load_ctx_reg_pkt; - bool has_out_of_order_rast; - bool has_packed_math_16bit; - bool cpdma_prefetch_writes_memory; - bool has_gfx9_scissor_bug; - bool has_tc_compat_zrange_bug; - bool has_msaa_sample_loc_bug; - bool has_ls_vgpr_init_bug; - - /* Display features. */ - /* There are 2 display DCC codepaths, because display expects unaligned DCC. */ - /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */ - bool use_display_dcc_unaligned; - /* Allocate both aligned and unaligned DCC and use the retile blit. */ - bool use_display_dcc_with_retile_blit; - - /* Memory info. 
*/ - uint32_t pte_fragment_size; - uint32_t gart_page_size; - uint64_t gart_size; - uint64_t vram_size; - uint64_t vram_vis_size; - uint32_t vram_bit_width; - uint32_t vram_type; - unsigned gds_size; - unsigned gds_gfx_partition_size; - uint64_t max_alloc_size; - uint32_t min_alloc_size; - uint32_t address32_hi; - bool has_dedicated_vram; - bool has_l2_uncached; - bool r600_has_virtual_memory; - uint32_t num_sdp_interfaces; - uint32_t num_tcc_blocks; - uint32_t tcc_cache_line_size; - bool tcc_harvested; - unsigned pc_lines; - uint32_t lds_size_per_workgroup; - uint32_t lds_granularity; - uint32_t max_memory_clock; - uint32_t ce_ram_size; - uint32_t l1_cache_size; - uint32_t l2_cache_size; - - /* CP info. */ - bool gfx_ib_pad_with_type2; - unsigned ib_alignment; /* both start and size alignment */ - uint32_t me_fw_version; - uint32_t me_fw_feature; - uint32_t pfp_fw_version; - uint32_t pfp_fw_feature; - uint32_t ce_fw_version; - uint32_t ce_fw_feature; - - /* Multimedia info. */ - bool has_hw_decode; - bool uvd_enc_supported; - uint32_t uvd_fw_version; - uint32_t vce_fw_version; - uint32_t vce_harvest_config; - - /* Kernel & winsys capabilities. */ - uint32_t drm_major; /* version */ - uint32_t drm_minor; - uint32_t drm_patchlevel; - bool is_amdgpu; - bool has_userptr; - bool has_syncobj; - bool has_syncobj_wait_for_submit; - bool has_timeline_syncobj; - bool has_fence_to_handle; - bool has_ctx_priority; - bool has_local_buffers; - bool kernel_flushes_hdp_before_ib; - bool htile_cmask_support_1d_tiling; - bool si_TA_CS_BC_BASE_ADDR_allowed; - bool has_bo_metadata; - bool has_gpu_reset_status_query; - bool has_eqaa_surface_allocator; - bool has_format_bc1_through_bc7; - bool kernel_flushes_tc_l2_after_ib; - bool has_indirect_compute_dispatch; - bool has_unaligned_shader_loads; - bool has_sparse_vm_mappings; - bool has_2d_tiling; - bool has_read_registers_query; - bool has_gds_ordered_append; - bool has_scheduled_fence_dependency; - /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */ - bool mid_command_buffer_preemption_enabled; - - /* Shader cores. */ - uint32_t cu_mask[4][2]; - uint32_t r600_max_quad_pipes; /* wave size / 16 */ - uint32_t max_shader_clock; - uint32_t num_good_compute_units; - uint32_t max_good_cu_per_sa; - uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */ - uint32_t max_se; /* shader engines */ - uint32_t max_sh_per_se; /* shader arrays per shader engine */ - uint32_t max_wave64_per_simd; - uint32_t num_physical_sgprs_per_simd; - uint32_t num_physical_wave64_vgprs_per_simd; - uint32_t num_simd_per_compute_unit; - uint32_t min_sgpr_alloc; - uint32_t max_sgpr_alloc; - uint32_t sgpr_alloc_granularity; - uint32_t min_wave64_vgpr_alloc; - uint32_t max_vgpr_alloc; - uint32_t wave64_vgpr_alloc_granularity; - bool use_late_alloc; /* VS and GS: late pos/param allocation */ - - /* Render backends (color + depth blocks). */ - uint32_t r300_num_gb_pipes; - uint32_t r300_num_z_pipes; - uint32_t r600_gb_backend_map; /* R600 harvest config */ - bool r600_gb_backend_map_valid; - uint32_t r600_num_banks; - uint32_t gb_addr_config; - uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */ - uint32_t num_render_backends; - uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ - uint32_t pipe_interleave_bytes; - uint32_t enabled_rb_mask; /* GCN harvest config */ - uint64_t max_alignment; /* from addrlib */ - uint32_t pbb_max_alloc_count; - - /* Tile modes. 
*/ - uint32_t si_tile_mode_array[32]; - uint32_t cik_macrotile_mode_array[16]; + /* PCI info: domain:bus:dev:func */ + uint32_t pci_domain; + uint32_t pci_bus; + uint32_t pci_dev; + uint32_t pci_func; + + /* Device info. */ + const char *name; + const char *marketing_name; + bool is_pro_graphics; + uint32_t pci_id; + uint32_t pci_rev_id; + enum radeon_family family; + enum chip_class chip_class; + uint32_t family_id; + uint32_t chip_external_rev; + uint32_t clock_crystal_freq; + + /* Features. */ + bool has_graphics; /* false if the chip is compute-only */ + uint32_t num_rings[NUM_RING_TYPES]; + uint32_t ib_pad_dw_mask[NUM_RING_TYPES]; + bool has_clear_state; + bool has_distributed_tess; + bool has_dcc_constant_encode; + bool has_rbplus; /* if RB+ registers exist */ + bool rbplus_allowed; /* if RB+ is allowed */ + bool has_load_ctx_reg_pkt; + bool has_out_of_order_rast; + bool has_packed_math_16bit; + bool cpdma_prefetch_writes_memory; + bool has_gfx9_scissor_bug; + bool has_tc_compat_zrange_bug; + bool has_msaa_sample_loc_bug; + bool has_ls_vgpr_init_bug; + + /* Display features. */ + /* There are 2 display DCC codepaths, because display expects unaligned DCC. */ + /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */ + bool use_display_dcc_unaligned; + /* Allocate both aligned and unaligned DCC and use the retile blit. */ + bool use_display_dcc_with_retile_blit; + + /* Memory info. */ + uint32_t pte_fragment_size; + uint32_t gart_page_size; + uint64_t gart_size; + uint64_t vram_size; + uint64_t vram_vis_size; + uint32_t vram_bit_width; + uint32_t vram_type; + unsigned gds_size; + unsigned gds_gfx_partition_size; + uint64_t max_alloc_size; + uint32_t min_alloc_size; + uint32_t address32_hi; + bool has_dedicated_vram; + bool has_l2_uncached; + bool r600_has_virtual_memory; + uint32_t num_sdp_interfaces; + uint32_t num_tcc_blocks; + uint32_t tcc_cache_line_size; + bool tcc_harvested; + unsigned pc_lines; + uint32_t lds_size_per_workgroup; + uint32_t lds_granularity; + uint32_t max_memory_clock; + uint32_t ce_ram_size; + uint32_t l1_cache_size; + uint32_t l2_cache_size; + + /* CP info. */ + bool gfx_ib_pad_with_type2; + unsigned ib_alignment; /* both start and size alignment */ + uint32_t me_fw_version; + uint32_t me_fw_feature; + uint32_t pfp_fw_version; + uint32_t pfp_fw_feature; + uint32_t ce_fw_version; + uint32_t ce_fw_feature; + + /* Multimedia info. */ + bool has_hw_decode; + bool uvd_enc_supported; + uint32_t uvd_fw_version; + uint32_t vce_fw_version; + uint32_t vce_harvest_config; + + /* Kernel & winsys capabilities. */ + uint32_t drm_major; /* version */ + uint32_t drm_minor; + uint32_t drm_patchlevel; + bool is_amdgpu; + bool has_userptr; + bool has_syncobj; + bool has_syncobj_wait_for_submit; + bool has_timeline_syncobj; + bool has_fence_to_handle; + bool has_ctx_priority; + bool has_local_buffers; + bool kernel_flushes_hdp_before_ib; + bool htile_cmask_support_1d_tiling; + bool si_TA_CS_BC_BASE_ADDR_allowed; + bool has_bo_metadata; + bool has_gpu_reset_status_query; + bool has_eqaa_surface_allocator; + bool has_format_bc1_through_bc7; + bool kernel_flushes_tc_l2_after_ib; + bool has_indirect_compute_dispatch; + bool has_unaligned_shader_loads; + bool has_sparse_vm_mappings; + bool has_2d_tiling; + bool has_read_registers_query; + bool has_gds_ordered_append; + bool has_scheduled_fence_dependency; + /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */ + bool mid_command_buffer_preemption_enabled; + + /* Shader cores. 
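+    * (roughly: the CU/SIMD topology plus the SGPR/VGPR allocation
+    * granularities needed for occupancy estimates)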
*/ + uint32_t cu_mask[4][2]; + uint32_t r600_max_quad_pipes; /* wave size / 16 */ + uint32_t max_shader_clock; + uint32_t num_good_compute_units; + uint32_t max_good_cu_per_sa; + uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */ + uint32_t max_se; /* shader engines */ + uint32_t max_sh_per_se; /* shader arrays per shader engine */ + uint32_t max_wave64_per_simd; + uint32_t num_physical_sgprs_per_simd; + uint32_t num_physical_wave64_vgprs_per_simd; + uint32_t num_simd_per_compute_unit; + uint32_t min_sgpr_alloc; + uint32_t max_sgpr_alloc; + uint32_t sgpr_alloc_granularity; + uint32_t min_wave64_vgpr_alloc; + uint32_t max_vgpr_alloc; + uint32_t wave64_vgpr_alloc_granularity; + bool use_late_alloc; /* VS and GS: late pos/param allocation */ + + /* Render backends (color + depth blocks). */ + uint32_t r300_num_gb_pipes; + uint32_t r300_num_z_pipes; + uint32_t r600_gb_backend_map; /* R600 harvest config */ + bool r600_gb_backend_map_valid; + uint32_t r600_num_banks; + uint32_t gb_addr_config; + uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */ + uint32_t num_render_backends; + uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ + uint32_t pipe_interleave_bytes; + uint32_t enabled_rb_mask; /* GCN harvest config */ + uint64_t max_alignment; /* from addrlib */ + uint32_t pbb_max_alloc_count; + + /* Tile modes. */ + uint32_t si_tile_mode_array[32]; + uint32_t cik_macrotile_mode_array[16]; }; -bool ac_query_gpu_info(int fd, void *dev_p, - struct radeon_info *info, - struct amdgpu_gpu_info *amdinfo); +bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, + struct amdgpu_gpu_info *amdinfo); void ac_compute_driver_uuid(char *uuid, size_t size); void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size); void ac_print_gpu_info(struct radeon_info *info); int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family); -void ac_get_raster_config(struct radeon_info *info, - uint32_t *raster_config_p, - uint32_t *raster_config_1_p, - uint32_t *se_tile_repeat_p); -void ac_get_harvested_configs(struct radeon_info *info, - unsigned raster_config, - unsigned *cik_raster_config_1_p, - unsigned *raster_config_se); -unsigned ac_get_compute_resource_limits(struct radeon_info *info, - unsigned waves_per_threadgroup, - unsigned max_waves_per_sh, - unsigned threadgroups_per_cu); +void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p, + uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p); +void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config, + unsigned *cik_raster_config_1_p, unsigned *raster_config_se); +unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup, + unsigned max_waves_per_sh, unsigned threadgroups_per_cu); #ifdef __cplusplus } diff --git a/src/amd/common/ac_rtld.c b/src/amd/common/ac_rtld.c index e512b8f..8a9cd7c 100644 --- a/src/amd/common/ac_rtld.c +++ b/src/amd/common/ac_rtld.c @@ -23,6 +23,11 @@ #include "ac_rtld.h" +#include "ac_binary.h" +#include "ac_gpu_info.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" + #include #include #include @@ -30,11 +35,6 @@ #include #include -#include "ac_binary.h" -#include "ac_gpu_info.h" -#include "util/u_dynarray.h" -#include "util/u_math.h" - // Old distributions may not have this enum constant #define MY_EM_AMDGPU 224 @@ -47,71 +47,71 @@ #endif #ifndef R_AMDGPU_NONE -#define R_AMDGPU_NONE 0 -#define R_AMDGPU_ABS32_LO 1 -#define R_AMDGPU_ABS32_HI 2 
-#define R_AMDGPU_ABS64 3 -#define R_AMDGPU_REL32 4 -#define R_AMDGPU_REL64 5 -#define R_AMDGPU_ABS32 6 -#define R_AMDGPU_GOTPCREL 7 +#define R_AMDGPU_NONE 0 +#define R_AMDGPU_ABS32_LO 1 +#define R_AMDGPU_ABS32_HI 2 +#define R_AMDGPU_ABS64 3 +#define R_AMDGPU_REL32 4 +#define R_AMDGPU_REL64 5 +#define R_AMDGPU_ABS32 6 +#define R_AMDGPU_GOTPCREL 7 #define R_AMDGPU_GOTPCREL32_LO 8 #define R_AMDGPU_GOTPCREL32_HI 9 -#define R_AMDGPU_REL32_LO 10 -#define R_AMDGPU_REL32_HI 11 -#define R_AMDGPU_RELATIVE64 13 +#define R_AMDGPU_REL32_LO 10 +#define R_AMDGPU_REL32_HI 11 +#define R_AMDGPU_RELATIVE64 13 #endif /* For the UMR disassembler. */ -#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ -#define DEBUGGER_NUM_MARKERS 5 +#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ +#define DEBUGGER_NUM_MARKERS 5 struct ac_rtld_section { - bool is_rx : 1; - bool is_pasted_text : 1; - uint64_t offset; - const char *name; + bool is_rx : 1; + bool is_pasted_text : 1; + uint64_t offset; + const char *name; }; struct ac_rtld_part { - Elf *elf; - struct ac_rtld_section *sections; - unsigned num_sections; + Elf *elf; + struct ac_rtld_section *sections; + unsigned num_sections; }; static void report_erroraf(const char *fmt, va_list va) { - char *msg; - int ret = vasprintf(&msg, fmt, va); - if (ret < 0) - msg = "(vasprintf failed)"; + char *msg; + int ret = vasprintf(&msg, fmt, va); + if (ret < 0) + msg = "(vasprintf failed)"; - fprintf(stderr, "ac_rtld error: %s\n", msg); + fprintf(stderr, "ac_rtld error: %s\n", msg); - if (ret >= 0) - free(msg); + if (ret >= 0) + free(msg); } static void report_errorf(const char *fmt, ...) PRINTFLIKE(1, 2); static void report_errorf(const char *fmt, ...) { - va_list va; - va_start(va, fmt); - report_erroraf(fmt, va); - va_end(va); + va_list va; + va_start(va, fmt); + report_erroraf(fmt, va); + va_end(va); } static void report_elf_errorf(const char *fmt, ...) PRINTFLIKE(1, 2); static void report_elf_errorf(const char *fmt, ...) { - va_list va; - va_start(va, fmt); - report_erroraf(fmt, va); - va_end(va); + va_list va; + va_start(va, fmt); + report_erroraf(fmt, va); + va_end(va); - fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno())); + fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno())); } /** @@ -119,54 +119,53 @@ static void report_elf_errorf(const char *fmt, ...) * \p part_idx. */ static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols, - const char *name, unsigned part_idx) + const char *name, unsigned part_idx) { - util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) { - if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) && - !strcmp(name, symbol->name)) - return symbol; - } - return 0; + util_dynarray_foreach (symbols, struct ac_rtld_symbol, symbol) { + if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) && !strcmp(name, symbol->name)) + return symbol; + } + return 0; } static int compare_symbol_by_align(const void *lhsp, const void *rhsp) { - const struct ac_rtld_symbol *lhs = lhsp; - const struct ac_rtld_symbol *rhs = rhsp; - if (rhs->align > lhs->align) - return 1; - if (rhs->align < lhs->align) - return -1; - return 0; + const struct ac_rtld_symbol *lhs = lhsp; + const struct ac_rtld_symbol *rhs = rhsp; + if (rhs->align > lhs->align) + return 1; + if (rhs->align < lhs->align) + return -1; + return 0; } /** * Sort the given symbol list by decreasing alignment and assign offsets. 
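 * Sorting by decreasing alignment keeps padding small: e.g. symbols with
 * (size, align) = (16, 16), (4, 4) and (1, 1) land at offsets 0, 16 and 20,
 * for a total of 21 bytes.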
*/ static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols, - uint64_t *ptotal_size) + uint64_t *ptotal_size) { - qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align); + qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align); - uint64_t total_size = *ptotal_size; + uint64_t total_size = *ptotal_size; - for (unsigned i = 0; i < num_symbols; ++i) { - struct ac_rtld_symbol *s = &symbols[i]; - assert(util_is_power_of_two_nonzero(s->align)); + for (unsigned i = 0; i < num_symbols; ++i) { + struct ac_rtld_symbol *s = &symbols[i]; + assert(util_is_power_of_two_nonzero(s->align)); - total_size = align64(total_size, s->align); - s->offset = total_size; + total_size = align64(total_size, s->align); + s->offset = total_size; - if (total_size + s->size < total_size) { - report_errorf("%s: size overflow", __FUNCTION__); - return false; - } + if (total_size + s->size < total_size) { + report_errorf("%s: size overflow", __FUNCTION__); + return false; + } - total_size += s->size; - } + total_size += s->size; + } - *ptotal_size = total_size; - return true; + *ptotal_size = total_size; + return true; } /** @@ -175,71 +174,68 @@ static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols, * * Shared LDS symbols are filtered out. */ -static bool read_private_lds_symbols(struct ac_rtld_binary *binary, - unsigned part_idx, - Elf_Scn *section, - uint32_t *lds_end_align) +static bool read_private_lds_symbols(struct ac_rtld_binary *binary, unsigned part_idx, + Elf_Scn *section, uint32_t *lds_end_align) { -#define report_if(cond) \ - do { \ - if ((cond)) { \ - report_errorf(#cond); \ - return false; \ - } \ - } while (false) -#define report_elf_if(cond) \ - do { \ - if ((cond)) { \ - report_elf_errorf(#cond); \ - return false; \ - } \ - } while (false) - - struct ac_rtld_part *part = &binary->parts[part_idx]; - Elf64_Shdr *shdr = elf64_getshdr(section); - uint32_t strtabidx = shdr->sh_link; - Elf_Data *symbols_data = elf_getdata(section, NULL); - report_elf_if(!symbols_data); - - const Elf64_Sym *symbol = symbols_data->d_buf; - size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym); - - for (size_t j = 0; j < num_symbols; ++j, ++symbol) { - struct ac_rtld_symbol s = {}; - - if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) { - /* old-style LDS symbols from initial prototype -- remove eventually */ - s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16); - } else if (symbol->st_shndx == SHN_AMDGPU_LDS) { - s.align = MIN2(symbol->st_value, 1u << 16); - report_if(!util_is_power_of_two_nonzero(s.align)); - } else - continue; - - report_if(symbol->st_size > 1u << 29); - - s.name = elf_strptr(part->elf, strtabidx, symbol->st_name); - s.size = symbol->st_size; - s.part_idx = part_idx; - - if (!strcmp(s.name, "__lds_end")) { - report_elf_if(s.size != 0); - *lds_end_align = MAX2(*lds_end_align, s.align); - continue; - } - - const struct ac_rtld_symbol *shared = - find_symbol(&binary->lds_symbols, s.name, part_idx); - if (shared) { - report_elf_if(s.align > shared->align); - report_elf_if(s.size > shared->size); - continue; - } - - util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s); - } - - return true; +#define report_if(cond) \ + do { \ + if ((cond)) { \ + report_errorf(#cond); \ + return false; \ + } \ + } while (false) +#define report_elf_if(cond) \ + do { \ + if ((cond)) { \ + report_elf_errorf(#cond); \ + return false; \ + } \ + } while (false) + + struct ac_rtld_part *part = &binary->parts[part_idx]; + 
Elf64_Shdr *shdr = elf64_getshdr(section); + uint32_t strtabidx = shdr->sh_link; + Elf_Data *symbols_data = elf_getdata(section, NULL); + report_elf_if(!symbols_data); + + const Elf64_Sym *symbol = symbols_data->d_buf; + size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym); + + for (size_t j = 0; j < num_symbols; ++j, ++symbol) { + struct ac_rtld_symbol s = {}; + + if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) { + /* old-style LDS symbols from initial prototype -- remove eventually */ + s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16); + } else if (symbol->st_shndx == SHN_AMDGPU_LDS) { + s.align = MIN2(symbol->st_value, 1u << 16); + report_if(!util_is_power_of_two_nonzero(s.align)); + } else + continue; + + report_if(symbol->st_size > 1u << 29); + + s.name = elf_strptr(part->elf, strtabidx, symbol->st_name); + s.size = symbol->st_size; + s.part_idx = part_idx; + + if (!strcmp(s.name, "__lds_end")) { + report_elf_if(s.size != 0); + *lds_end_align = MAX2(*lds_end_align, s.align); + continue; + } + + const struct ac_rtld_symbol *shared = find_symbol(&binary->lds_symbols, s.name, part_idx); + if (shared) { + report_elf_if(s.align > shared->align); + report_elf_if(s.size > shared->size); + continue; + } + + util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s); + } + + return true; #undef report_if #undef report_elf_if @@ -251,486 +247,476 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary, * \param binary the uninitialized struct * \param i binary opening parameters */ -bool ac_rtld_open(struct ac_rtld_binary *binary, - struct ac_rtld_open_info i) +bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i) { - /* One of the libelf implementations - * (http://www.mr511.de/software/english.htm) requires calling - * elf_version() before elf_memory(). - */ - elf_version(EV_CURRENT); - - memset(binary, 0, sizeof(*binary)); - memcpy(&binary->options, &i.options, sizeof(binary->options)); - binary->wave_size = i.wave_size; - binary->num_parts = i.num_parts; - binary->parts = calloc(sizeof(*binary->parts), i.num_parts); - if (!binary->parts) - return false; - - uint64_t pasted_text_size = 0; - uint64_t rx_align = 1; - uint64_t rx_size = 0; - uint64_t exec_size = 0; - -#define report_if(cond) \ - do { \ - if ((cond)) { \ - report_errorf(#cond); \ - goto fail; \ - } \ - } while (false) -#define report_elf_if(cond) \ - do { \ - if ((cond)) { \ - report_elf_errorf(#cond); \ - goto fail; \ - } \ - } while (false) - - /* Copy and layout shared LDS symbols. 
*/ - if (i.num_shared_lds_symbols) { - if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol, - i.num_shared_lds_symbols)) - goto fail; - - memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size); - } - - util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol) - symbol->part_idx = ~0u; - - unsigned max_lds_size = 64 * 1024; - - if (i.info->chip_class == GFX6 || - (i.shader_type != MESA_SHADER_COMPUTE && - i.shader_type != MESA_SHADER_FRAGMENT)) - max_lds_size = 32 * 1024; - - uint64_t shared_lds_size = 0; - if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size)) - goto fail; - - if (shared_lds_size > max_lds_size) { - fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n", - (unsigned)shared_lds_size, max_lds_size); - goto fail; - } - binary->lds_size = shared_lds_size; - - /* First pass over all parts: open ELFs, pre-determine the placement of - * sections in the memory image, and collect and layout private LDS symbols. */ - uint32_t lds_end_align = 0; - - if (binary->options.halt_at_entry) - pasted_text_size += 4; - - for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) { - struct ac_rtld_part *part = &binary->parts[part_idx]; - unsigned part_lds_symbols_begin = - util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol); - - part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]); - report_elf_if(!part->elf); - - const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf); - report_elf_if(!ehdr); - report_if(ehdr->e_machine != MY_EM_AMDGPU); - - size_t section_str_index; - size_t num_shdrs; - report_elf_if(elf_getshdrstrndx(part->elf, §ion_str_index) < 0); - report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0); - - part->num_sections = num_shdrs; - part->sections = calloc(sizeof(*part->sections), num_shdrs); - report_if(!part->sections); - - Elf_Scn *section = NULL; - while ((section = elf_nextscn(part->elf, section))) { - Elf64_Shdr *shdr = elf64_getshdr(section); - struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)]; - s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name); - report_elf_if(!s->name); - - /* Cannot actually handle linked objects yet */ - report_elf_if(shdr->sh_addr != 0); - - /* Alignment must be 0 or a power of two */ - report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1)); - uint64_t sh_align = MAX2(shdr->sh_addralign, 1); - - if (shdr->sh_flags & SHF_ALLOC && - shdr->sh_type != SHT_NOTE) { - report_if(shdr->sh_flags & SHF_WRITE); - - s->is_rx = true; - - if (shdr->sh_flags & SHF_EXECINSTR) { - report_elf_if(shdr->sh_size & 3); - - if (!strcmp(s->name, ".text")) - s->is_pasted_text = true; - - exec_size += shdr->sh_size; - } - - if (s->is_pasted_text) { - s->offset = pasted_text_size; - pasted_text_size += shdr->sh_size; - } else { - rx_align = align(rx_align, sh_align); - rx_size = align(rx_size, sh_align); - s->offset = rx_size; - rx_size += shdr->sh_size; - } - } else if (shdr->sh_type == SHT_SYMTAB) { - if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align)) - goto fail; - } - } - - uint64_t part_lds_size = shared_lds_size; - if (!layout_symbols( - util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin), - util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin, - &part_lds_size)) - goto fail; - binary->lds_size = MAX2(binary->lds_size, part_lds_size); - } - - binary->rx_end_markers = 
pasted_text_size; - pasted_text_size += 4 * DEBUGGER_NUM_MARKERS; - - /* __lds_end is a special symbol that points at the end of the memory - * occupied by other LDS symbols. Its alignment is taken as the - * maximum of its alignment over all shader parts where it occurs. - */ - if (lds_end_align) { - binary->lds_size = align(binary->lds_size, lds_end_align); - - struct ac_rtld_symbol *lds_end = - util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1); - lds_end->name = "__lds_end"; - lds_end->size = 0; - lds_end->align = lds_end_align; - lds_end->offset = binary->lds_size; - lds_end->part_idx = ~0u; - } - - if (binary->lds_size > max_lds_size) { - fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n", - (unsigned)binary->lds_size, max_lds_size); - goto fail; - } - - /* Second pass: Adjust offsets of non-pasted text sections. */ - binary->rx_size = pasted_text_size; - binary->rx_size = align(binary->rx_size, rx_align); - - for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) { - struct ac_rtld_part *part = &binary->parts[part_idx]; - size_t num_shdrs; - elf_getshdrnum(part->elf, &num_shdrs); - - for (unsigned j = 0; j < num_shdrs; ++j) { - struct ac_rtld_section *s = &part->sections[j]; - if (s->is_rx && !s->is_pasted_text) - s->offset += binary->rx_size; - } - } - - binary->rx_size += rx_size; - binary->exec_size = exec_size; - - if (i.info->chip_class >= GFX10) { - /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords - * ahead of the PC, configurable by SH_MEM_CONFIG and - * S_INST_PREFETCH. This can cause two issues: - * - * (1) Crossing a page boundary to an unmapped page. The logic - * does not distinguish between a required fetch and a "mere" - * prefetch and will fault. - * - * (2) Prefetching instructions that will be changed for a - * different shader. - * - * (2) is not currently an issue because we flush the I$ at IB - * boundaries, but (1) needs to be addressed. Due to buffer - * suballocation, we just play it safe. - */ - binary->rx_size = align(binary->rx_size + 3 * 64, 64); - } - - return true; + /* One of the libelf implementations + * (http://www.mr511.de/software/english.htm) requires calling + * elf_version() before elf_memory(). + */ + elf_version(EV_CURRENT); + + memset(binary, 0, sizeof(*binary)); + memcpy(&binary->options, &i.options, sizeof(binary->options)); + binary->wave_size = i.wave_size; + binary->num_parts = i.num_parts; + binary->parts = calloc(sizeof(*binary->parts), i.num_parts); + if (!binary->parts) + return false; + + uint64_t pasted_text_size = 0; + uint64_t rx_align = 1; + uint64_t rx_size = 0; + uint64_t exec_size = 0; + +#define report_if(cond) \ + do { \ + if ((cond)) { \ + report_errorf(#cond); \ + goto fail; \ + } \ + } while (false) +#define report_elf_if(cond) \ + do { \ + if ((cond)) { \ + report_elf_errorf(#cond); \ + goto fail; \ + } \ + } while (false) + + /* Copy and layout shared LDS symbols. 
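+    * (each one is tagged part_idx = ~0u below, so find_symbol matches it
+    * from any part)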
*/ + if (i.num_shared_lds_symbols) { + if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol, + i.num_shared_lds_symbols)) + goto fail; + + memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size); + } + + util_dynarray_foreach (&binary->lds_symbols, struct ac_rtld_symbol, symbol) + symbol->part_idx = ~0u; + + unsigned max_lds_size = 64 * 1024; + + if (i.info->chip_class == GFX6 || + (i.shader_type != MESA_SHADER_COMPUTE && i.shader_type != MESA_SHADER_FRAGMENT)) + max_lds_size = 32 * 1024; + + uint64_t shared_lds_size = 0; + if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size)) + goto fail; + + if (shared_lds_size > max_lds_size) { + fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n", + (unsigned)shared_lds_size, max_lds_size); + goto fail; + } + binary->lds_size = shared_lds_size; + + /* First pass over all parts: open ELFs, pre-determine the placement of + * sections in the memory image, and collect and layout private LDS symbols. */ + uint32_t lds_end_align = 0; + + if (binary->options.halt_at_entry) + pasted_text_size += 4; + + for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) { + struct ac_rtld_part *part = &binary->parts[part_idx]; + unsigned part_lds_symbols_begin = + util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol); + + part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]); + report_elf_if(!part->elf); + + const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf); + report_elf_if(!ehdr); + report_if(ehdr->e_machine != MY_EM_AMDGPU); + + size_t section_str_index; + size_t num_shdrs; + report_elf_if(elf_getshdrstrndx(part->elf, §ion_str_index) < 0); + report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0); + + part->num_sections = num_shdrs; + part->sections = calloc(sizeof(*part->sections), num_shdrs); + report_if(!part->sections); + + Elf_Scn *section = NULL; + while ((section = elf_nextscn(part->elf, section))) { + Elf64_Shdr *shdr = elf64_getshdr(section); + struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)]; + s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name); + report_elf_if(!s->name); + + /* Cannot actually handle linked objects yet */ + report_elf_if(shdr->sh_addr != 0); + + /* Alignment must be 0 or a power of two */ + report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1)); + uint64_t sh_align = MAX2(shdr->sh_addralign, 1); + + if (shdr->sh_flags & SHF_ALLOC && shdr->sh_type != SHT_NOTE) { + report_if(shdr->sh_flags & SHF_WRITE); + + s->is_rx = true; + + if (shdr->sh_flags & SHF_EXECINSTR) { + report_elf_if(shdr->sh_size & 3); + + if (!strcmp(s->name, ".text")) + s->is_pasted_text = true; + + exec_size += shdr->sh_size; + } + + if (s->is_pasted_text) { + s->offset = pasted_text_size; + pasted_text_size += shdr->sh_size; + } else { + rx_align = align(rx_align, sh_align); + rx_size = align(rx_size, sh_align); + s->offset = rx_size; + rx_size += shdr->sh_size; + } + } else if (shdr->sh_type == SHT_SYMTAB) { + if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align)) + goto fail; + } + } + + uint64_t part_lds_size = shared_lds_size; + if (!layout_symbols(util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, + part_lds_symbols_begin), + util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - + part_lds_symbols_begin, + &part_lds_size)) + goto fail; + binary->lds_size = MAX2(binary->lds_size, part_lds_size); + } + + binary->rx_end_markers = 
pasted_text_size; + pasted_text_size += 4 * DEBUGGER_NUM_MARKERS; + + /* __lds_end is a special symbol that points at the end of the memory + * occupied by other LDS symbols. Its alignment is taken as the + * maximum of its alignment over all shader parts where it occurs. + */ + if (lds_end_align) { + binary->lds_size = align(binary->lds_size, lds_end_align); + + struct ac_rtld_symbol *lds_end = + util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1); + lds_end->name = "__lds_end"; + lds_end->size = 0; + lds_end->align = lds_end_align; + lds_end->offset = binary->lds_size; + lds_end->part_idx = ~0u; + } + + if (binary->lds_size > max_lds_size) { + fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n", + (unsigned)binary->lds_size, max_lds_size); + goto fail; + } + + /* Second pass: Adjust offsets of non-pasted text sections. */ + binary->rx_size = pasted_text_size; + binary->rx_size = align(binary->rx_size, rx_align); + + for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) { + struct ac_rtld_part *part = &binary->parts[part_idx]; + size_t num_shdrs; + elf_getshdrnum(part->elf, &num_shdrs); + + for (unsigned j = 0; j < num_shdrs; ++j) { + struct ac_rtld_section *s = &part->sections[j]; + if (s->is_rx && !s->is_pasted_text) + s->offset += binary->rx_size; + } + } + + binary->rx_size += rx_size; + binary->exec_size = exec_size; + + if (i.info->chip_class >= GFX10) { + /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords + * ahead of the PC, configurable by SH_MEM_CONFIG and + * S_INST_PREFETCH. This can cause two issues: + * + * (1) Crossing a page boundary to an unmapped page. The logic + * does not distinguish between a required fetch and a "mere" + * prefetch and will fault. + * + * (2) Prefetching instructions that will be changed for a + * different shader. + * + * (2) is not currently an issue because we flush the I$ at IB + * boundaries, but (1) needs to be addressed. Due to buffer + * suballocation, we just play it safe. 
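+    *
+    * (Concretely: 3 cache lines * 16 dwords * 4 bytes = 192 bytes of
+    * prefetch slack, which is where the "3 * 64" pad below comes from,
+    * rounded up to a whole 64-byte cache line.)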
+ */ + binary->rx_size = align(binary->rx_size + 3 * 64, 64); + } + + return true; #undef report_if #undef report_elf_if fail: - ac_rtld_close(binary); - return false; + ac_rtld_close(binary); + return false; } void ac_rtld_close(struct ac_rtld_binary *binary) { - for (unsigned i = 0; i < binary->num_parts; ++i) { - struct ac_rtld_part *part = &binary->parts[i]; - free(part->sections); - elf_end(part->elf); - } - - util_dynarray_fini(&binary->lds_symbols); - free(binary->parts); - binary->parts = NULL; - binary->num_parts = 0; + for (unsigned i = 0; i < binary->num_parts; ++i) { + struct ac_rtld_part *part = &binary->parts[i]; + free(part->sections); + elf_end(part->elf); + } + + util_dynarray_fini(&binary->lds_symbols); + free(binary->parts); + binary->parts = NULL; + binary->num_parts = 0; } -static bool get_section_by_name(struct ac_rtld_part *part, const char *name, - const char **data, size_t *nbytes) +static bool get_section_by_name(struct ac_rtld_part *part, const char *name, const char **data, + size_t *nbytes) { - for (unsigned i = 0; i < part->num_sections; ++i) { - struct ac_rtld_section *s = &part->sections[i]; - if (s->name && !strcmp(name, s->name)) { - Elf_Scn *target_scn = elf_getscn(part->elf, i); - Elf_Data *target_data = elf_getdata(target_scn, NULL); - if (!target_data) { - report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata"); - return false; - } - - *data = target_data->d_buf; - *nbytes = target_data->d_size; - return true; - } - } - return false; + for (unsigned i = 0; i < part->num_sections; ++i) { + struct ac_rtld_section *s = &part->sections[i]; + if (s->name && !strcmp(name, s->name)) { + Elf_Scn *target_scn = elf_getscn(part->elf, i); + Elf_Data *target_data = elf_getdata(target_scn, NULL); + if (!target_data) { + report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata"); + return false; + } + + *data = target_data->d_buf; + *nbytes = target_data->d_size; + return true; + } + } + return false; } -bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, - const char **data, size_t *nbytes) +bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data, + size_t *nbytes) { - assert(binary->num_parts == 1); - return get_section_by_name(&binary->parts[0], name, data, nbytes); + assert(binary->num_parts == 1); + return get_section_by_name(&binary->parts[0], name, data, nbytes); } -bool ac_rtld_read_config(const struct radeon_info *info, - struct ac_rtld_binary *binary, - struct ac_shader_config *config) +bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary, + struct ac_shader_config *config) { - for (unsigned i = 0; i < binary->num_parts; ++i) { - struct ac_rtld_part *part = &binary->parts[i]; - const char *config_data; - size_t config_nbytes; - - if (!get_section_by_name(part, ".AMDGPU.config", - &config_data, &config_nbytes)) - return false; - - /* TODO: be precise about scratch use? 
*/ - struct ac_shader_config c = {}; - ac_parse_shader_binary_config(config_data, config_nbytes, - binary->wave_size, true, info, &c); - - config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs); - config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs); - config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs); - config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs); - config->scratch_bytes_per_wave = MAX2(config->scratch_bytes_per_wave, - c.scratch_bytes_per_wave); - - assert(i == 0 || config->float_mode == c.float_mode); - config->float_mode = c.float_mode; - - /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from - * the main shader part is used. */ - assert(config->spi_ps_input_ena == 0 && - config->spi_ps_input_addr == 0); - config->spi_ps_input_ena = c.spi_ps_input_ena; - config->spi_ps_input_addr = c.spi_ps_input_addr; - - /* TODO: consistently use LDS symbols for this */ - config->lds_size = MAX2(config->lds_size, c.lds_size); - - /* TODO: Should we combine these somehow? It's currently only - * used for radeonsi's compute, where multiple parts aren't used. */ - assert(config->rsrc1 == 0 && config->rsrc2 == 0); - config->rsrc1 = c.rsrc1; - config->rsrc2 = c.rsrc2; - } - - return true; + for (unsigned i = 0; i < binary->num_parts; ++i) { + struct ac_rtld_part *part = &binary->parts[i]; + const char *config_data; + size_t config_nbytes; + + if (!get_section_by_name(part, ".AMDGPU.config", &config_data, &config_nbytes)) + return false; + + /* TODO: be precise about scratch use? */ + struct ac_shader_config c = {}; + ac_parse_shader_binary_config(config_data, config_nbytes, binary->wave_size, true, info, &c); + + config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs); + config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs); + config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs); + config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs); + config->scratch_bytes_per_wave = + MAX2(config->scratch_bytes_per_wave, c.scratch_bytes_per_wave); + + assert(i == 0 || config->float_mode == c.float_mode); + config->float_mode = c.float_mode; + + /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from + * the main shader part is used. */ + assert(config->spi_ps_input_ena == 0 && config->spi_ps_input_addr == 0); + config->spi_ps_input_ena = c.spi_ps_input_ena; + config->spi_ps_input_addr = c.spi_ps_input_addr; + + /* TODO: consistently use LDS symbols for this */ + config->lds_size = MAX2(config->lds_size, c.lds_size); + + /* TODO: Should we combine these somehow? It's currently only + * used for radeonsi's compute, where multiple parts aren't used. */ + assert(config->rsrc1 == 0 && config->rsrc2 == 0); + config->rsrc1 = c.rsrc1; + config->rsrc2 = c.rsrc2; + } + + return true; } -static bool resolve_symbol(const struct ac_rtld_upload_info *u, - unsigned part_idx, const Elf64_Sym *sym, - const char *name, uint64_t *value) +static bool resolve_symbol(const struct ac_rtld_upload_info *u, unsigned part_idx, + const Elf64_Sym *sym, const char *name, uint64_t *value) { - /* TODO: properly disentangle the undef and the LDS cases once - * STT_AMDGPU_LDS is retired. 
*/ - if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) { - const struct ac_rtld_symbol *lds_sym = - find_symbol(&u->binary->lds_symbols, name, part_idx); - - if (lds_sym) { - *value = lds_sym->offset; - return true; - } - - /* TODO: resolve from other parts */ - - if (u->get_external_symbol(u->cb_data, name, value)) - return true; - - report_errorf("symbol %s: unknown", name); - return false; - } - - struct ac_rtld_part *part = &u->binary->parts[part_idx]; - if (sym->st_shndx >= part->num_sections) { - report_errorf("symbol %s: section out of bounds", name); - return false; - } - - struct ac_rtld_section *s = &part->sections[sym->st_shndx]; - if (!s->is_rx) { - report_errorf("symbol %s: bad section", name); - return false; - } - - uint64_t section_base = u->rx_va + s->offset; - - *value = section_base + sym->st_value; - return true; + /* TODO: properly disentangle the undef and the LDS cases once + * STT_AMDGPU_LDS is retired. */ + if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) { + const struct ac_rtld_symbol *lds_sym = find_symbol(&u->binary->lds_symbols, name, part_idx); + + if (lds_sym) { + *value = lds_sym->offset; + return true; + } + + /* TODO: resolve from other parts */ + + if (u->get_external_symbol(u->cb_data, name, value)) + return true; + + report_errorf("symbol %s: unknown", name); + return false; + } + + struct ac_rtld_part *part = &u->binary->parts[part_idx]; + if (sym->st_shndx >= part->num_sections) { + report_errorf("symbol %s: section out of bounds", name); + return false; + } + + struct ac_rtld_section *s = &part->sections[sym->st_shndx]; + if (!s->is_rx) { + report_errorf("symbol %s: bad section", name); + return false; + } + + uint64_t section_base = u->rx_va + s->offset; + + *value = section_base + sym->st_value; + return true; } -static bool apply_relocs(const struct ac_rtld_upload_info *u, - unsigned part_idx, const Elf64_Shdr *reloc_shdr, - const Elf_Data *reloc_data) +static bool apply_relocs(const struct ac_rtld_upload_info *u, unsigned part_idx, + const Elf64_Shdr *reloc_shdr, const Elf_Data *reloc_data) { -#define report_if(cond) \ - do { \ - if ((cond)) { \ - report_errorf(#cond); \ - return false; \ - } \ - } while (false) -#define report_elf_if(cond) \ - do { \ - if ((cond)) { \ - report_elf_errorf(#cond); \ - return false; \ - } \ - } while (false) - - struct ac_rtld_part *part = &u->binary->parts[part_idx]; - Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info); - report_elf_if(!target_scn); - - Elf_Data *target_data = elf_getdata(target_scn, NULL); - report_elf_if(!target_data); - - Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link); - report_elf_if(!symbols_scn); - - Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn); - report_elf_if(!symbols_shdr); - uint32_t strtabidx = symbols_shdr->sh_link; - - Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL); - report_elf_if(!symbols_data); - - const Elf64_Sym *symbols = symbols_data->d_buf; - size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym); - - struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info]; - report_if(!s->is_rx); - - const char *orig_base = target_data->d_buf; - char *dst_base = u->rx_ptr + s->offset; - uint64_t va_base = u->rx_va + s->offset; - - Elf64_Rel *rel = reloc_data->d_buf; - size_t num_relocs = reloc_data->d_size / sizeof(*rel); - for (size_t i = 0; i < num_relocs; ++i, ++rel) { - size_t r_sym = ELF64_R_SYM(rel->r_info); - unsigned r_type = ELF64_R_TYPE(rel->r_info); - - const char *orig_ptr = 
orig_base + rel->r_offset; - char *dst_ptr = dst_base + rel->r_offset; - uint64_t va = va_base + rel->r_offset; - - uint64_t symbol; - uint64_t addend; - - if (r_sym == STN_UNDEF) { - symbol = 0; - } else { - report_elf_if(r_sym >= num_symbols); - - const Elf64_Sym *sym = &symbols[r_sym]; - const char *symbol_name = - elf_strptr(part->elf, strtabidx, sym->st_name); - report_elf_if(!symbol_name); - - if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol)) - return false; - } - - /* TODO: Should we also support .rela sections, where the - * addend is part of the relocation record? */ - - /* Load the addend from the ELF instead of the destination, - * because the destination may be in VRAM. */ - switch (r_type) { - case R_AMDGPU_ABS32: - case R_AMDGPU_ABS32_LO: - case R_AMDGPU_ABS32_HI: - case R_AMDGPU_REL32: - case R_AMDGPU_REL32_LO: - case R_AMDGPU_REL32_HI: - addend = *(const uint32_t *)orig_ptr; - break; - case R_AMDGPU_ABS64: - case R_AMDGPU_REL64: - addend = *(const uint64_t *)orig_ptr; - break; - default: - report_errorf("unsupported r_type == %u", r_type); - return false; - } - - uint64_t abs = symbol + addend; - - switch (r_type) { - case R_AMDGPU_ABS32: - assert((uint32_t)abs == abs); - case R_AMDGPU_ABS32_LO: - *(uint32_t *)dst_ptr = util_cpu_to_le32(abs); - break; - case R_AMDGPU_ABS32_HI: - *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32); - break; - case R_AMDGPU_ABS64: - *(uint64_t *)dst_ptr = util_cpu_to_le64(abs); - break; - case R_AMDGPU_REL32: - assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va)); - case R_AMDGPU_REL32_LO: - *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va); - break; - case R_AMDGPU_REL32_HI: - *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32); - break; - case R_AMDGPU_REL64: - *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va); - break; - default: - unreachable("bad r_type"); - } - } - - return true; +#define report_if(cond) \ + do { \ + if ((cond)) { \ + report_errorf(#cond); \ + return false; \ + } \ + } while (false) +#define report_elf_if(cond) \ + do { \ + if ((cond)) { \ + report_elf_errorf(#cond); \ + return false; \ + } \ + } while (false) + + struct ac_rtld_part *part = &u->binary->parts[part_idx]; + Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info); + report_elf_if(!target_scn); + + Elf_Data *target_data = elf_getdata(target_scn, NULL); + report_elf_if(!target_data); + + Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link); + report_elf_if(!symbols_scn); + + Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn); + report_elf_if(!symbols_shdr); + uint32_t strtabidx = symbols_shdr->sh_link; + + Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL); + report_elf_if(!symbols_data); + + const Elf64_Sym *symbols = symbols_data->d_buf; + size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym); + + struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info]; + report_if(!s->is_rx); + + const char *orig_base = target_data->d_buf; + char *dst_base = u->rx_ptr + s->offset; + uint64_t va_base = u->rx_va + s->offset; + + Elf64_Rel *rel = reloc_data->d_buf; + size_t num_relocs = reloc_data->d_size / sizeof(*rel); + for (size_t i = 0; i < num_relocs; ++i, ++rel) { + size_t r_sym = ELF64_R_SYM(rel->r_info); + unsigned r_type = ELF64_R_TYPE(rel->r_info); + + const char *orig_ptr = orig_base + rel->r_offset; + char *dst_ptr = dst_base + rel->r_offset; + uint64_t va = va_base + rel->r_offset; + + uint64_t symbol; + uint64_t addend; + + if (r_sym == STN_UNDEF) { + symbol = 0; + } else { + 
report_elf_if(r_sym >= num_symbols); + + const Elf64_Sym *sym = &symbols[r_sym]; + const char *symbol_name = elf_strptr(part->elf, strtabidx, sym->st_name); + report_elf_if(!symbol_name); + + if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol)) + return false; + } + + /* TODO: Should we also support .rela sections, where the + * addend is part of the relocation record? */ + + /* Load the addend from the ELF instead of the destination, + * because the destination may be in VRAM. */ + switch (r_type) { + case R_AMDGPU_ABS32: + case R_AMDGPU_ABS32_LO: + case R_AMDGPU_ABS32_HI: + case R_AMDGPU_REL32: + case R_AMDGPU_REL32_LO: + case R_AMDGPU_REL32_HI: + addend = *(const uint32_t *)orig_ptr; + break; + case R_AMDGPU_ABS64: + case R_AMDGPU_REL64: + addend = *(const uint64_t *)orig_ptr; + break; + default: + report_errorf("unsupported r_type == %u", r_type); + return false; + } + + uint64_t abs = symbol + addend; + + switch (r_type) { + case R_AMDGPU_ABS32: + assert((uint32_t)abs == abs); + case R_AMDGPU_ABS32_LO: + *(uint32_t *)dst_ptr = util_cpu_to_le32(abs); + break; + case R_AMDGPU_ABS32_HI: + *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32); + break; + case R_AMDGPU_ABS64: + *(uint64_t *)dst_ptr = util_cpu_to_le64(abs); + break; + case R_AMDGPU_REL32: + assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va)); + case R_AMDGPU_REL32_LO: + *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va); + break; + case R_AMDGPU_REL32_HI: + *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32); + break; + case R_AMDGPU_REL64: + *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va); + break; + default: + unreachable("bad r_type"); + } + } + + return true; #undef report_if #undef report_elf_if @@ -742,72 +728,72 @@ static bool apply_relocs(const struct ac_rtld_upload_info *u, */ bool ac_rtld_upload(struct ac_rtld_upload_info *u) { -#define report_if(cond) \ - do { \ - if ((cond)) { \ - report_errorf(#cond); \ - return false; \ - } \ - } while (false) -#define report_elf_if(cond) \ - do { \ - if ((cond)) { \ - report_errorf(#cond); \ - return false; \ - } \ - } while (false) - - if (u->binary->options.halt_at_entry) { - /* s_sethalt 1 */ - *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001); - } - - /* First pass: upload raw section data and lay out private LDS symbols. */ - for (unsigned i = 0; i < u->binary->num_parts; ++i) { - struct ac_rtld_part *part = &u->binary->parts[i]; - - Elf_Scn *section = NULL; - while ((section = elf_nextscn(part->elf, section))) { - Elf64_Shdr *shdr = elf64_getshdr(section); - struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)]; - - if (!s->is_rx) - continue; - - report_if(shdr->sh_type != SHT_PROGBITS); - - Elf_Data *data = elf_getdata(section, NULL); - report_elf_if(!data || data->d_size != shdr->sh_size); - memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size); - } - } - - if (u->binary->rx_end_markers) { - uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers); - for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i) - *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER); - } - - /* Second pass: handle relocations, overwriting uploaded data where - * appropriate. 
*/ - for (unsigned i = 0; i < u->binary->num_parts; ++i) { - struct ac_rtld_part *part = &u->binary->parts[i]; - Elf_Scn *section = NULL; - while ((section = elf_nextscn(part->elf, section))) { - Elf64_Shdr *shdr = elf64_getshdr(section); - if (shdr->sh_type == SHT_REL) { - Elf_Data *relocs = elf_getdata(section, NULL); - report_elf_if(!relocs || relocs->d_size != shdr->sh_size); - if (!apply_relocs(u, i, shdr, relocs)) - return false; - } else if (shdr->sh_type == SHT_RELA) { - report_errorf("SHT_RELA not supported"); - return false; - } - } - } - - return true; +#define report_if(cond) \ + do { \ + if ((cond)) { \ + report_errorf(#cond); \ + return false; \ + } \ + } while (false) +#define report_elf_if(cond) \ + do { \ + if ((cond)) { \ + report_errorf(#cond); \ + return false; \ + } \ + } while (false) + + if (u->binary->options.halt_at_entry) { + /* s_sethalt 1 */ + *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001); + } + + /* First pass: upload raw section data and lay out private LDS symbols. */ + for (unsigned i = 0; i < u->binary->num_parts; ++i) { + struct ac_rtld_part *part = &u->binary->parts[i]; + + Elf_Scn *section = NULL; + while ((section = elf_nextscn(part->elf, section))) { + Elf64_Shdr *shdr = elf64_getshdr(section); + struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)]; + + if (!s->is_rx) + continue; + + report_if(shdr->sh_type != SHT_PROGBITS); + + Elf_Data *data = elf_getdata(section, NULL); + report_elf_if(!data || data->d_size != shdr->sh_size); + memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size); + } + } + + if (u->binary->rx_end_markers) { + uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers); + for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i) + *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER); + } + + /* Second pass: handle relocations, overwriting uploaded data where + * appropriate. */ + for (unsigned i = 0; i < u->binary->num_parts; ++i) { + struct ac_rtld_part *part = &u->binary->parts[i]; + Elf_Scn *section = NULL; + while ((section = elf_nextscn(part->elf, section))) { + Elf64_Shdr *shdr = elf64_getshdr(section); + if (shdr->sh_type == SHT_REL) { + Elf_Data *relocs = elf_getdata(section, NULL); + report_elf_if(!relocs || relocs->d_size != shdr->sh_size); + if (!apply_relocs(u, i, shdr, relocs)) + return false; + } else if (shdr->sh_type == SHT_RELA) { + report_errorf("SHT_RELA not supported"); + return false; + } + } + } + + return true; #undef report_if #undef report_elf_if diff --git a/src/amd/common/ac_rtld.h b/src/amd/common/ac_rtld.h index 2470a52..af03a85 100644 --- a/src/amd/common/ac_rtld.h +++ b/src/amd/common/ac_rtld.h @@ -24,12 +24,12 @@ #ifndef AC_RTLD_H #define AC_RTLD_H +#include "compiler/shader_enums.h" +#include "util/u_dynarray.h" + #include -#include #include - -#include "util/u_dynarray.h" -#include "compiler/shader_enums.h" +#include #ifdef __cplusplus extern "C" { @@ -40,37 +40,37 @@ struct ac_shader_config; struct radeon_info; struct ac_rtld_symbol { - const char *name; - uint32_t size; - uint32_t align; - uint64_t offset; /* filled in by ac_rtld_open */ - unsigned part_idx; /* shader part in which this symbol appears */ + const char *name; + uint32_t size; + uint32_t align; + uint64_t offset; /* filled in by ac_rtld_open */ + unsigned part_idx; /* shader part in which this symbol appears */ }; struct ac_rtld_options { - /* Loader will insert an s_sethalt 1 instruction as the - * first instruction. 
*/ - bool halt_at_entry:1; + /* Loader will insert an s_sethalt 1 instruction as the + * first instruction. */ + bool halt_at_entry : 1; }; /* Lightweight wrapper around underlying ELF objects. */ struct ac_rtld_binary { - struct ac_rtld_options options; - unsigned wave_size; + struct ac_rtld_options options; + unsigned wave_size; - /* Required buffer sizes, currently read/executable only. */ - uint64_t rx_size; + /* Required buffer sizes, currently read/executable only. */ + uint64_t rx_size; - /* Size of executable code, for reporting purposes. */ - uint64_t exec_size; + /* Size of executable code, for reporting purposes. */ + uint64_t exec_size; - uint64_t rx_end_markers; + uint64_t rx_end_markers; - unsigned num_parts; - struct ac_rtld_part *parts; + unsigned num_parts; + struct ac_rtld_part *parts; - struct util_dynarray lds_symbols; - uint32_t lds_size; + struct util_dynarray lds_symbols; + uint32_t lds_size; }; /** @@ -82,8 +82,7 @@ struct ac_rtld_binary { * \param value to be filled in by the callback * \return whether the symbol was found successfully */ -typedef bool (*ac_rtld_get_external_symbol_cb)( - void *cb_data, const char *symbol, uint64_t *value); +typedef bool (*ac_rtld_get_external_symbol_cb)(void *cb_data, const char *symbol, uint64_t *value); /** * Lifetimes of \ref info, in-memory ELF objects, and the names of @@ -91,50 +90,48 @@ typedef bool (*ac_rtld_get_external_symbol_cb)( * the opened binary. */ struct ac_rtld_open_info { - const struct radeon_info *info; - struct ac_rtld_options options; - gl_shader_stage shader_type; - unsigned wave_size; - - unsigned num_parts; - const char * const *elf_ptrs; /* in-memory ELF objects of each part */ - const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */ - - /* Shared LDS symbols are layouted such that they are accessible from - * all shader parts. Non-shared (private) LDS symbols of one part may - * overlap private LDS symbols of another shader part. - */ - unsigned num_shared_lds_symbols; - const struct ac_rtld_symbol *shared_lds_symbols; + const struct radeon_info *info; + struct ac_rtld_options options; + gl_shader_stage shader_type; + unsigned wave_size; + + unsigned num_parts; + const char *const *elf_ptrs; /* in-memory ELF objects of each part */ + const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */ + + /* Shared LDS symbols are layouted such that they are accessible from + * all shader parts. Non-shared (private) LDS symbols of one part may + * overlap private LDS symbols of another shader part. + */ + unsigned num_shared_lds_symbols; + const struct ac_rtld_symbol *shared_lds_symbols; }; -bool ac_rtld_open(struct ac_rtld_binary *binary, - struct ac_rtld_open_info i); +bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i); void ac_rtld_close(struct ac_rtld_binary *binary); -bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, - const char **data, size_t *nbytes); +bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data, + size_t *nbytes); -bool ac_rtld_read_config(const struct radeon_info *info, - struct ac_rtld_binary *binary, - struct ac_shader_config *config); +bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary, + struct ac_shader_config *config); struct ac_rtld_upload_info { - struct ac_rtld_binary *binary; + struct ac_rtld_binary *binary; - /** GPU mapping of the read/executable buffer. 
*/ - uint64_t rx_va; + /** GPU mapping of the read/executable buffer. */ + uint64_t rx_va; - /** CPU mapping of the read/executable buffer */ - char *rx_ptr; + /** CPU mapping of the read/executable buffer */ + char *rx_ptr; - /** Optional callback function that will be queried for symbols not - * defined in any of the binary's parts. */ - ac_rtld_get_external_symbol_cb get_external_symbol; + /** Optional callback function that will be queried for symbols not + * defined in any of the binary's parts. */ + ac_rtld_get_external_symbol_cb get_external_symbol; - /** Caller-defined data that will be passed to callback functions. */ - void *cb_data; + /** Caller-defined data that will be passed to callback functions. */ + void *cb_data; }; bool ac_rtld_upload(struct ac_rtld_upload_info *u); diff --git a/src/amd/common/ac_shader_args.c b/src/amd/common/ac_shader_args.c index d5600ea..d3816e1 100644 --- a/src/amd/common/ac_shader_args.c +++ b/src/amd/common/ac_shader_args.c @@ -22,34 +22,33 @@ */ #include "ac_shader_args.h" + #include "nir/nir_builder.h" -void -ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, - unsigned size, enum ac_arg_type type, struct ac_arg *arg) +void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned size, + enum ac_arg_type type, struct ac_arg *arg) { - assert(info->arg_count < AC_MAX_ARGS); + assert(info->arg_count < AC_MAX_ARGS); - unsigned offset; - if (regfile == AC_ARG_SGPR) { - offset = info->num_sgprs_used; - info->num_sgprs_used += size; - } else { - assert(regfile == AC_ARG_VGPR); - offset = info->num_vgprs_used; - info->num_vgprs_used += size; - } + unsigned offset; + if (regfile == AC_ARG_SGPR) { + offset = info->num_sgprs_used; + info->num_sgprs_used += size; + } else { + assert(regfile == AC_ARG_VGPR); + offset = info->num_vgprs_used; + info->num_vgprs_used += size; + } - info->args[info->arg_count].file = regfile; - info->args[info->arg_count].offset = offset; - info->args[info->arg_count].size = size; - info->args[info->arg_count].type = type; + info->args[info->arg_count].file = regfile; + info->args[info->arg_count].offset = offset; + info->args[info->arg_count].size = size; + info->args[info->arg_count].type = type; - if (arg) { - arg->arg_index = info->arg_count; - arg->used = true; - } + if (arg) { + arg->arg_index = info->arg_count; + arg->used = true; + } - info->arg_count++; + info->arg_count++; } - diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index 90798c6..c3f4042 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -24,91 +24,90 @@ #ifndef AC_SHADER_ARGS_H #define AC_SHADER_ARGS_H -#include #include +#include #define AC_MAX_INLINE_PUSH_CONSTS 8 -enum ac_arg_regfile { - AC_ARG_SGPR, - AC_ARG_VGPR, +enum ac_arg_regfile +{ + AC_ARG_SGPR, + AC_ARG_VGPR, }; -enum ac_arg_type { - AC_ARG_FLOAT, - AC_ARG_INT, - AC_ARG_CONST_PTR, /* Pointer to i8 array */ - AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */ - AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */ - AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */ - AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */ +enum ac_arg_type +{ + AC_ARG_FLOAT, + AC_ARG_INT, + AC_ARG_CONST_PTR, /* Pointer to i8 array */ + AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */ + AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */ + AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */ + AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */ }; struct ac_arg { - uint8_t arg_index; - bool used; + 
uint8_t arg_index; + bool used; }; - #define AC_MAX_ARGS 128 struct ac_shader_args { - /* Info on how to declare arguments */ - struct { - enum ac_arg_type type; - enum ac_arg_regfile file; - uint8_t offset; - uint8_t size; - bool skip; - } args[AC_MAX_ARGS]; - - uint8_t arg_count; - uint8_t sgpr_count; - uint8_t num_sgprs_used; - uint8_t num_vgprs_used; - - struct ac_arg base_vertex; - struct ac_arg start_instance; - struct ac_arg draw_id; - struct ac_arg vertex_id; - struct ac_arg instance_id; - struct ac_arg tcs_patch_id; - struct ac_arg tcs_rel_ids; - struct ac_arg tes_patch_id; - struct ac_arg gs_prim_id; - struct ac_arg gs_invocation_id; - - /* PS */ - struct ac_arg frag_pos[4]; - struct ac_arg front_face; - struct ac_arg ancillary; - struct ac_arg sample_coverage; - struct ac_arg prim_mask; - struct ac_arg persp_sample; - struct ac_arg persp_center; - struct ac_arg persp_centroid; - struct ac_arg pull_model; - struct ac_arg linear_sample; - struct ac_arg linear_center; - struct ac_arg linear_centroid; - - /* CS */ - struct ac_arg local_invocation_ids; - struct ac_arg num_work_groups; - struct ac_arg workgroup_ids[3]; - struct ac_arg tg_size; - - /* Vulkan only */ - struct ac_arg push_constants; - struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; - unsigned num_inline_push_consts; - unsigned base_inline_push_consts; - struct ac_arg view_index; + /* Info on how to declare arguments */ + struct { + enum ac_arg_type type; + enum ac_arg_regfile file; + uint8_t offset; + uint8_t size; + bool skip; + } args[AC_MAX_ARGS]; + + uint8_t arg_count; + uint8_t sgpr_count; + uint8_t num_sgprs_used; + uint8_t num_vgprs_used; + + struct ac_arg base_vertex; + struct ac_arg start_instance; + struct ac_arg draw_id; + struct ac_arg vertex_id; + struct ac_arg instance_id; + struct ac_arg tcs_patch_id; + struct ac_arg tcs_rel_ids; + struct ac_arg tes_patch_id; + struct ac_arg gs_prim_id; + struct ac_arg gs_invocation_id; + + /* PS */ + struct ac_arg frag_pos[4]; + struct ac_arg front_face; + struct ac_arg ancillary; + struct ac_arg sample_coverage; + struct ac_arg prim_mask; + struct ac_arg persp_sample; + struct ac_arg persp_center; + struct ac_arg persp_centroid; + struct ac_arg pull_model; + struct ac_arg linear_sample; + struct ac_arg linear_center; + struct ac_arg linear_centroid; + + /* CS */ + struct ac_arg local_invocation_ids; + struct ac_arg num_work_groups; + struct ac_arg workgroup_ids[3]; + struct ac_arg tg_size; + + /* Vulkan only */ + struct ac_arg push_constants; + struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; + unsigned num_inline_push_consts; + unsigned base_inline_push_consts; + struct ac_arg view_index; }; -void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, - unsigned registers, enum ac_arg_type type, - struct ac_arg *arg); +void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers, + enum ac_arg_type type, struct ac_arg *arg); #endif - diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index d4ccf38..a57b5cac 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -21,277 +21,303 @@ * IN THE SOFTWARE. 
*/ +#include "ac_shader_util.h" + +#include "sid.h" + #include #include #include -#include "ac_shader_util.h" -#include "sid.h" - -unsigned -ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, - bool writes_samplemask) +unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask) { - if (writes_z) { - /* Z needs 32 bits. */ - if (writes_samplemask) - return V_028710_SPI_SHADER_32_ABGR; - else if (writes_stencil) - return V_028710_SPI_SHADER_32_GR; - else - return V_028710_SPI_SHADER_32_R; - } else if (writes_stencil || writes_samplemask) { - /* Both stencil and sample mask need only 16 bits. */ - return V_028710_SPI_SHADER_UINT16_ABGR; - } else { - return V_028710_SPI_SHADER_ZERO; - } + if (writes_z) { + /* Z needs 32 bits. */ + if (writes_samplemask) + return V_028710_SPI_SHADER_32_ABGR; + else if (writes_stencil) + return V_028710_SPI_SHADER_32_GR; + else + return V_028710_SPI_SHADER_32_R; + } else if (writes_stencil || writes_samplemask) { + /* Both stencil and sample mask need only 16 bits. */ + return V_028710_SPI_SHADER_UINT16_ABGR; + } else { + return V_028710_SPI_SHADER_ZERO; + } } -unsigned -ac_get_cb_shader_mask(unsigned spi_shader_col_format) +unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format) { - unsigned i, cb_shader_mask = 0; - - for (i = 0; i < 8; i++) { - switch ((spi_shader_col_format >> (i * 4)) & 0xf) { - case V_028714_SPI_SHADER_ZERO: - break; - case V_028714_SPI_SHADER_32_R: - cb_shader_mask |= 0x1 << (i * 4); - break; - case V_028714_SPI_SHADER_32_GR: - cb_shader_mask |= 0x3 << (i * 4); - break; - case V_028714_SPI_SHADER_32_AR: - cb_shader_mask |= 0x9u << (i * 4); - break; - case V_028714_SPI_SHADER_FP16_ABGR: - case V_028714_SPI_SHADER_UNORM16_ABGR: - case V_028714_SPI_SHADER_SNORM16_ABGR: - case V_028714_SPI_SHADER_UINT16_ABGR: - case V_028714_SPI_SHADER_SINT16_ABGR: - case V_028714_SPI_SHADER_32_ABGR: - cb_shader_mask |= 0xfu << (i * 4); - break; - default: - assert(0); - } - } - return cb_shader_mask; + unsigned i, cb_shader_mask = 0; + + for (i = 0; i < 8; i++) { + switch ((spi_shader_col_format >> (i * 4)) & 0xf) { + case V_028714_SPI_SHADER_ZERO: + break; + case V_028714_SPI_SHADER_32_R: + cb_shader_mask |= 0x1 << (i * 4); + break; + case V_028714_SPI_SHADER_32_GR: + cb_shader_mask |= 0x3 << (i * 4); + break; + case V_028714_SPI_SHADER_32_AR: + cb_shader_mask |= 0x9u << (i * 4); + break; + case V_028714_SPI_SHADER_FP16_ABGR: + case V_028714_SPI_SHADER_UNORM16_ABGR: + case V_028714_SPI_SHADER_SNORM16_ABGR: + case V_028714_SPI_SHADER_UINT16_ABGR: + case V_028714_SPI_SHADER_SINT16_ABGR: + case V_028714_SPI_SHADER_32_ABGR: + cb_shader_mask |= 0xfu << (i * 4); + break; + default: + assert(0); + } + } + return cb_shader_mask; } /** * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a * geometry shader. */ -uint32_t -ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class) +uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class) { - unsigned cut_mode; - - if (gs_max_vert_out <= 128) { - cut_mode = V_028A40_GS_CUT_128; - } else if (gs_max_vert_out <= 256) { - cut_mode = V_028A40_GS_CUT_256; - } else if (gs_max_vert_out <= 512) { - cut_mode = V_028A40_GS_CUT_512; - } else { - assert(gs_max_vert_out <= 1024); - cut_mode = V_028A40_GS_CUT_1024; - } - - return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | - S_028A40_CUT_MODE(cut_mode)| - S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | - S_028A40_GS_WRITE_OPTIMIZE(1) | - S_028A40_ONCHIP(chip_class >= GFX9 ? 
1 : 0); + unsigned cut_mode; + + if (gs_max_vert_out <= 128) { + cut_mode = V_028A40_GS_CUT_128; + } else if (gs_max_vert_out <= 256) { + cut_mode = V_028A40_GS_CUT_256; + } else if (gs_max_vert_out <= 512) { + cut_mode = V_028A40_GS_CUT_512; + } else { + assert(gs_max_vert_out <= 1024); + cut_mode = V_028A40_GS_CUT_1024; + } + + return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) | + S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) | + S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0); } /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format /// value for LLVM8+ tbuffer intrinsics. -unsigned -ac_get_tbuffer_format(enum chip_class chip_class, - unsigned dfmt, unsigned nfmt) +unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt) { - // Some games try to access vertex buffers without a valid format. - // This is a game bug, but we should still handle it gracefully. - if (dfmt == V_008F0C_IMG_FORMAT_INVALID) - return V_008F0C_IMG_FORMAT_INVALID; - - if (chip_class >= GFX10) { - unsigned format; - switch (dfmt) { - default: unreachable("bad dfmt"); - case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break; - case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break; - case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break; - } - - // Use the regularity properties of the combined format enum. - // - // Note: float is incompatible with 8-bit data formats, - // [us]{norm,scaled} are incomparible with 32-bit data formats. - // [us]scaled are not writable. - switch (nfmt) { - case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break; - case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break; - case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break; - case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break; - default: unreachable("bad nfmt"); - case V_008F0C_BUF_NUM_FORMAT_UINT: break; - case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break; - case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break; - } - - return format; - } else { - return dfmt | (nfmt << 4); - } + // Some games try to access vertex buffers without a valid format. + // This is a game bug, but we should still handle it gracefully. 
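+   // A worked example of the mapping below: on gfx10,
+   // dfmt = V_008F0C_BUF_DATA_FORMAT_16_16 first selects
+   // V_008F0C_IMG_FORMAT_16_16_UINT, and nfmt = V_008F0C_BUF_NUM_FORMAT_FLOAT
+   // then adds 2 to reach the FLOAT variant of the same channel layout;
+   // on gfx6-9 the two fields are simply packed as (dfmt | (nfmt << 4)).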
+ if (dfmt == V_008F0C_IMG_FORMAT_INVALID) + return V_008F0C_IMG_FORMAT_INVALID; + + if (chip_class >= GFX10) { + unsigned format; + switch (dfmt) { + default: + unreachable("bad dfmt"); + case V_008F0C_BUF_DATA_FORMAT_INVALID: + format = V_008F0C_IMG_FORMAT_INVALID; + break; + case V_008F0C_BUF_DATA_FORMAT_8: + format = V_008F0C_IMG_FORMAT_8_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_8_8: + format = V_008F0C_IMG_FORMAT_8_8_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: + format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_16: + format = V_008F0C_IMG_FORMAT_16_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_16_16: + format = V_008F0C_IMG_FORMAT_16_16_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: + format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_32: + format = V_008F0C_IMG_FORMAT_32_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_32_32: + format = V_008F0C_IMG_FORMAT_32_32_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32: + format = V_008F0C_IMG_FORMAT_32_32_32_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: + format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; + break; + case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: + format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; + break; + } + + // Use the regularity properties of the combined format enum. + // + // Note: float is incompatible with 8-bit data formats, + // [us]{norm,scaled} are incomparible with 32-bit data formats. + // [us]scaled are not writable. + switch (nfmt) { + case V_008F0C_BUF_NUM_FORMAT_UNORM: + format -= 4; + break; + case V_008F0C_BUF_NUM_FORMAT_SNORM: + format -= 3; + break; + case V_008F0C_BUF_NUM_FORMAT_USCALED: + format -= 2; + break; + case V_008F0C_BUF_NUM_FORMAT_SSCALED: + format -= 1; + break; + default: + unreachable("bad nfmt"); + case V_008F0C_BUF_NUM_FORMAT_UINT: + break; + case V_008F0C_BUF_NUM_FORMAT_SINT: + format += 1; + break; + case V_008F0C_BUF_NUM_FORMAT_FLOAT: + format += 2; + break; + } + + return format; + } else { + return dfmt | (nfmt << 4); + } } static const struct ac_data_format_info data_format_table[] = { - [V_008F0C_BUF_DATA_FORMAT_INVALID] = { 0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID }, - [V_008F0C_BUF_DATA_FORMAT_8] = { 1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8 }, - [V_008F0C_BUF_DATA_FORMAT_16] = { 2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16 }, - [V_008F0C_BUF_DATA_FORMAT_8_8] = { 2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8 }, - [V_008F0C_BUF_DATA_FORMAT_32] = { 4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32 }, - [V_008F0C_BUF_DATA_FORMAT_16_16] = { 4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16 }, - [V_008F0C_BUF_DATA_FORMAT_10_11_11] = { 4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11 }, - [V_008F0C_BUF_DATA_FORMAT_11_11_10] = { 4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10 }, - [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = { 4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2 }, - [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = { 4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10 }, - [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = { 4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8 }, - [V_008F0C_BUF_DATA_FORMAT_32_32] = { 8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32 }, - [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = { 8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16 }, - [V_008F0C_BUF_DATA_FORMAT_32_32_32] = { 12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32 }, - [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = { 16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32 }, + [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID}, + [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8}, + 
[V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16}, + [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8}, + [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32}, + [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16}, + [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11}, + [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10}, + [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2}, + [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10}, + [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8}, + [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32}, + [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16}, + [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32}, + [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32}, }; -const struct ac_data_format_info * -ac_get_data_format_info(unsigned dfmt) +const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt) { - assert(dfmt < ARRAY_SIZE(data_format_table)); - return &data_format_table[dfmt]; + assert(dfmt < ARRAY_SIZE(data_format_table)); + return &data_format_table[dfmt]; } -enum ac_image_dim -ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, - bool is_array) +enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, + bool is_array) { - switch (dim) { - case GLSL_SAMPLER_DIM_1D: - if (chip_class == GFX9) - return is_array ? ac_image_2darray : ac_image_2d; - return is_array ? ac_image_1darray : ac_image_1d; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_RECT: - case GLSL_SAMPLER_DIM_EXTERNAL: - return is_array ? ac_image_2darray : ac_image_2d; - case GLSL_SAMPLER_DIM_3D: - return ac_image_3d; - case GLSL_SAMPLER_DIM_CUBE: - return ac_image_cube; - case GLSL_SAMPLER_DIM_MS: - return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; - case GLSL_SAMPLER_DIM_SUBPASS: - return ac_image_2darray; - case GLSL_SAMPLER_DIM_SUBPASS_MS: - return ac_image_2darraymsaa; - default: - unreachable("bad sampler dim"); - } + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + if (chip_class == GFX9) + return is_array ? ac_image_2darray : ac_image_2d; + return is_array ? ac_image_1darray : ac_image_1d; + case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_RECT: + case GLSL_SAMPLER_DIM_EXTERNAL: + return is_array ? ac_image_2darray : ac_image_2d; + case GLSL_SAMPLER_DIM_3D: + return ac_image_3d; + case GLSL_SAMPLER_DIM_CUBE: + return ac_image_cube; + case GLSL_SAMPLER_DIM_MS: + return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa; + case GLSL_SAMPLER_DIM_SUBPASS: + return ac_image_2darray; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return ac_image_2darraymsaa; + default: + unreachable("bad sampler dim"); + } } -enum ac_image_dim -ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, - bool is_array) +enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, + bool is_array) { - enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array); - - /* Match the resource type set in the descriptor. 
*/ - if (dim == ac_image_cube || - (chip_class <= GFX8 && dim == ac_image_3d)) - dim = ac_image_2darray; - else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) { - /* When a single layer of a 3D texture is bound, the shader - * will refer to a 2D target, but the descriptor has a 3D type. - * Since the HW ignores BASE_ARRAY in this case, we need to - * send 3 coordinates. This doesn't hurt when the underlying - * texture is non-3D. - */ - dim = ac_image_3d; - } - - return dim; + enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array); + + /* Match the resource type set in the descriptor. */ + if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d)) + dim = ac_image_2darray; + else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) { + /* When a single layer of a 3D texture is bound, the shader + * will refer to a 2D target, but the descriptor has a 3D type. + * Since the HW ignores BASE_ARRAY in this case, we need to + * send 3 coordinates. This doesn't hurt when the underlying + * texture is non-3D. + */ + dim = ac_image_3d; + } + + return dim; } -unsigned -ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, - signed char *face_vgpr_index_ptr, - signed char *ancillary_vgpr_index_ptr) +unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, + signed char *face_vgpr_index_ptr, + signed char *ancillary_vgpr_index_ptr) { - unsigned num_input_vgprs = 0; - signed char face_vgpr_index = -1; - signed char ancillary_vgpr_index = -1; - - if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 3; - if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 2; - if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) { - face_vgpr_index = num_input_vgprs; - num_input_vgprs += 1; - } - if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) { - ancillary_vgpr_index = num_input_vgprs; - num_input_vgprs += 1; - } - if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) - num_input_vgprs += 1; - - if (face_vgpr_index_ptr) - *face_vgpr_index_ptr = face_vgpr_index; - if (ancillary_vgpr_index_ptr) - *ancillary_vgpr_index_ptr = ancillary_vgpr_index; - - return num_input_vgprs; + unsigned num_input_vgprs = 0; + signed char face_vgpr_index = -1; + signed char ancillary_vgpr_index = -1; + + if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if 
(G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 3; + if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 2; + if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) { + face_vgpr_index = num_input_vgprs; + num_input_vgprs += 1; + } + if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) { + ancillary_vgpr_index = num_input_vgprs; + num_input_vgprs += 1; + } + if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr)) + num_input_vgprs += 1; + + if (face_vgpr_index_ptr) + *face_vgpr_index_ptr = face_vgpr_index; + if (ancillary_vgpr_index_ptr) + *ancillary_vgpr_index_ptr = ancillary_vgpr_index; + + return num_input_vgprs; } -void ac_choose_spi_color_formats(unsigned format, unsigned swap, - unsigned ntype, bool is_depth, - struct ac_spi_color_formats *formats) +void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth, + struct ac_spi_color_formats *formats) { /* Alpha is needed for alpha-to-coverage. * Blending may be with or without alpha. diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 49e1eb2..c2a5233 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -24,75 +24,64 @@ #ifndef AC_SHADER_UTIL_H #define AC_SHADER_UTIL_H -#include -#include - -#include "amd_family.h" #include "ac_binary.h" +#include "amd_family.h" #include "compiler/nir/nir.h" +#include +#include + #ifdef __cplusplus extern "C" { #endif -enum ac_image_dim { - ac_image_1d, - ac_image_2d, - ac_image_3d, - ac_image_cube, // includes cube arrays - ac_image_1darray, - ac_image_2darray, - ac_image_2dmsaa, - ac_image_2darraymsaa, +enum ac_image_dim +{ + ac_image_1d, + ac_image_2d, + ac_image_3d, + ac_image_cube, // includes cube arrays + ac_image_1darray, + ac_image_2darray, + ac_image_2dmsaa, + ac_image_2darraymsaa, }; struct ac_data_format_info { - uint8_t element_size; - uint8_t num_channels; - uint8_t chan_byte_size; - uint8_t chan_format; + uint8_t element_size; + uint8_t num_channels; + uint8_t chan_byte_size; + uint8_t chan_format; }; struct ac_spi_color_formats { - unsigned normal : 8; - unsigned alpha : 8; - unsigned blend : 8; - unsigned blend_alpha : 8; + unsigned normal : 8; + unsigned alpha : 8; + unsigned blend : 8; + unsigned blend_alpha : 8; }; -unsigned -ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, - bool writes_samplemask); +unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask); -unsigned -ac_get_cb_shader_mask(unsigned spi_shader_col_format); +unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format); -uint32_t -ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class); +uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class); -unsigned -ac_get_tbuffer_format(enum chip_class 
chip_class, - unsigned dfmt, unsigned nfmt); +unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt); -const struct ac_data_format_info * -ac_get_data_format_info(unsigned dfmt); +const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt); -enum ac_image_dim -ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, - bool is_array); +enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim, + bool is_array); -enum ac_image_dim -ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, - bool is_array); +enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim, + bool is_array); -unsigned -ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, - signed char *face_vgpr_index, - signed char *ancillary_vgpr_index); +unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config, + signed char *face_vgpr_index, signed char *ancillary_vgpr_index); -void ac_choose_spi_color_formats(unsigned format, unsigned swap, - unsigned ntype, bool is_depth, - struct ac_spi_color_formats *formats); +void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth, + struct ac_spi_color_formats *formats); #ifdef __cplusplus } diff --git a/src/amd/common/ac_shadowed_regs.c b/src/amd/common/ac_shadowed_regs.c index d08ccf0..1ef2df5 100644 --- a/src/amd/common/ac_shadowed_regs.c +++ b/src/amd/common/ac_shadowed_regs.c @@ -28,10 +28,12 @@ */ #include "ac_shadowed_regs.h" + #include "ac_debug.h" #include "sid.h" #include "util/macros.h" #include "util/u_debug.h" + #include static const struct ac_reg_range Gfx9UserConfigShadowRange[] = { @@ -522,7 +524,8 @@ static const struct ac_reg_range Navi10NonShadowedRanges[] = { VGT_DMA_PRIMITIVE_TYPE, VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4, },*/ - /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */ + /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be + shadowed. */ { R_028A7C_VGT_DMA_INDEX_TYPE, 4, @@ -731,7 +734,8 @@ static const struct ac_reg_range Gfx103NonShadowedRanges[] = { VGT_DMA_PRIMITIVE_TYPE, VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4, },*/ - /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */ + /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be + shadowed. 
*/ { R_028A7C_VGT_DMA_INDEX_TYPE, 4, @@ -816,7 +820,11 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family, enum ac_reg_range_type type, unsigned *num_ranges, const struct ac_reg_range **ranges) { -#define RETURN(array) do { *ranges = array; *num_ranges = ARRAY_SIZE(array); } while (0) +#define RETURN(array) \ + do { \ + *ranges = array; \ + *num_ranges = ARRAY_SIZE(array); \ + } while (0) *num_ranges = 0; *ranges = NULL; @@ -841,8 +849,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family, case SI_REG_RANGE_SH: if (chip_class == GFX10_3 || chip_class == GFX10) RETURN(Gfx10ShShadowRange); - else if (family == CHIP_RAVEN2 || - family == CHIP_RENOIR) + else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR) RETURN(Gfx9ShShadowRangeRaven2); else if (chip_class == GFX9) RETURN(Gfx9ShShadowRange); @@ -850,8 +857,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family, case SI_REG_RANGE_CS_SH: if (chip_class == GFX10_3 || chip_class == GFX10) RETURN(Gfx10CsShShadowRange); - else if (family == CHIP_RAVEN2 || - family == CHIP_RENOIR) + else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR) RETURN(Gfx9CsShShadowRangeRaven2); else if (chip_class == GFX9) RETURN(Gfx9CsShShadowRange); @@ -876,68 +882,68 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs, set_context_reg_seq_array_fn set_context_reg_seq_array) { static const uint32_t DbRenderControlGfx9[] = { - 0x0 , // DB_RENDER_CONTROL - 0x0 , // DB_COUNT_CONTROL - 0x0 , // DB_DEPTH_VIEW - 0x0 , // DB_RENDER_OVERRIDE - 0x0 , // DB_RENDER_OVERRIDE2 - 0x0 , // DB_HTILE_DATA_BASE - 0x0 , // DB_HTILE_DATA_BASE_HI - 0x0 , // DB_DEPTH_SIZE - 0x0 , // DB_DEPTH_BOUNDS_MIN - 0x0 , // DB_DEPTH_BOUNDS_MAX - 0x0 , // DB_STENCIL_CLEAR - 0x0 , // DB_DEPTH_CLEAR - 0x0 , // PA_SC_SCREEN_SCISSOR_TL + 0x0, // DB_RENDER_CONTROL + 0x0, // DB_COUNT_CONTROL + 0x0, // DB_DEPTH_VIEW + 0x0, // DB_RENDER_OVERRIDE + 0x0, // DB_RENDER_OVERRIDE2 + 0x0, // DB_HTILE_DATA_BASE + 0x0, // DB_HTILE_DATA_BASE_HI + 0x0, // DB_DEPTH_SIZE + 0x0, // DB_DEPTH_BOUNDS_MIN + 0x0, // DB_DEPTH_BOUNDS_MAX + 0x0, // DB_STENCIL_CLEAR + 0x0, // DB_DEPTH_CLEAR + 0x0, // PA_SC_SCREEN_SCISSOR_TL 0x40004000, // PA_SC_SCREEN_SCISSOR_BR - 0x0 , // DB_Z_INFO - 0x0 , // DB_STENCIL_INFO - 0x0 , // DB_Z_READ_BASE - 0x0 , // DB_Z_READ_BASE_HI - 0x0 , // DB_STENCIL_READ_BASE - 0x0 , // DB_STENCIL_READ_BASE_HI - 0x0 , // DB_Z_WRITE_BASE - 0x0 , // DB_Z_WRITE_BASE_HI - 0x0 , // DB_STENCIL_WRITE_BASE - 0x0 , // DB_STENCIL_WRITE_BASE_HI - 0x0 , // DB_DFSM_CONTROL - 0x0 , // - 0x0 , // DB_Z_INFO2 - 0x0 , // DB_STENCIL_INFO2 - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // TA_BC_BASE_ADDR + 0x0, // DB_Z_INFO + 0x0, // DB_STENCIL_INFO + 0x0, // DB_Z_READ_BASE + 0x0, // DB_Z_READ_BASE_HI + 0x0, // DB_STENCIL_READ_BASE + 0x0, // DB_STENCIL_READ_BASE_HI + 0x0, // DB_Z_WRITE_BASE + 0x0, // DB_Z_WRITE_BASE_HI + 0x0, // DB_STENCIL_WRITE_BASE + 0x0, // DB_STENCIL_WRITE_BASE_HI + 0x0, // DB_DFSM_CONTROL + 0x0, // + 0x0, // DB_Z_INFO2 + 0x0, // DB_STENCIL_INFO2 + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // TA_BC_BASE_ADDR 0x0 // TA_BC_BASE_ADDR_HI }; static const uint32_t CoherDestBaseHi0Gfx9[] = { - 0x0 , // COHER_DEST_BASE_HI_0 - 0x0 , // COHER_DEST_BASE_HI_1 - 0x0 , // COHER_DEST_BASE_HI_2 - 0x0 , // COHER_DEST_BASE_HI_3 - 0x0 , // COHER_DEST_BASE_2 - 0x0 , // COHER_DEST_BASE_3 - 0x0 , // PA_SC_WINDOW_OFFSET + 0x0, // COHER_DEST_BASE_HI_0 + 0x0, // COHER_DEST_BASE_HI_1 + 0x0, // COHER_DEST_BASE_HI_2 + 0x0, // 
COHER_DEST_BASE_HI_3 + 0x0, // COHER_DEST_BASE_2 + 0x0, // COHER_DEST_BASE_3 + 0x0, // PA_SC_WINDOW_OFFSET 0x80000000, // PA_SC_WINDOW_SCISSOR_TL 0x40004000, // PA_SC_WINDOW_SCISSOR_BR - 0xffff , // PA_SC_CLIPRECT_RULE - 0x0 , // PA_SC_CLIPRECT_0_TL + 0xffff, // PA_SC_CLIPRECT_RULE + 0x0, // PA_SC_CLIPRECT_0_TL 0x40004000, // PA_SC_CLIPRECT_0_BR - 0x0 , // PA_SC_CLIPRECT_1_TL + 0x0, // PA_SC_CLIPRECT_1_TL 0x40004000, // PA_SC_CLIPRECT_1_BR - 0x0 , // PA_SC_CLIPRECT_2_TL + 0x0, // PA_SC_CLIPRECT_2_TL 0x40004000, // PA_SC_CLIPRECT_2_BR - 0x0 , // PA_SC_CLIPRECT_3_TL + 0x0, // PA_SC_CLIPRECT_3_TL 0x40004000, // PA_SC_CLIPRECT_3_BR 0xaa99aaaa, // PA_SC_EDGERULE - 0x0 , // PA_SU_HARDWARE_SCREEN_OFFSET + 0x0, // PA_SU_HARDWARE_SCREEN_OFFSET 0xffffffff, // CB_TARGET_MASK 0xffffffff, // CB_SHADER_MASK 0x80000000, // PA_SC_GENERIC_SCISSOR_TL 0x40004000, // PA_SC_GENERIC_SCISSOR_BR - 0x0 , // COHER_DEST_BASE_0 - 0x0 , // COHER_DEST_BASE_1 + 0x0, // COHER_DEST_BASE_0 + 0x0, // COHER_DEST_BASE_1 0x80000000, // PA_SC_VPORT_SCISSOR_0_TL 0x40004000, // PA_SC_VPORT_SCISSOR_0_BR 0x80000000, // PA_SC_VPORT_SCISSOR_1_TL @@ -970,529 +976,529 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs, 0x40004000, // PA_SC_VPORT_SCISSOR_14_BR 0x80000000, // PA_SC_VPORT_SCISSOR_15_TL 0x40004000, // PA_SC_VPORT_SCISSOR_15_BR - 0x0 , // PA_SC_VPORT_ZMIN_0 + 0x0, // PA_SC_VPORT_ZMIN_0 0x3f800000, // PA_SC_VPORT_ZMAX_0 - 0x0 , // PA_SC_VPORT_ZMIN_1 + 0x0, // PA_SC_VPORT_ZMIN_1 0x3f800000, // PA_SC_VPORT_ZMAX_1 - 0x0 , // PA_SC_VPORT_ZMIN_2 + 0x0, // PA_SC_VPORT_ZMIN_2 0x3f800000, // PA_SC_VPORT_ZMAX_2 - 0x0 , // PA_SC_VPORT_ZMIN_3 + 0x0, // PA_SC_VPORT_ZMIN_3 0x3f800000, // PA_SC_VPORT_ZMAX_3 - 0x0 , // PA_SC_VPORT_ZMIN_4 + 0x0, // PA_SC_VPORT_ZMIN_4 0x3f800000, // PA_SC_VPORT_ZMAX_4 - 0x0 , // PA_SC_VPORT_ZMIN_5 + 0x0, // PA_SC_VPORT_ZMIN_5 0x3f800000, // PA_SC_VPORT_ZMAX_5 - 0x0 , // PA_SC_VPORT_ZMIN_6 + 0x0, // PA_SC_VPORT_ZMIN_6 0x3f800000, // PA_SC_VPORT_ZMAX_6 - 0x0 , // PA_SC_VPORT_ZMIN_7 + 0x0, // PA_SC_VPORT_ZMIN_7 0x3f800000, // PA_SC_VPORT_ZMAX_7 - 0x0 , // PA_SC_VPORT_ZMIN_8 + 0x0, // PA_SC_VPORT_ZMIN_8 0x3f800000, // PA_SC_VPORT_ZMAX_8 - 0x0 , // PA_SC_VPORT_ZMIN_9 + 0x0, // PA_SC_VPORT_ZMIN_9 0x3f800000, // PA_SC_VPORT_ZMAX_9 - 0x0 , // PA_SC_VPORT_ZMIN_10 + 0x0, // PA_SC_VPORT_ZMIN_10 0x3f800000, // PA_SC_VPORT_ZMAX_10 - 0x0 , // PA_SC_VPORT_ZMIN_11 + 0x0, // PA_SC_VPORT_ZMIN_11 0x3f800000, // PA_SC_VPORT_ZMAX_11 - 0x0 , // PA_SC_VPORT_ZMIN_12 + 0x0, // PA_SC_VPORT_ZMIN_12 0x3f800000, // PA_SC_VPORT_ZMAX_12 - 0x0 , // PA_SC_VPORT_ZMIN_13 + 0x0, // PA_SC_VPORT_ZMIN_13 0x3f800000, // PA_SC_VPORT_ZMAX_13 - 0x0 , // PA_SC_VPORT_ZMIN_14 + 0x0, // PA_SC_VPORT_ZMIN_14 0x3f800000, // PA_SC_VPORT_ZMAX_14 - 0x0 , // PA_SC_VPORT_ZMIN_15 + 0x0, // PA_SC_VPORT_ZMIN_15 0x3f800000, // PA_SC_VPORT_ZMAX_15 - 0x0 , // PA_SC_RASTER_CONFIG - 0x0 , // PA_SC_RASTER_CONFIG_1 - 0x0 , // + 0x0, // PA_SC_RASTER_CONFIG + 0x0, // PA_SC_RASTER_CONFIG_1 + 0x0, // 0x0 // PA_SC_TILE_STEERING_OVERRIDE }; static const uint32_t VgtMultiPrimIbResetIndxGfx9[] = { - 0x0 // VGT_MULTI_PRIM_IB_RESET_INDX + 0x0 // VGT_MULTI_PRIM_IB_RESET_INDX }; static const uint32_t CbBlendRedGfx9[] = { - 0x0 , // CB_BLEND_RED - 0x0 , // CB_BLEND_GREEN - 0x0 , // CB_BLEND_BLUE - 0x0 , // CB_BLEND_ALPHA - 0x0 , // CB_DCC_CONTROL - 0x0 , // - 0x0 , // DB_STENCIL_CONTROL - 0x1000000 , // DB_STENCILREFMASK - 0x1000000 , // DB_STENCILREFMASK_BF - 0x0 , // - 0x0 , // PA_CL_VPORT_XSCALE - 0x0 , // PA_CL_VPORT_XOFFSET - 0x0 , // PA_CL_VPORT_YSCALE - 0x0 , // 
PA_CL_VPORT_YOFFSET - 0x0 , // PA_CL_VPORT_ZSCALE - 0x0 , // PA_CL_VPORT_ZOFFSET - 0x0 , // PA_CL_VPORT_XSCALE_1 - 0x0 , // PA_CL_VPORT_XOFFSET_1 - 0x0 , // PA_CL_VPORT_YSCALE_1 - 0x0 , // PA_CL_VPORT_YOFFSET_1 - 0x0 , // PA_CL_VPORT_ZSCALE_1 - 0x0 , // PA_CL_VPORT_ZOFFSET_1 - 0x0 , // PA_CL_VPORT_XSCALE_2 - 0x0 , // PA_CL_VPORT_XOFFSET_2 - 0x0 , // PA_CL_VPORT_YSCALE_2 - 0x0 , // PA_CL_VPORT_YOFFSET_2 - 0x0 , // PA_CL_VPORT_ZSCALE_2 - 0x0 , // PA_CL_VPORT_ZOFFSET_2 - 0x0 , // PA_CL_VPORT_XSCALE_3 - 0x0 , // PA_CL_VPORT_XOFFSET_3 - 0x0 , // PA_CL_VPORT_YSCALE_3 - 0x0 , // PA_CL_VPORT_YOFFSET_3 - 0x0 , // PA_CL_VPORT_ZSCALE_3 - 0x0 , // PA_CL_VPORT_ZOFFSET_3 - 0x0 , // PA_CL_VPORT_XSCALE_4 - 0x0 , // PA_CL_VPORT_XOFFSET_4 - 0x0 , // PA_CL_VPORT_YSCALE_4 - 0x0 , // PA_CL_VPORT_YOFFSET_4 - 0x0 , // PA_CL_VPORT_ZSCALE_4 - 0x0 , // PA_CL_VPORT_ZOFFSET_4 - 0x0 , // PA_CL_VPORT_XSCALE_5 - 0x0 , // PA_CL_VPORT_XOFFSET_5 - 0x0 , // PA_CL_VPORT_YSCALE_5 - 0x0 , // PA_CL_VPORT_YOFFSET_5 - 0x0 , // PA_CL_VPORT_ZSCALE_5 - 0x0 , // PA_CL_VPORT_ZOFFSET_5 - 0x0 , // PA_CL_VPORT_XSCALE_6 - 0x0 , // PA_CL_VPORT_XOFFSET_6 - 0x0 , // PA_CL_VPORT_YSCALE_6 - 0x0 , // PA_CL_VPORT_YOFFSET_6 - 0x0 , // PA_CL_VPORT_ZSCALE_6 - 0x0 , // PA_CL_VPORT_ZOFFSET_6 - 0x0 , // PA_CL_VPORT_XSCALE_7 - 0x0 , // PA_CL_VPORT_XOFFSET_7 - 0x0 , // PA_CL_VPORT_YSCALE_7 - 0x0 , // PA_CL_VPORT_YOFFSET_7 - 0x0 , // PA_CL_VPORT_ZSCALE_7 - 0x0 , // PA_CL_VPORT_ZOFFSET_7 - 0x0 , // PA_CL_VPORT_XSCALE_8 - 0x0 , // PA_CL_VPORT_XOFFSET_8 - 0x0 , // PA_CL_VPORT_YSCALE_8 - 0x0 , // PA_CL_VPORT_YOFFSET_8 - 0x0 , // PA_CL_VPORT_ZSCALE_8 - 0x0 , // PA_CL_VPORT_ZOFFSET_8 - 0x0 , // PA_CL_VPORT_XSCALE_9 - 0x0 , // PA_CL_VPORT_XOFFSET_9 - 0x0 , // PA_CL_VPORT_YSCALE_9 - 0x0 , // PA_CL_VPORT_YOFFSET_9 - 0x0 , // PA_CL_VPORT_ZSCALE_9 - 0x0 , // PA_CL_VPORT_ZOFFSET_9 - 0x0 , // PA_CL_VPORT_XSCALE_10 - 0x0 , // PA_CL_VPORT_XOFFSET_10 - 0x0 , // PA_CL_VPORT_YSCALE_10 - 0x0 , // PA_CL_VPORT_YOFFSET_10 - 0x0 , // PA_CL_VPORT_ZSCALE_10 - 0x0 , // PA_CL_VPORT_ZOFFSET_10 - 0x0 , // PA_CL_VPORT_XSCALE_11 - 0x0 , // PA_CL_VPORT_XOFFSET_11 - 0x0 , // PA_CL_VPORT_YSCALE_11 - 0x0 , // PA_CL_VPORT_YOFFSET_11 - 0x0 , // PA_CL_VPORT_ZSCALE_11 - 0x0 , // PA_CL_VPORT_ZOFFSET_11 - 0x0 , // PA_CL_VPORT_XSCALE_12 - 0x0 , // PA_CL_VPORT_XOFFSET_12 - 0x0 , // PA_CL_VPORT_YSCALE_12 - 0x0 , // PA_CL_VPORT_YOFFSET_12 - 0x0 , // PA_CL_VPORT_ZSCALE_12 - 0x0 , // PA_CL_VPORT_ZOFFSET_12 - 0x0 , // PA_CL_VPORT_XSCALE_13 - 0x0 , // PA_CL_VPORT_XOFFSET_13 - 0x0 , // PA_CL_VPORT_YSCALE_13 - 0x0 , // PA_CL_VPORT_YOFFSET_13 - 0x0 , // PA_CL_VPORT_ZSCALE_13 - 0x0 , // PA_CL_VPORT_ZOFFSET_13 - 0x0 , // PA_CL_VPORT_XSCALE_14 - 0x0 , // PA_CL_VPORT_XOFFSET_14 - 0x0 , // PA_CL_VPORT_YSCALE_14 - 0x0 , // PA_CL_VPORT_YOFFSET_14 - 0x0 , // PA_CL_VPORT_ZSCALE_14 - 0x0 , // PA_CL_VPORT_ZOFFSET_14 - 0x0 , // PA_CL_VPORT_XSCALE_15 - 0x0 , // PA_CL_VPORT_XOFFSET_15 - 0x0 , // PA_CL_VPORT_YSCALE_15 - 0x0 , // PA_CL_VPORT_YOFFSET_15 - 0x0 , // PA_CL_VPORT_ZSCALE_15 - 0x0 , // PA_CL_VPORT_ZOFFSET_15 - 0x0 , // PA_CL_UCP_0_X - 0x0 , // PA_CL_UCP_0_Y - 0x0 , // PA_CL_UCP_0_Z - 0x0 , // PA_CL_UCP_0_W - 0x0 , // PA_CL_UCP_1_X - 0x0 , // PA_CL_UCP_1_Y - 0x0 , // PA_CL_UCP_1_Z - 0x0 , // PA_CL_UCP_1_W - 0x0 , // PA_CL_UCP_2_X - 0x0 , // PA_CL_UCP_2_Y - 0x0 , // PA_CL_UCP_2_Z - 0x0 , // PA_CL_UCP_2_W - 0x0 , // PA_CL_UCP_3_X - 0x0 , // PA_CL_UCP_3_Y - 0x0 , // PA_CL_UCP_3_Z - 0x0 , // PA_CL_UCP_3_W - 0x0 , // PA_CL_UCP_4_X - 0x0 , // PA_CL_UCP_4_Y - 0x0 , // PA_CL_UCP_4_Z - 0x0 , // PA_CL_UCP_4_W - 0x0 , // 
PA_CL_UCP_5_X - 0x0 , // PA_CL_UCP_5_Y - 0x0 , // PA_CL_UCP_5_Z - 0x0 // PA_CL_UCP_5_W + 0x0, // CB_BLEND_RED + 0x0, // CB_BLEND_GREEN + 0x0, // CB_BLEND_BLUE + 0x0, // CB_BLEND_ALPHA + 0x0, // CB_DCC_CONTROL + 0x0, // + 0x0, // DB_STENCIL_CONTROL + 0x1000000, // DB_STENCILREFMASK + 0x1000000, // DB_STENCILREFMASK_BF + 0x0, // + 0x0, // PA_CL_VPORT_XSCALE + 0x0, // PA_CL_VPORT_XOFFSET + 0x0, // PA_CL_VPORT_YSCALE + 0x0, // PA_CL_VPORT_YOFFSET + 0x0, // PA_CL_VPORT_ZSCALE + 0x0, // PA_CL_VPORT_ZOFFSET + 0x0, // PA_CL_VPORT_XSCALE_1 + 0x0, // PA_CL_VPORT_XOFFSET_1 + 0x0, // PA_CL_VPORT_YSCALE_1 + 0x0, // PA_CL_VPORT_YOFFSET_1 + 0x0, // PA_CL_VPORT_ZSCALE_1 + 0x0, // PA_CL_VPORT_ZOFFSET_1 + 0x0, // PA_CL_VPORT_XSCALE_2 + 0x0, // PA_CL_VPORT_XOFFSET_2 + 0x0, // PA_CL_VPORT_YSCALE_2 + 0x0, // PA_CL_VPORT_YOFFSET_2 + 0x0, // PA_CL_VPORT_ZSCALE_2 + 0x0, // PA_CL_VPORT_ZOFFSET_2 + 0x0, // PA_CL_VPORT_XSCALE_3 + 0x0, // PA_CL_VPORT_XOFFSET_3 + 0x0, // PA_CL_VPORT_YSCALE_3 + 0x0, // PA_CL_VPORT_YOFFSET_3 + 0x0, // PA_CL_VPORT_ZSCALE_3 + 0x0, // PA_CL_VPORT_ZOFFSET_3 + 0x0, // PA_CL_VPORT_XSCALE_4 + 0x0, // PA_CL_VPORT_XOFFSET_4 + 0x0, // PA_CL_VPORT_YSCALE_4 + 0x0, // PA_CL_VPORT_YOFFSET_4 + 0x0, // PA_CL_VPORT_ZSCALE_4 + 0x0, // PA_CL_VPORT_ZOFFSET_4 + 0x0, // PA_CL_VPORT_XSCALE_5 + 0x0, // PA_CL_VPORT_XOFFSET_5 + 0x0, // PA_CL_VPORT_YSCALE_5 + 0x0, // PA_CL_VPORT_YOFFSET_5 + 0x0, // PA_CL_VPORT_ZSCALE_5 + 0x0, // PA_CL_VPORT_ZOFFSET_5 + 0x0, // PA_CL_VPORT_XSCALE_6 + 0x0, // PA_CL_VPORT_XOFFSET_6 + 0x0, // PA_CL_VPORT_YSCALE_6 + 0x0, // PA_CL_VPORT_YOFFSET_6 + 0x0, // PA_CL_VPORT_ZSCALE_6 + 0x0, // PA_CL_VPORT_ZOFFSET_6 + 0x0, // PA_CL_VPORT_XSCALE_7 + 0x0, // PA_CL_VPORT_XOFFSET_7 + 0x0, // PA_CL_VPORT_YSCALE_7 + 0x0, // PA_CL_VPORT_YOFFSET_7 + 0x0, // PA_CL_VPORT_ZSCALE_7 + 0x0, // PA_CL_VPORT_ZOFFSET_7 + 0x0, // PA_CL_VPORT_XSCALE_8 + 0x0, // PA_CL_VPORT_XOFFSET_8 + 0x0, // PA_CL_VPORT_YSCALE_8 + 0x0, // PA_CL_VPORT_YOFFSET_8 + 0x0, // PA_CL_VPORT_ZSCALE_8 + 0x0, // PA_CL_VPORT_ZOFFSET_8 + 0x0, // PA_CL_VPORT_XSCALE_9 + 0x0, // PA_CL_VPORT_XOFFSET_9 + 0x0, // PA_CL_VPORT_YSCALE_9 + 0x0, // PA_CL_VPORT_YOFFSET_9 + 0x0, // PA_CL_VPORT_ZSCALE_9 + 0x0, // PA_CL_VPORT_ZOFFSET_9 + 0x0, // PA_CL_VPORT_XSCALE_10 + 0x0, // PA_CL_VPORT_XOFFSET_10 + 0x0, // PA_CL_VPORT_YSCALE_10 + 0x0, // PA_CL_VPORT_YOFFSET_10 + 0x0, // PA_CL_VPORT_ZSCALE_10 + 0x0, // PA_CL_VPORT_ZOFFSET_10 + 0x0, // PA_CL_VPORT_XSCALE_11 + 0x0, // PA_CL_VPORT_XOFFSET_11 + 0x0, // PA_CL_VPORT_YSCALE_11 + 0x0, // PA_CL_VPORT_YOFFSET_11 + 0x0, // PA_CL_VPORT_ZSCALE_11 + 0x0, // PA_CL_VPORT_ZOFFSET_11 + 0x0, // PA_CL_VPORT_XSCALE_12 + 0x0, // PA_CL_VPORT_XOFFSET_12 + 0x0, // PA_CL_VPORT_YSCALE_12 + 0x0, // PA_CL_VPORT_YOFFSET_12 + 0x0, // PA_CL_VPORT_ZSCALE_12 + 0x0, // PA_CL_VPORT_ZOFFSET_12 + 0x0, // PA_CL_VPORT_XSCALE_13 + 0x0, // PA_CL_VPORT_XOFFSET_13 + 0x0, // PA_CL_VPORT_YSCALE_13 + 0x0, // PA_CL_VPORT_YOFFSET_13 + 0x0, // PA_CL_VPORT_ZSCALE_13 + 0x0, // PA_CL_VPORT_ZOFFSET_13 + 0x0, // PA_CL_VPORT_XSCALE_14 + 0x0, // PA_CL_VPORT_XOFFSET_14 + 0x0, // PA_CL_VPORT_YSCALE_14 + 0x0, // PA_CL_VPORT_YOFFSET_14 + 0x0, // PA_CL_VPORT_ZSCALE_14 + 0x0, // PA_CL_VPORT_ZOFFSET_14 + 0x0, // PA_CL_VPORT_XSCALE_15 + 0x0, // PA_CL_VPORT_XOFFSET_15 + 0x0, // PA_CL_VPORT_YSCALE_15 + 0x0, // PA_CL_VPORT_YOFFSET_15 + 0x0, // PA_CL_VPORT_ZSCALE_15 + 0x0, // PA_CL_VPORT_ZOFFSET_15 + 0x0, // PA_CL_UCP_0_X + 0x0, // PA_CL_UCP_0_Y + 0x0, // PA_CL_UCP_0_Z + 0x0, // PA_CL_UCP_0_W + 0x0, // PA_CL_UCP_1_X + 0x0, // PA_CL_UCP_1_Y + 0x0, // PA_CL_UCP_1_Z + 0x0, // 
PA_CL_UCP_1_W + 0x0, // PA_CL_UCP_2_X + 0x0, // PA_CL_UCP_2_Y + 0x0, // PA_CL_UCP_2_Z + 0x0, // PA_CL_UCP_2_W + 0x0, // PA_CL_UCP_3_X + 0x0, // PA_CL_UCP_3_Y + 0x0, // PA_CL_UCP_3_Z + 0x0, // PA_CL_UCP_3_W + 0x0, // PA_CL_UCP_4_X + 0x0, // PA_CL_UCP_4_Y + 0x0, // PA_CL_UCP_4_Z + 0x0, // PA_CL_UCP_4_W + 0x0, // PA_CL_UCP_5_X + 0x0, // PA_CL_UCP_5_Y + 0x0, // PA_CL_UCP_5_Z + 0x0 // PA_CL_UCP_5_W }; static const uint32_t SpiPsInputCntl0Gfx9[] = { - 0x0 , // SPI_PS_INPUT_CNTL_0 - 0x0 , // SPI_PS_INPUT_CNTL_1 - 0x0 , // SPI_PS_INPUT_CNTL_2 - 0x0 , // SPI_PS_INPUT_CNTL_3 - 0x0 , // SPI_PS_INPUT_CNTL_4 - 0x0 , // SPI_PS_INPUT_CNTL_5 - 0x0 , // SPI_PS_INPUT_CNTL_6 - 0x0 , // SPI_PS_INPUT_CNTL_7 - 0x0 , // SPI_PS_INPUT_CNTL_8 - 0x0 , // SPI_PS_INPUT_CNTL_9 - 0x0 , // SPI_PS_INPUT_CNTL_10 - 0x0 , // SPI_PS_INPUT_CNTL_11 - 0x0 , // SPI_PS_INPUT_CNTL_12 - 0x0 , // SPI_PS_INPUT_CNTL_13 - 0x0 , // SPI_PS_INPUT_CNTL_14 - 0x0 , // SPI_PS_INPUT_CNTL_15 - 0x0 , // SPI_PS_INPUT_CNTL_16 - 0x0 , // SPI_PS_INPUT_CNTL_17 - 0x0 , // SPI_PS_INPUT_CNTL_18 - 0x0 , // SPI_PS_INPUT_CNTL_19 - 0x0 , // SPI_PS_INPUT_CNTL_20 - 0x0 , // SPI_PS_INPUT_CNTL_21 - 0x0 , // SPI_PS_INPUT_CNTL_22 - 0x0 , // SPI_PS_INPUT_CNTL_23 - 0x0 , // SPI_PS_INPUT_CNTL_24 - 0x0 , // SPI_PS_INPUT_CNTL_25 - 0x0 , // SPI_PS_INPUT_CNTL_26 - 0x0 , // SPI_PS_INPUT_CNTL_27 - 0x0 , // SPI_PS_INPUT_CNTL_28 - 0x0 , // SPI_PS_INPUT_CNTL_29 - 0x0 , // SPI_PS_INPUT_CNTL_30 - 0x0 , // SPI_PS_INPUT_CNTL_31 - 0x0 , // SPI_VS_OUT_CONFIG - 0x0 , // - 0x0 , // SPI_PS_INPUT_ENA - 0x0 , // SPI_PS_INPUT_ADDR - 0x0 , // SPI_INTERP_CONTROL_0 - 0x2 , // SPI_PS_IN_CONTROL - 0x0 , // - 0x0 , // SPI_BARYC_CNTL - 0x0 , // - 0x0 , // SPI_TMPRING_SIZE - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // SPI_SHADER_POS_FORMAT - 0x0 , // SPI_SHADER_Z_FORMAT - 0x0 // SPI_SHADER_COL_FORMAT + 0x0, // SPI_PS_INPUT_CNTL_0 + 0x0, // SPI_PS_INPUT_CNTL_1 + 0x0, // SPI_PS_INPUT_CNTL_2 + 0x0, // SPI_PS_INPUT_CNTL_3 + 0x0, // SPI_PS_INPUT_CNTL_4 + 0x0, // SPI_PS_INPUT_CNTL_5 + 0x0, // SPI_PS_INPUT_CNTL_6 + 0x0, // SPI_PS_INPUT_CNTL_7 + 0x0, // SPI_PS_INPUT_CNTL_8 + 0x0, // SPI_PS_INPUT_CNTL_9 + 0x0, // SPI_PS_INPUT_CNTL_10 + 0x0, // SPI_PS_INPUT_CNTL_11 + 0x0, // SPI_PS_INPUT_CNTL_12 + 0x0, // SPI_PS_INPUT_CNTL_13 + 0x0, // SPI_PS_INPUT_CNTL_14 + 0x0, // SPI_PS_INPUT_CNTL_15 + 0x0, // SPI_PS_INPUT_CNTL_16 + 0x0, // SPI_PS_INPUT_CNTL_17 + 0x0, // SPI_PS_INPUT_CNTL_18 + 0x0, // SPI_PS_INPUT_CNTL_19 + 0x0, // SPI_PS_INPUT_CNTL_20 + 0x0, // SPI_PS_INPUT_CNTL_21 + 0x0, // SPI_PS_INPUT_CNTL_22 + 0x0, // SPI_PS_INPUT_CNTL_23 + 0x0, // SPI_PS_INPUT_CNTL_24 + 0x0, // SPI_PS_INPUT_CNTL_25 + 0x0, // SPI_PS_INPUT_CNTL_26 + 0x0, // SPI_PS_INPUT_CNTL_27 + 0x0, // SPI_PS_INPUT_CNTL_28 + 0x0, // SPI_PS_INPUT_CNTL_29 + 0x0, // SPI_PS_INPUT_CNTL_30 + 0x0, // SPI_PS_INPUT_CNTL_31 + 0x0, // SPI_VS_OUT_CONFIG + 0x0, // + 0x0, // SPI_PS_INPUT_ENA + 0x0, // SPI_PS_INPUT_ADDR + 0x0, // SPI_INTERP_CONTROL_0 + 0x2, // SPI_PS_IN_CONTROL + 0x0, // + 0x0, // SPI_BARYC_CNTL + 0x0, // + 0x0, // SPI_TMPRING_SIZE + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // SPI_SHADER_POS_FORMAT + 0x0, // SPI_SHADER_Z_FORMAT + 0x0 // SPI_SHADER_COL_FORMAT }; static const uint32_t SxPsDownconvertGfx9[] = { - 0x0 , // SX_PS_DOWNCONVERT - 0x0 , // SX_BLEND_OPT_EPSILON - 0x0 , // SX_BLEND_OPT_CONTROL - 0x0 , // SX_MRT0_BLEND_OPT - 0x0 , // SX_MRT1_BLEND_OPT - 0x0 , // SX_MRT2_BLEND_OPT - 0x0 , // SX_MRT3_BLEND_OPT - 0x0 , // SX_MRT4_BLEND_OPT - 0x0 , 
// SX_MRT5_BLEND_OPT - 0x0 , // SX_MRT6_BLEND_OPT - 0x0 , // SX_MRT7_BLEND_OPT - 0x0 , // CB_BLEND0_CONTROL - 0x0 , // CB_BLEND1_CONTROL - 0x0 , // CB_BLEND2_CONTROL - 0x0 , // CB_BLEND3_CONTROL - 0x0 , // CB_BLEND4_CONTROL - 0x0 , // CB_BLEND5_CONTROL - 0x0 , // CB_BLEND6_CONTROL - 0x0 , // CB_BLEND7_CONTROL - 0x0 , // CB_MRT0_EPITCH - 0x0 , // CB_MRT1_EPITCH - 0x0 , // CB_MRT2_EPITCH - 0x0 , // CB_MRT3_EPITCH - 0x0 , // CB_MRT4_EPITCH - 0x0 , // CB_MRT5_EPITCH - 0x0 , // CB_MRT6_EPITCH - 0x0 // CB_MRT7_EPITCH + 0x0, // SX_PS_DOWNCONVERT + 0x0, // SX_BLEND_OPT_EPSILON + 0x0, // SX_BLEND_OPT_CONTROL + 0x0, // SX_MRT0_BLEND_OPT + 0x0, // SX_MRT1_BLEND_OPT + 0x0, // SX_MRT2_BLEND_OPT + 0x0, // SX_MRT3_BLEND_OPT + 0x0, // SX_MRT4_BLEND_OPT + 0x0, // SX_MRT5_BLEND_OPT + 0x0, // SX_MRT6_BLEND_OPT + 0x0, // SX_MRT7_BLEND_OPT + 0x0, // CB_BLEND0_CONTROL + 0x0, // CB_BLEND1_CONTROL + 0x0, // CB_BLEND2_CONTROL + 0x0, // CB_BLEND3_CONTROL + 0x0, // CB_BLEND4_CONTROL + 0x0, // CB_BLEND5_CONTROL + 0x0, // CB_BLEND6_CONTROL + 0x0, // CB_BLEND7_CONTROL + 0x0, // CB_MRT0_EPITCH + 0x0, // CB_MRT1_EPITCH + 0x0, // CB_MRT2_EPITCH + 0x0, // CB_MRT3_EPITCH + 0x0, // CB_MRT4_EPITCH + 0x0, // CB_MRT5_EPITCH + 0x0, // CB_MRT6_EPITCH + 0x0 // CB_MRT7_EPITCH }; static const uint32_t DbDepthControlGfx9[] = { - 0x0 , // DB_DEPTH_CONTROL - 0x0 , // DB_EQAA - 0x0 , // CB_COLOR_CONTROL - 0x0 , // DB_SHADER_CONTROL - 0x90000 , // PA_CL_CLIP_CNTL - 0x4 , // PA_SU_SC_MODE_CNTL - 0x0 , // PA_CL_VTE_CNTL - 0x0 , // PA_CL_VS_OUT_CNTL - 0x0 // PA_CL_NANINF_CNTL + 0x0, // DB_DEPTH_CONTROL + 0x0, // DB_EQAA + 0x0, // CB_COLOR_CONTROL + 0x0, // DB_SHADER_CONTROL + 0x90000, // PA_CL_CLIP_CNTL + 0x4, // PA_SU_SC_MODE_CNTL + 0x0, // PA_CL_VTE_CNTL + 0x0, // PA_CL_VS_OUT_CNTL + 0x0 // PA_CL_NANINF_CNTL }; static const uint32_t PaSuPrimFilterCntlGfx9[] = { - 0x0 , // PA_SU_PRIM_FILTER_CNTL - 0x0 , // PA_SU_SMALL_PRIM_FILTER_CNTL - 0x0 , // PA_CL_OBJPRIM_ID_CNTL - 0x0 , // PA_CL_NGG_CNTL - 0x0 , // PA_SU_OVER_RASTERIZATION_CNTL - 0x0 // PA_STEREO_CNTL + 0x0, // PA_SU_PRIM_FILTER_CNTL + 0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL + 0x0, // PA_CL_OBJPRIM_ID_CNTL + 0x0, // PA_CL_NGG_CNTL + 0x0, // PA_SU_OVER_RASTERIZATION_CNTL + 0x0 // PA_STEREO_CNTL }; static const uint32_t PaSuPointSizeGfx9[] = { - 0x0 , // PA_SU_POINT_SIZE - 0x0 , // PA_SU_POINT_MINMAX - 0x0 , // PA_SU_LINE_CNTL - 0x0 // PA_SC_LINE_STIPPLE + 0x0, // PA_SU_POINT_SIZE + 0x0, // PA_SU_POINT_MINMAX + 0x0, // PA_SU_LINE_CNTL + 0x0 // PA_SC_LINE_STIPPLE }; static const uint32_t VgtHosMaxTessLevelGfx9[] = { - 0x0 , // VGT_HOS_MAX_TESS_LEVEL - 0x0 // VGT_HOS_MIN_TESS_LEVEL + 0x0, // VGT_HOS_MAX_TESS_LEVEL + 0x0 // VGT_HOS_MIN_TESS_LEVEL }; static const uint32_t VgtGsModeGfx9[] = { - 0x0 , // VGT_GS_MODE - 0x0 , // VGT_GS_ONCHIP_CNTL - 0x0 , // PA_SC_MODE_CNTL_0 - 0x0 , // PA_SC_MODE_CNTL_1 - 0x0 , // VGT_ENHANCE - 0x100 , // VGT_GS_PER_ES - 0x80 , // VGT_ES_PER_GS - 0x2 , // VGT_GS_PER_VS - 0x0 , // VGT_GSVS_RING_OFFSET_1 - 0x0 , // VGT_GSVS_RING_OFFSET_2 - 0x0 , // VGT_GSVS_RING_OFFSET_3 - 0x0 // VGT_GS_OUT_PRIM_TYPE + 0x0, // VGT_GS_MODE + 0x0, // VGT_GS_ONCHIP_CNTL + 0x0, // PA_SC_MODE_CNTL_0 + 0x0, // PA_SC_MODE_CNTL_1 + 0x0, // VGT_ENHANCE + 0x100, // VGT_GS_PER_ES + 0x80, // VGT_ES_PER_GS + 0x2, // VGT_GS_PER_VS + 0x0, // VGT_GSVS_RING_OFFSET_1 + 0x0, // VGT_GSVS_RING_OFFSET_2 + 0x0, // VGT_GSVS_RING_OFFSET_3 + 0x0 // VGT_GS_OUT_PRIM_TYPE }; static const uint32_t VgtPrimitiveidEnGfx9[] = { - 0x0 // VGT_PRIMITIVEID_EN + 0x0 // VGT_PRIMITIVEID_EN }; static const uint32_t 
VgtPrimitiveidResetGfx9[] = { - 0x0 // VGT_PRIMITIVEID_RESET + 0x0 // VGT_PRIMITIVEID_RESET }; static const uint32_t VgtGsMaxPrimsPerSubgroupGfx9[] = { - 0x0 , // VGT_GS_MAX_PRIMS_PER_SUBGROUP - 0x0 , // VGT_DRAW_PAYLOAD_CNTL - 0x0 , // - 0x0 , // VGT_INSTANCE_STEP_RATE_0 - 0x0 , // VGT_INSTANCE_STEP_RATE_1 - 0x0 , // - 0x0 , // VGT_ESGS_RING_ITEMSIZE - 0x0 , // VGT_GSVS_RING_ITEMSIZE - 0x0 , // VGT_REUSE_OFF - 0x0 , // VGT_VTX_CNT_EN - 0x0 , // DB_HTILE_SURFACE - 0x0 , // DB_SRESULTS_COMPARE_STATE0 - 0x0 , // DB_SRESULTS_COMPARE_STATE1 - 0x0 , // DB_PRELOAD_CONTROL - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_0 - 0x0 // VGT_STRMOUT_VTX_STRIDE_0 + 0x0, // VGT_GS_MAX_PRIMS_PER_SUBGROUP + 0x0, // VGT_DRAW_PAYLOAD_CNTL + 0x0, // + 0x0, // VGT_INSTANCE_STEP_RATE_0 + 0x0, // VGT_INSTANCE_STEP_RATE_1 + 0x0, // + 0x0, // VGT_ESGS_RING_ITEMSIZE + 0x0, // VGT_GSVS_RING_ITEMSIZE + 0x0, // VGT_REUSE_OFF + 0x0, // VGT_VTX_CNT_EN + 0x0, // DB_HTILE_SURFACE + 0x0, // DB_SRESULTS_COMPARE_STATE0 + 0x0, // DB_SRESULTS_COMPARE_STATE1 + 0x0, // DB_PRELOAD_CONTROL + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_SIZE_0 + 0x0 // VGT_STRMOUT_VTX_STRIDE_0 }; static const uint32_t VgtStrmoutBufferSize1Gfx9[] = { - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_1 - 0x0 // VGT_STRMOUT_VTX_STRIDE_1 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_1 + 0x0 // VGT_STRMOUT_VTX_STRIDE_1 }; static const uint32_t VgtStrmoutBufferSize2Gfx9[] = { - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_2 - 0x0 // VGT_STRMOUT_VTX_STRIDE_2 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_2 + 0x0 // VGT_STRMOUT_VTX_STRIDE_2 }; static const uint32_t VgtStrmoutBufferSize3Gfx9[] = { - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_3 - 0x0 // VGT_STRMOUT_VTX_STRIDE_3 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_3 + 0x0 // VGT_STRMOUT_VTX_STRIDE_3 }; static const uint32_t VgtStrmoutDrawOpaqueOffsetGfx9[] = { - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - 0x0 // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE + 0x0 // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE }; static const uint32_t VgtGsMaxVertOutGfx9[] = { - 0x0 , // VGT_GS_MAX_VERT_OUT - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // VGT_TESS_DISTRIBUTION - 0x0 , // VGT_SHADER_STAGES_EN - 0x0 , // VGT_LS_HS_CONFIG - 0x0 , // VGT_GS_VERT_ITEMSIZE - 0x0 , // VGT_GS_VERT_ITEMSIZE_1 - 0x0 , // VGT_GS_VERT_ITEMSIZE_2 - 0x0 , // VGT_GS_VERT_ITEMSIZE_3 - 0x0 , // VGT_TF_PARAM - 0x0 , // DB_ALPHA_TO_MASK - 0x0 , // VGT_DISPATCH_DRAW_INDEX - 0x0 , // PA_SU_POLY_OFFSET_DB_FMT_CNTL - 0x0 , // PA_SU_POLY_OFFSET_CLAMP - 0x0 , // PA_SU_POLY_OFFSET_FRONT_SCALE - 0x0 , // PA_SU_POLY_OFFSET_FRONT_OFFSET - 0x0 , // PA_SU_POLY_OFFSET_BACK_SCALE - 0x0 , // PA_SU_POLY_OFFSET_BACK_OFFSET - 0x0 , // VGT_GS_INSTANCE_CNT - 0x0 , // VGT_STRMOUT_CONFIG - 0x0 // VGT_STRMOUT_BUFFER_CONFIG + 0x0, // VGT_GS_MAX_VERT_OUT + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // VGT_TESS_DISTRIBUTION + 0x0, // VGT_SHADER_STAGES_EN + 0x0, // VGT_LS_HS_CONFIG + 0x0, // VGT_GS_VERT_ITEMSIZE + 0x0, // VGT_GS_VERT_ITEMSIZE_1 + 0x0, // VGT_GS_VERT_ITEMSIZE_2 + 0x0, // VGT_GS_VERT_ITEMSIZE_3 + 0x0, // VGT_TF_PARAM + 0x0, // DB_ALPHA_TO_MASK + 0x0, // VGT_DISPATCH_DRAW_INDEX + 0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL + 0x0, // PA_SU_POLY_OFFSET_CLAMP + 0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE + 0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET + 0x0, // PA_SU_POLY_OFFSET_BACK_SCALE + 0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET + 0x0, // VGT_GS_INSTANCE_CNT + 0x0, // VGT_STRMOUT_CONFIG + 0x0 // 
VGT_STRMOUT_BUFFER_CONFIG }; static const uint32_t PaScCentroidPriority0Gfx9[] = { - 0x0 , // PA_SC_CENTROID_PRIORITY_0 - 0x0 , // PA_SC_CENTROID_PRIORITY_1 - 0x1000 , // PA_SC_LINE_CNTL - 0x0 , // PA_SC_AA_CONFIG - 0x5 , // PA_SU_VTX_CNTL + 0x0, // PA_SC_CENTROID_PRIORITY_0 + 0x0, // PA_SC_CENTROID_PRIORITY_1 + 0x1000, // PA_SC_LINE_CNTL + 0x0, // PA_SC_AA_CONFIG + 0x5, // PA_SU_VTX_CNTL 0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ 0x3f800000, // PA_CL_GB_VERT_DISC_ADJ 0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ 0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0 0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1 - 0x0 , // PA_SC_SHADER_CONTROL - 0x3 , // PA_SC_BINNER_CNTL_0 - 0x0 , // PA_SC_BINNER_CNTL_1 - 0x100000 , // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL - 0x0 , // PA_SC_NGG_MODE_CNTL - 0x0 , // - 0x1e , // VGT_VERTEX_REUSE_BLOCK_CNTL - 0x20 , // VGT_OUT_DEALLOC_CNTL - 0x0 , // CB_COLOR0_BASE - 0x0 , // CB_COLOR0_BASE_EXT - 0x0 , // CB_COLOR0_ATTRIB2 - 0x0 , // CB_COLOR0_VIEW - 0x0 , // CB_COLOR0_INFO - 0x0 , // CB_COLOR0_ATTRIB - 0x0 , // CB_COLOR0_DCC_CONTROL - 0x0 , // CB_COLOR0_CMASK - 0x0 , // CB_COLOR0_CMASK_BASE_EXT - 0x0 , // CB_COLOR0_FMASK - 0x0 , // CB_COLOR0_FMASK_BASE_EXT - 0x0 , // CB_COLOR0_CLEAR_WORD0 - 0x0 , // CB_COLOR0_CLEAR_WORD1 - 0x0 , // CB_COLOR0_DCC_BASE - 0x0 , // CB_COLOR0_DCC_BASE_EXT - 0x0 , // CB_COLOR1_BASE - 0x0 , // CB_COLOR1_BASE_EXT - 0x0 , // CB_COLOR1_ATTRIB2 - 0x0 , // CB_COLOR1_VIEW - 0x0 , // CB_COLOR1_INFO - 0x0 , // CB_COLOR1_ATTRIB - 0x0 , // CB_COLOR1_DCC_CONTROL - 0x0 , // CB_COLOR1_CMASK - 0x0 , // CB_COLOR1_CMASK_BASE_EXT - 0x0 , // CB_COLOR1_FMASK - 0x0 , // CB_COLOR1_FMASK_BASE_EXT - 0x0 , // CB_COLOR1_CLEAR_WORD0 - 0x0 , // CB_COLOR1_CLEAR_WORD1 - 0x0 , // CB_COLOR1_DCC_BASE - 0x0 , // CB_COLOR1_DCC_BASE_EXT - 0x0 , // CB_COLOR2_BASE - 0x0 , // CB_COLOR2_BASE_EXT - 0x0 , // CB_COLOR2_ATTRIB2 - 0x0 , // CB_COLOR2_VIEW - 0x0 , // CB_COLOR2_INFO - 0x0 , // CB_COLOR2_ATTRIB - 0x0 , // CB_COLOR2_DCC_CONTROL - 0x0 , // CB_COLOR2_CMASK - 0x0 , // CB_COLOR2_CMASK_BASE_EXT - 0x0 , // CB_COLOR2_FMASK - 0x0 , // CB_COLOR2_FMASK_BASE_EXT - 0x0 , // 
CB_COLOR2_CLEAR_WORD0 - 0x0 , // CB_COLOR2_CLEAR_WORD1 - 0x0 , // CB_COLOR2_DCC_BASE - 0x0 , // CB_COLOR2_DCC_BASE_EXT - 0x0 , // CB_COLOR3_BASE - 0x0 , // CB_COLOR3_BASE_EXT - 0x0 , // CB_COLOR3_ATTRIB2 - 0x0 , // CB_COLOR3_VIEW - 0x0 , // CB_COLOR3_INFO - 0x0 , // CB_COLOR3_ATTRIB - 0x0 , // CB_COLOR3_DCC_CONTROL - 0x0 , // CB_COLOR3_CMASK - 0x0 , // CB_COLOR3_CMASK_BASE_EXT - 0x0 , // CB_COLOR3_FMASK - 0x0 , // CB_COLOR3_FMASK_BASE_EXT - 0x0 , // CB_COLOR3_CLEAR_WORD0 - 0x0 , // CB_COLOR3_CLEAR_WORD1 - 0x0 , // CB_COLOR3_DCC_BASE - 0x0 , // CB_COLOR3_DCC_BASE_EXT - 0x0 , // CB_COLOR4_BASE - 0x0 , // CB_COLOR4_BASE_EXT - 0x0 , // CB_COLOR4_ATTRIB2 - 0x0 , // CB_COLOR4_VIEW - 0x0 , // CB_COLOR4_INFO - 0x0 , // CB_COLOR4_ATTRIB - 0x0 , // CB_COLOR4_DCC_CONTROL - 0x0 , // CB_COLOR4_CMASK - 0x0 , // CB_COLOR4_CMASK_BASE_EXT - 0x0 , // CB_COLOR4_FMASK - 0x0 , // CB_COLOR4_FMASK_BASE_EXT - 0x0 , // CB_COLOR4_CLEAR_WORD0 - 0x0 , // CB_COLOR4_CLEAR_WORD1 - 0x0 , // CB_COLOR4_DCC_BASE - 0x0 , // CB_COLOR4_DCC_BASE_EXT - 0x0 , // CB_COLOR5_BASE - 0x0 , // CB_COLOR5_BASE_EXT - 0x0 , // CB_COLOR5_ATTRIB2 - 0x0 , // CB_COLOR5_VIEW - 0x0 , // CB_COLOR5_INFO - 0x0 , // CB_COLOR5_ATTRIB - 0x0 , // CB_COLOR5_DCC_CONTROL - 0x0 , // CB_COLOR5_CMASK - 0x0 , // CB_COLOR5_CMASK_BASE_EXT - 0x0 , // CB_COLOR5_FMASK - 0x0 , // CB_COLOR5_FMASK_BASE_EXT - 0x0 , // CB_COLOR5_CLEAR_WORD0 - 0x0 , // CB_COLOR5_CLEAR_WORD1 - 0x0 , // CB_COLOR5_DCC_BASE - 0x0 , // CB_COLOR5_DCC_BASE_EXT - 0x0 , // CB_COLOR6_BASE - 0x0 , // CB_COLOR6_BASE_EXT - 0x0 , // CB_COLOR6_ATTRIB2 - 0x0 , // CB_COLOR6_VIEW - 0x0 , // CB_COLOR6_INFO - 0x0 , // CB_COLOR6_ATTRIB - 0x0 , // CB_COLOR6_DCC_CONTROL - 0x0 , // CB_COLOR6_CMASK - 0x0 , // CB_COLOR6_CMASK_BASE_EXT - 0x0 , // CB_COLOR6_FMASK - 0x0 , // CB_COLOR6_FMASK_BASE_EXT - 0x0 , // CB_COLOR6_CLEAR_WORD0 - 0x0 , // CB_COLOR6_CLEAR_WORD1 - 0x0 , // CB_COLOR6_DCC_BASE - 0x0 , // CB_COLOR6_DCC_BASE_EXT - 0x0 , // CB_COLOR7_BASE - 0x0 , // CB_COLOR7_BASE_EXT - 0x0 , // CB_COLOR7_ATTRIB2 - 0x0 , // CB_COLOR7_VIEW - 0x0 , // CB_COLOR7_INFO - 0x0 , // CB_COLOR7_ATTRIB - 0x0 , // CB_COLOR7_DCC_CONTROL - 0x0 , // CB_COLOR7_CMASK - 0x0 , // CB_COLOR7_CMASK_BASE_EXT - 0x0 , // CB_COLOR7_FMASK - 0x0 , // CB_COLOR7_FMASK_BASE_EXT - 0x0 , // CB_COLOR7_CLEAR_WORD0 - 0x0 , // CB_COLOR7_CLEAR_WORD1 - 0x0 , // CB_COLOR7_DCC_BASE + 0x0, // PA_SC_SHADER_CONTROL + 0x3, // PA_SC_BINNER_CNTL_0 + 0x0, // PA_SC_BINNER_CNTL_1 + 0x100000, // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL + 0x0, // PA_SC_NGG_MODE_CNTL + 0x0, // + 0x1e, // VGT_VERTEX_REUSE_BLOCK_CNTL + 0x20, // VGT_OUT_DEALLOC_CNTL + 0x0, // CB_COLOR0_BASE + 0x0, // CB_COLOR0_BASE_EXT + 0x0, // CB_COLOR0_ATTRIB2 + 0x0, // CB_COLOR0_VIEW + 0x0, // CB_COLOR0_INFO + 0x0, // CB_COLOR0_ATTRIB + 0x0, // CB_COLOR0_DCC_CONTROL + 0x0, // CB_COLOR0_CMASK + 0x0, // CB_COLOR0_CMASK_BASE_EXT + 0x0, // CB_COLOR0_FMASK + 0x0, // CB_COLOR0_FMASK_BASE_EXT + 0x0, // CB_COLOR0_CLEAR_WORD0 + 0x0, // CB_COLOR0_CLEAR_WORD1 + 0x0, // CB_COLOR0_DCC_BASE + 0x0, // CB_COLOR0_DCC_BASE_EXT + 0x0, // CB_COLOR1_BASE + 0x0, // CB_COLOR1_BASE_EXT + 0x0, // CB_COLOR1_ATTRIB2 + 0x0, // CB_COLOR1_VIEW + 0x0, // CB_COLOR1_INFO + 0x0, // CB_COLOR1_ATTRIB + 0x0, // CB_COLOR1_DCC_CONTROL + 0x0, // CB_COLOR1_CMASK + 0x0, // CB_COLOR1_CMASK_BASE_EXT + 0x0, // CB_COLOR1_FMASK + 0x0, // CB_COLOR1_FMASK_BASE_EXT + 0x0, // CB_COLOR1_CLEAR_WORD0 + 0x0, // CB_COLOR1_CLEAR_WORD1 + 0x0, // CB_COLOR1_DCC_BASE + 0x0, // CB_COLOR1_DCC_BASE_EXT + 0x0, // CB_COLOR2_BASE + 0x0, // CB_COLOR2_BASE_EXT + 0x0, // 
CB_COLOR2_ATTRIB2 + 0x0, // CB_COLOR2_VIEW + 0x0, // CB_COLOR2_INFO + 0x0, // CB_COLOR2_ATTRIB + 0x0, // CB_COLOR2_DCC_CONTROL + 0x0, // CB_COLOR2_CMASK + 0x0, // CB_COLOR2_CMASK_BASE_EXT + 0x0, // CB_COLOR2_FMASK + 0x0, // CB_COLOR2_FMASK_BASE_EXT + 0x0, // CB_COLOR2_CLEAR_WORD0 + 0x0, // CB_COLOR2_CLEAR_WORD1 + 0x0, // CB_COLOR2_DCC_BASE + 0x0, // CB_COLOR2_DCC_BASE_EXT + 0x0, // CB_COLOR3_BASE + 0x0, // CB_COLOR3_BASE_EXT + 0x0, // CB_COLOR3_ATTRIB2 + 0x0, // CB_COLOR3_VIEW + 0x0, // CB_COLOR3_INFO + 0x0, // CB_COLOR3_ATTRIB + 0x0, // CB_COLOR3_DCC_CONTROL + 0x0, // CB_COLOR3_CMASK + 0x0, // CB_COLOR3_CMASK_BASE_EXT + 0x0, // CB_COLOR3_FMASK + 0x0, // CB_COLOR3_FMASK_BASE_EXT + 0x0, // CB_COLOR3_CLEAR_WORD0 + 0x0, // CB_COLOR3_CLEAR_WORD1 + 0x0, // CB_COLOR3_DCC_BASE + 0x0, // CB_COLOR3_DCC_BASE_EXT + 0x0, // CB_COLOR4_BASE + 0x0, // CB_COLOR4_BASE_EXT + 0x0, // CB_COLOR4_ATTRIB2 + 0x0, // CB_COLOR4_VIEW + 0x0, // CB_COLOR4_INFO + 0x0, // CB_COLOR4_ATTRIB + 0x0, // CB_COLOR4_DCC_CONTROL + 0x0, // CB_COLOR4_CMASK + 0x0, // CB_COLOR4_CMASK_BASE_EXT + 0x0, // CB_COLOR4_FMASK + 0x0, // CB_COLOR4_FMASK_BASE_EXT + 0x0, // CB_COLOR4_CLEAR_WORD0 + 0x0, // CB_COLOR4_CLEAR_WORD1 + 0x0, // CB_COLOR4_DCC_BASE + 0x0, // CB_COLOR4_DCC_BASE_EXT + 0x0, // CB_COLOR5_BASE + 0x0, // CB_COLOR5_BASE_EXT + 0x0, // CB_COLOR5_ATTRIB2 + 0x0, // CB_COLOR5_VIEW + 0x0, // CB_COLOR5_INFO + 0x0, // CB_COLOR5_ATTRIB + 0x0, // CB_COLOR5_DCC_CONTROL + 0x0, // CB_COLOR5_CMASK + 0x0, // CB_COLOR5_CMASK_BASE_EXT + 0x0, // CB_COLOR5_FMASK + 0x0, // CB_COLOR5_FMASK_BASE_EXT + 0x0, // CB_COLOR5_CLEAR_WORD0 + 0x0, // CB_COLOR5_CLEAR_WORD1 + 0x0, // CB_COLOR5_DCC_BASE + 0x0, // CB_COLOR5_DCC_BASE_EXT + 0x0, // CB_COLOR6_BASE + 0x0, // CB_COLOR6_BASE_EXT + 0x0, // CB_COLOR6_ATTRIB2 + 0x0, // CB_COLOR6_VIEW + 0x0, // CB_COLOR6_INFO + 0x0, // CB_COLOR6_ATTRIB + 0x0, // CB_COLOR6_DCC_CONTROL + 0x0, // CB_COLOR6_CMASK + 0x0, // CB_COLOR6_CMASK_BASE_EXT + 0x0, // CB_COLOR6_FMASK + 0x0, // CB_COLOR6_FMASK_BASE_EXT + 0x0, // CB_COLOR6_CLEAR_WORD0 + 0x0, // CB_COLOR6_CLEAR_WORD1 + 0x0, // CB_COLOR6_DCC_BASE + 0x0, // CB_COLOR6_DCC_BASE_EXT + 0x0, // CB_COLOR7_BASE + 0x0, // CB_COLOR7_BASE_EXT + 0x0, // CB_COLOR7_ATTRIB2 + 0x0, // CB_COLOR7_VIEW + 0x0, // CB_COLOR7_INFO + 0x0, // CB_COLOR7_ATTRIB + 0x0, // CB_COLOR7_DCC_CONTROL + 0x0, // CB_COLOR7_CMASK + 0x0, // CB_COLOR7_CMASK_BASE_EXT + 0x0, // CB_COLOR7_FMASK + 0x0, // CB_COLOR7_FMASK_BASE_EXT + 0x0, // CB_COLOR7_CLEAR_WORD0 + 0x0, // CB_COLOR7_CLEAR_WORD1 + 0x0, // CB_COLOR7_DCC_BASE 0x0 // CB_COLOR7_DCC_BASE_EXT }; @@ -1500,7 +1506,8 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs, set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlGfx9)); set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Gfx9)); - set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxGfx9)); + set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, + SET(VgtMultiPrimIbResetIndxGfx9)); set_context_reg_seq_array(cs, R_028414_CB_BLEND_RED, SET(CbBlendRedGfx9)); set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Gfx9)); set_context_reg_seq_array(cs, R_028754_SX_PS_DOWNCONVERT, SET(SxPsDownconvertGfx9)); @@ -1511,13 +1518,19 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs, set_context_reg_seq_array(cs, R_028A40_VGT_GS_MODE, SET(VgtGsModeGfx9)); set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnGfx9)); 
set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetGfx9)); - set_context_reg_seq_array(cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, SET(VgtGsMaxPrimsPerSubgroupGfx9)); - set_context_reg_seq_array(cs, R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1, SET(VgtStrmoutBufferSize1Gfx9)); - set_context_reg_seq_array(cs, R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2, SET(VgtStrmoutBufferSize2Gfx9)); - set_context_reg_seq_array(cs, R_028B00_VGT_STRMOUT_BUFFER_SIZE_3, SET(VgtStrmoutBufferSize3Gfx9)); - set_context_reg_seq_array(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, SET(VgtStrmoutDrawOpaqueOffsetGfx9)); + set_context_reg_seq_array(cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SET(VgtGsMaxPrimsPerSubgroupGfx9)); + set_context_reg_seq_array(cs, R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1, + SET(VgtStrmoutBufferSize1Gfx9)); + set_context_reg_seq_array(cs, R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2, + SET(VgtStrmoutBufferSize2Gfx9)); + set_context_reg_seq_array(cs, R_028B00_VGT_STRMOUT_BUFFER_SIZE_3, + SET(VgtStrmoutBufferSize3Gfx9)); + set_context_reg_seq_array(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, + SET(VgtStrmoutDrawOpaqueOffsetGfx9)); set_context_reg_seq_array(cs, R_028B38_VGT_GS_MAX_VERT_OUT, SET(VgtGsMaxVertOutGfx9)); - set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Gfx9)); + set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, + SET(PaScCentroidPriority0Gfx9)); } /** @@ -1529,68 +1542,68 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg set_context_reg_seq_array_fn set_context_reg_seq_array) { static const uint32_t DbRenderControlNv10[] = { - 0x0 , // DB_RENDER_CONTROL - 0x0 , // DB_COUNT_CONTROL - 0x0 , // DB_DEPTH_VIEW - 0x0 , // DB_RENDER_OVERRIDE - 0x0 , // DB_RENDER_OVERRIDE2 - 0x0 , // DB_HTILE_DATA_BASE - 0x0 , // - 0x0 , // DB_DEPTH_SIZE_XY - 0x0 , // DB_DEPTH_BOUNDS_MIN - 0x0 , // DB_DEPTH_BOUNDS_MAX - 0x0 , // DB_STENCIL_CLEAR - 0x0 , // DB_DEPTH_CLEAR - 0x0 , // PA_SC_SCREEN_SCISSOR_TL + 0x0, // DB_RENDER_CONTROL + 0x0, // DB_COUNT_CONTROL + 0x0, // DB_DEPTH_VIEW + 0x0, // DB_RENDER_OVERRIDE + 0x0, // DB_RENDER_OVERRIDE2 + 0x0, // DB_HTILE_DATA_BASE + 0x0, // + 0x0, // DB_DEPTH_SIZE_XY + 0x0, // DB_DEPTH_BOUNDS_MIN + 0x0, // DB_DEPTH_BOUNDS_MAX + 0x0, // DB_STENCIL_CLEAR + 0x0, // DB_DEPTH_CLEAR + 0x0, // PA_SC_SCREEN_SCISSOR_TL 0x40004000, // PA_SC_SCREEN_SCISSOR_BR - 0x0 , // DB_DFSM_CONTROL - 0x0 , // DB_RESERVED_REG_2 - 0x0 , // DB_Z_INFO - 0x0 , // DB_STENCIL_INFO - 0x0 , // DB_Z_READ_BASE - 0x0 , // DB_STENCIL_READ_BASE - 0x0 , // DB_Z_WRITE_BASE - 0x0 , // DB_STENCIL_WRITE_BASE - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // DB_Z_READ_BASE_HI - 0x0 , // DB_STENCIL_READ_BASE_HI - 0x0 , // DB_Z_WRITE_BASE_HI - 0x0 , // DB_STENCIL_WRITE_BASE_HI - 0x0 , // DB_HTILE_DATA_BASE_HI - 0x0 , // DB_RMI_L2_CACHE_CONTROL - 0x0 , // TA_BC_BASE_ADDR + 0x0, // DB_DFSM_CONTROL + 0x0, // DB_RESERVED_REG_2 + 0x0, // DB_Z_INFO + 0x0, // DB_STENCIL_INFO + 0x0, // DB_Z_READ_BASE + 0x0, // DB_STENCIL_READ_BASE + 0x0, // DB_Z_WRITE_BASE + 0x0, // DB_STENCIL_WRITE_BASE + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // DB_Z_READ_BASE_HI + 0x0, // DB_STENCIL_READ_BASE_HI + 0x0, // DB_Z_WRITE_BASE_HI + 0x0, // DB_STENCIL_WRITE_BASE_HI + 0x0, // DB_HTILE_DATA_BASE_HI + 0x0, // DB_RMI_L2_CACHE_CONTROL + 0x0, // TA_BC_BASE_ADDR 0x0 // TA_BC_BASE_ADDR_HI }; static const uint32_t CoherDestBaseHi0Nv10[] = { - 0x0 , // COHER_DEST_BASE_HI_0 - 0x0 , // COHER_DEST_BASE_HI_1 - 0x0 , // COHER_DEST_BASE_HI_2 - 0x0 , // 
COHER_DEST_BASE_HI_3 - 0x0 , // COHER_DEST_BASE_2 - 0x0 , // COHER_DEST_BASE_3 - 0x0 , // PA_SC_WINDOW_OFFSET + 0x0, // COHER_DEST_BASE_HI_0 + 0x0, // COHER_DEST_BASE_HI_1 + 0x0, // COHER_DEST_BASE_HI_2 + 0x0, // COHER_DEST_BASE_HI_3 + 0x0, // COHER_DEST_BASE_2 + 0x0, // COHER_DEST_BASE_3 + 0x0, // PA_SC_WINDOW_OFFSET 0x80000000, // PA_SC_WINDOW_SCISSOR_TL 0x40004000, // PA_SC_WINDOW_SCISSOR_BR - 0xffff , // PA_SC_CLIPRECT_RULE - 0x0 , // PA_SC_CLIPRECT_0_TL + 0xffff, // PA_SC_CLIPRECT_RULE + 0x0, // PA_SC_CLIPRECT_0_TL 0x40004000, // PA_SC_CLIPRECT_0_BR - 0x0 , // PA_SC_CLIPRECT_1_TL + 0x0, // PA_SC_CLIPRECT_1_TL 0x40004000, // PA_SC_CLIPRECT_1_BR - 0x0 , // PA_SC_CLIPRECT_2_TL + 0x0, // PA_SC_CLIPRECT_2_TL 0x40004000, // PA_SC_CLIPRECT_2_BR - 0x0 , // PA_SC_CLIPRECT_3_TL + 0x0, // PA_SC_CLIPRECT_3_TL 0x40004000, // PA_SC_CLIPRECT_3_BR 0xaa99aaaa, // PA_SC_EDGERULE - 0x0 , // PA_SU_HARDWARE_SCREEN_OFFSET + 0x0, // PA_SU_HARDWARE_SCREEN_OFFSET 0xffffffff, // CB_TARGET_MASK 0xffffffff, // CB_SHADER_MASK 0x80000000, // PA_SC_GENERIC_SCISSOR_TL 0x40004000, // PA_SC_GENERIC_SCISSOR_BR - 0x0 , // COHER_DEST_BASE_0 - 0x0 , // COHER_DEST_BASE_1 + 0x0, // COHER_DEST_BASE_0 + 0x0, // COHER_DEST_BASE_1 0x80000000, // PA_SC_VPORT_SCISSOR_0_TL 0x40004000, // PA_SC_VPORT_SCISSOR_0_BR 0x80000000, // PA_SC_VPORT_SCISSOR_1_TL @@ -1623,583 +1636,585 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg 0x40004000, // PA_SC_VPORT_SCISSOR_14_BR 0x80000000, // PA_SC_VPORT_SCISSOR_15_TL 0x40004000, // PA_SC_VPORT_SCISSOR_15_BR - 0x0 , // PA_SC_VPORT_ZMIN_0 + 0x0, // PA_SC_VPORT_ZMIN_0 0x3f800000, // PA_SC_VPORT_ZMAX_0 - 0x0 , // PA_SC_VPORT_ZMIN_1 + 0x0, // PA_SC_VPORT_ZMIN_1 0x3f800000, // PA_SC_VPORT_ZMAX_1 - 0x0 , // PA_SC_VPORT_ZMIN_2 + 0x0, // PA_SC_VPORT_ZMIN_2 0x3f800000, // PA_SC_VPORT_ZMAX_2 - 0x0 , // PA_SC_VPORT_ZMIN_3 + 0x0, // PA_SC_VPORT_ZMIN_3 0x3f800000, // PA_SC_VPORT_ZMAX_3 - 0x0 , // PA_SC_VPORT_ZMIN_4 + 0x0, // PA_SC_VPORT_ZMIN_4 0x3f800000, // PA_SC_VPORT_ZMAX_4 - 0x0 , // PA_SC_VPORT_ZMIN_5 + 0x0, // PA_SC_VPORT_ZMIN_5 0x3f800000, // PA_SC_VPORT_ZMAX_5 - 0x0 , // PA_SC_VPORT_ZMIN_6 + 0x0, // PA_SC_VPORT_ZMIN_6 0x3f800000, // PA_SC_VPORT_ZMAX_6 - 0x0 , // PA_SC_VPORT_ZMIN_7 + 0x0, // PA_SC_VPORT_ZMIN_7 0x3f800000, // PA_SC_VPORT_ZMAX_7 - 0x0 , // PA_SC_VPORT_ZMIN_8 + 0x0, // PA_SC_VPORT_ZMIN_8 0x3f800000, // PA_SC_VPORT_ZMAX_8 - 0x0 , // PA_SC_VPORT_ZMIN_9 + 0x0, // PA_SC_VPORT_ZMIN_9 0x3f800000, // PA_SC_VPORT_ZMAX_9 - 0x0 , // PA_SC_VPORT_ZMIN_10 + 0x0, // PA_SC_VPORT_ZMIN_10 0x3f800000, // PA_SC_VPORT_ZMAX_10 - 0x0 , // PA_SC_VPORT_ZMIN_11 + 0x0, // PA_SC_VPORT_ZMIN_11 0x3f800000, // PA_SC_VPORT_ZMAX_11 - 0x0 , // PA_SC_VPORT_ZMIN_12 + 0x0, // PA_SC_VPORT_ZMIN_12 0x3f800000, // PA_SC_VPORT_ZMAX_12 - 0x0 , // PA_SC_VPORT_ZMIN_13 + 0x0, // PA_SC_VPORT_ZMIN_13 0x3f800000, // PA_SC_VPORT_ZMAX_13 - 0x0 , // PA_SC_VPORT_ZMIN_14 + 0x0, // PA_SC_VPORT_ZMIN_14 0x3f800000, // PA_SC_VPORT_ZMAX_14 - 0x0 , // PA_SC_VPORT_ZMIN_15 + 0x0, // PA_SC_VPORT_ZMIN_15 0x3f800000, // PA_SC_VPORT_ZMAX_15 - 0x0 , // PA_SC_RASTER_CONFIG - 0x0 , // PA_SC_RASTER_CONFIG_1 - 0x0 , // + 0x0, // PA_SC_RASTER_CONFIG + 0x0, // PA_SC_RASTER_CONFIG_1 + 0x0, // 0x0 // PA_SC_TILE_STEERING_OVERRIDE }; static const uint32_t VgtMultiPrimIbResetIndxNv10[] = { - 0x0 , // VGT_MULTI_PRIM_IB_RESET_INDX - 0x0 , // CB_RMI_GL2_CACHE_CONTROL - 0x0 , // CB_BLEND_RED - 0x0 , // CB_BLEND_GREEN - 0x0 , // CB_BLEND_BLUE - 0x0 , // CB_BLEND_ALPHA - 0x0 , // CB_DCC_CONTROL - 0x0 , // CB_COVERAGE_OUT_CONTROL - 0x0 
, // DB_STENCIL_CONTROL - 0x1000000 , // DB_STENCILREFMASK - 0x1000000 , // DB_STENCILREFMASK_BF - 0x0 , // - 0x0 , // PA_CL_VPORT_XSCALE - 0x0 , // PA_CL_VPORT_XOFFSET - 0x0 , // PA_CL_VPORT_YSCALE - 0x0 , // PA_CL_VPORT_YOFFSET - 0x0 , // PA_CL_VPORT_ZSCALE - 0x0 , // PA_CL_VPORT_ZOFFSET - 0x0 , // PA_CL_VPORT_XSCALE_1 - 0x0 , // PA_CL_VPORT_XOFFSET_1 - 0x0 , // PA_CL_VPORT_YSCALE_1 - 0x0 , // PA_CL_VPORT_YOFFSET_1 - 0x0 , // PA_CL_VPORT_ZSCALE_1 - 0x0 , // PA_CL_VPORT_ZOFFSET_1 - 0x0 , // PA_CL_VPORT_XSCALE_2 - 0x0 , // PA_CL_VPORT_XOFFSET_2 - 0x0 , // PA_CL_VPORT_YSCALE_2 - 0x0 , // PA_CL_VPORT_YOFFSET_2 - 0x0 , // PA_CL_VPORT_ZSCALE_2 - 0x0 , // PA_CL_VPORT_ZOFFSET_2 - 0x0 , // PA_CL_VPORT_XSCALE_3 - 0x0 , // PA_CL_VPORT_XOFFSET_3 - 0x0 , // PA_CL_VPORT_YSCALE_3 - 0x0 , // PA_CL_VPORT_YOFFSET_3 - 0x0 , // PA_CL_VPORT_ZSCALE_3 - 0x0 , // PA_CL_VPORT_ZOFFSET_3 - 0x0 , // PA_CL_VPORT_XSCALE_4 - 0x0 , // PA_CL_VPORT_XOFFSET_4 - 0x0 , // PA_CL_VPORT_YSCALE_4 - 0x0 , // PA_CL_VPORT_YOFFSET_4 - 0x0 , // PA_CL_VPORT_ZSCALE_4 - 0x0 , // PA_CL_VPORT_ZOFFSET_4 - 0x0 , // PA_CL_VPORT_XSCALE_5 - 0x0 , // PA_CL_VPORT_XOFFSET_5 - 0x0 , // PA_CL_VPORT_YSCALE_5 - 0x0 , // PA_CL_VPORT_YOFFSET_5 - 0x0 , // PA_CL_VPORT_ZSCALE_5 - 0x0 , // PA_CL_VPORT_ZOFFSET_5 - 0x0 , // PA_CL_VPORT_XSCALE_6 - 0x0 , // PA_CL_VPORT_XOFFSET_6 - 0x0 , // PA_CL_VPORT_YSCALE_6 - 0x0 , // PA_CL_VPORT_YOFFSET_6 - 0x0 , // PA_CL_VPORT_ZSCALE_6 - 0x0 , // PA_CL_VPORT_ZOFFSET_6 - 0x0 , // PA_CL_VPORT_XSCALE_7 - 0x0 , // PA_CL_VPORT_XOFFSET_7 - 0x0 , // PA_CL_VPORT_YSCALE_7 - 0x0 , // PA_CL_VPORT_YOFFSET_7 - 0x0 , // PA_CL_VPORT_ZSCALE_7 - 0x0 , // PA_CL_VPORT_ZOFFSET_7 - 0x0 , // PA_CL_VPORT_XSCALE_8 - 0x0 , // PA_CL_VPORT_XOFFSET_8 - 0x0 , // PA_CL_VPORT_YSCALE_8 - 0x0 , // PA_CL_VPORT_YOFFSET_8 - 0x0 , // PA_CL_VPORT_ZSCALE_8 - 0x0 , // PA_CL_VPORT_ZOFFSET_8 - 0x0 , // PA_CL_VPORT_XSCALE_9 - 0x0 , // PA_CL_VPORT_XOFFSET_9 - 0x0 , // PA_CL_VPORT_YSCALE_9 - 0x0 , // PA_CL_VPORT_YOFFSET_9 - 0x0 , // PA_CL_VPORT_ZSCALE_9 - 0x0 , // PA_CL_VPORT_ZOFFSET_9 - 0x0 , // PA_CL_VPORT_XSCALE_10 - 0x0 , // PA_CL_VPORT_XOFFSET_10 - 0x0 , // PA_CL_VPORT_YSCALE_10 - 0x0 , // PA_CL_VPORT_YOFFSET_10 - 0x0 , // PA_CL_VPORT_ZSCALE_10 - 0x0 , // PA_CL_VPORT_ZOFFSET_10 - 0x0 , // PA_CL_VPORT_XSCALE_11 - 0x0 , // PA_CL_VPORT_XOFFSET_11 - 0x0 , // PA_CL_VPORT_YSCALE_11 - 0x0 , // PA_CL_VPORT_YOFFSET_11 - 0x0 , // PA_CL_VPORT_ZSCALE_11 - 0x0 , // PA_CL_VPORT_ZOFFSET_11 - 0x0 , // PA_CL_VPORT_XSCALE_12 - 0x0 , // PA_CL_VPORT_XOFFSET_12 - 0x0 , // PA_CL_VPORT_YSCALE_12 - 0x0 , // PA_CL_VPORT_YOFFSET_12 - 0x0 , // PA_CL_VPORT_ZSCALE_12 - 0x0 , // PA_CL_VPORT_ZOFFSET_12 - 0x0 , // PA_CL_VPORT_XSCALE_13 - 0x0 , // PA_CL_VPORT_XOFFSET_13 - 0x0 , // PA_CL_VPORT_YSCALE_13 - 0x0 , // PA_CL_VPORT_YOFFSET_13 - 0x0 , // PA_CL_VPORT_ZSCALE_13 - 0x0 , // PA_CL_VPORT_ZOFFSET_13 - 0x0 , // PA_CL_VPORT_XSCALE_14 - 0x0 , // PA_CL_VPORT_XOFFSET_14 - 0x0 , // PA_CL_VPORT_YSCALE_14 - 0x0 , // PA_CL_VPORT_YOFFSET_14 - 0x0 , // PA_CL_VPORT_ZSCALE_14 - 0x0 , // PA_CL_VPORT_ZOFFSET_14 - 0x0 , // PA_CL_VPORT_XSCALE_15 - 0x0 , // PA_CL_VPORT_XOFFSET_15 - 0x0 , // PA_CL_VPORT_YSCALE_15 - 0x0 , // PA_CL_VPORT_YOFFSET_15 - 0x0 , // PA_CL_VPORT_ZSCALE_15 - 0x0 , // PA_CL_VPORT_ZOFFSET_15 - 0x0 , // PA_CL_UCP_0_X - 0x0 , // PA_CL_UCP_0_Y - 0x0 , // PA_CL_UCP_0_Z - 0x0 , // PA_CL_UCP_0_W - 0x0 , // PA_CL_UCP_1_X - 0x0 , // PA_CL_UCP_1_Y - 0x0 , // PA_CL_UCP_1_Z - 0x0 , // PA_CL_UCP_1_W - 0x0 , // PA_CL_UCP_2_X - 0x0 , // PA_CL_UCP_2_Y - 0x0 , // PA_CL_UCP_2_Z - 0x0 , // PA_CL_UCP_2_W - 
0x0 , // PA_CL_UCP_3_X - 0x0 , // PA_CL_UCP_3_Y - 0x0 , // PA_CL_UCP_3_Z - 0x0 , // PA_CL_UCP_3_W - 0x0 , // PA_CL_UCP_4_X - 0x0 , // PA_CL_UCP_4_Y - 0x0 , // PA_CL_UCP_4_Z - 0x0 , // PA_CL_UCP_4_W - 0x0 , // PA_CL_UCP_5_X - 0x0 , // PA_CL_UCP_5_Y - 0x0 , // PA_CL_UCP_5_Z - 0x0 // PA_CL_UCP_5_W + 0x0, // VGT_MULTI_PRIM_IB_RESET_INDX + 0x0, // CB_RMI_GL2_CACHE_CONTROL + 0x0, // CB_BLEND_RED + 0x0, // CB_BLEND_GREEN + 0x0, // CB_BLEND_BLUE + 0x0, // CB_BLEND_ALPHA + 0x0, // CB_DCC_CONTROL + 0x0, // CB_COVERAGE_OUT_CONTROL + 0x0, // DB_STENCIL_CONTROL + 0x1000000, // DB_STENCILREFMASK + 0x1000000, // DB_STENCILREFMASK_BF + 0x0, // + 0x0, // PA_CL_VPORT_XSCALE + 0x0, // PA_CL_VPORT_XOFFSET + 0x0, // PA_CL_VPORT_YSCALE + 0x0, // PA_CL_VPORT_YOFFSET + 0x0, // PA_CL_VPORT_ZSCALE + 0x0, // PA_CL_VPORT_ZOFFSET + 0x0, // PA_CL_VPORT_XSCALE_1 + 0x0, // PA_CL_VPORT_XOFFSET_1 + 0x0, // PA_CL_VPORT_YSCALE_1 + 0x0, // PA_CL_VPORT_YOFFSET_1 + 0x0, // PA_CL_VPORT_ZSCALE_1 + 0x0, // PA_CL_VPORT_ZOFFSET_1 + 0x0, // PA_CL_VPORT_XSCALE_2 + 0x0, // PA_CL_VPORT_XOFFSET_2 + 0x0, // PA_CL_VPORT_YSCALE_2 + 0x0, // PA_CL_VPORT_YOFFSET_2 + 0x0, // PA_CL_VPORT_ZSCALE_2 + 0x0, // PA_CL_VPORT_ZOFFSET_2 + 0x0, // PA_CL_VPORT_XSCALE_3 + 0x0, // PA_CL_VPORT_XOFFSET_3 + 0x0, // PA_CL_VPORT_YSCALE_3 + 0x0, // PA_CL_VPORT_YOFFSET_3 + 0x0, // PA_CL_VPORT_ZSCALE_3 + 0x0, // PA_CL_VPORT_ZOFFSET_3 + 0x0, // PA_CL_VPORT_XSCALE_4 + 0x0, // PA_CL_VPORT_XOFFSET_4 + 0x0, // PA_CL_VPORT_YSCALE_4 + 0x0, // PA_CL_VPORT_YOFFSET_4 + 0x0, // PA_CL_VPORT_ZSCALE_4 + 0x0, // PA_CL_VPORT_ZOFFSET_4 + 0x0, // PA_CL_VPORT_XSCALE_5 + 0x0, // PA_CL_VPORT_XOFFSET_5 + 0x0, // PA_CL_VPORT_YSCALE_5 + 0x0, // PA_CL_VPORT_YOFFSET_5 + 0x0, // PA_CL_VPORT_ZSCALE_5 + 0x0, // PA_CL_VPORT_ZOFFSET_5 + 0x0, // PA_CL_VPORT_XSCALE_6 + 0x0, // PA_CL_VPORT_XOFFSET_6 + 0x0, // PA_CL_VPORT_YSCALE_6 + 0x0, // PA_CL_VPORT_YOFFSET_6 + 0x0, // PA_CL_VPORT_ZSCALE_6 + 0x0, // PA_CL_VPORT_ZOFFSET_6 + 0x0, // PA_CL_VPORT_XSCALE_7 + 0x0, // PA_CL_VPORT_XOFFSET_7 + 0x0, // PA_CL_VPORT_YSCALE_7 + 0x0, // PA_CL_VPORT_YOFFSET_7 + 0x0, // PA_CL_VPORT_ZSCALE_7 + 0x0, // PA_CL_VPORT_ZOFFSET_7 + 0x0, // PA_CL_VPORT_XSCALE_8 + 0x0, // PA_CL_VPORT_XOFFSET_8 + 0x0, // PA_CL_VPORT_YSCALE_8 + 0x0, // PA_CL_VPORT_YOFFSET_8 + 0x0, // PA_CL_VPORT_ZSCALE_8 + 0x0, // PA_CL_VPORT_ZOFFSET_8 + 0x0, // PA_CL_VPORT_XSCALE_9 + 0x0, // PA_CL_VPORT_XOFFSET_9 + 0x0, // PA_CL_VPORT_YSCALE_9 + 0x0, // PA_CL_VPORT_YOFFSET_9 + 0x0, // PA_CL_VPORT_ZSCALE_9 + 0x0, // PA_CL_VPORT_ZOFFSET_9 + 0x0, // PA_CL_VPORT_XSCALE_10 + 0x0, // PA_CL_VPORT_XOFFSET_10 + 0x0, // PA_CL_VPORT_YSCALE_10 + 0x0, // PA_CL_VPORT_YOFFSET_10 + 0x0, // PA_CL_VPORT_ZSCALE_10 + 0x0, // PA_CL_VPORT_ZOFFSET_10 + 0x0, // PA_CL_VPORT_XSCALE_11 + 0x0, // PA_CL_VPORT_XOFFSET_11 + 0x0, // PA_CL_VPORT_YSCALE_11 + 0x0, // PA_CL_VPORT_YOFFSET_11 + 0x0, // PA_CL_VPORT_ZSCALE_11 + 0x0, // PA_CL_VPORT_ZOFFSET_11 + 0x0, // PA_CL_VPORT_XSCALE_12 + 0x0, // PA_CL_VPORT_XOFFSET_12 + 0x0, // PA_CL_VPORT_YSCALE_12 + 0x0, // PA_CL_VPORT_YOFFSET_12 + 0x0, // PA_CL_VPORT_ZSCALE_12 + 0x0, // PA_CL_VPORT_ZOFFSET_12 + 0x0, // PA_CL_VPORT_XSCALE_13 + 0x0, // PA_CL_VPORT_XOFFSET_13 + 0x0, // PA_CL_VPORT_YSCALE_13 + 0x0, // PA_CL_VPORT_YOFFSET_13 + 0x0, // PA_CL_VPORT_ZSCALE_13 + 0x0, // PA_CL_VPORT_ZOFFSET_13 + 0x0, // PA_CL_VPORT_XSCALE_14 + 0x0, // PA_CL_VPORT_XOFFSET_14 + 0x0, // PA_CL_VPORT_YSCALE_14 + 0x0, // PA_CL_VPORT_YOFFSET_14 + 0x0, // PA_CL_VPORT_ZSCALE_14 + 0x0, // PA_CL_VPORT_ZOFFSET_14 + 0x0, // PA_CL_VPORT_XSCALE_15 + 0x0, // PA_CL_VPORT_XOFFSET_15 
+ 0x0, // PA_CL_VPORT_YSCALE_15 + 0x0, // PA_CL_VPORT_YOFFSET_15 + 0x0, // PA_CL_VPORT_ZSCALE_15 + 0x0, // PA_CL_VPORT_ZOFFSET_15 + 0x0, // PA_CL_UCP_0_X + 0x0, // PA_CL_UCP_0_Y + 0x0, // PA_CL_UCP_0_Z + 0x0, // PA_CL_UCP_0_W + 0x0, // PA_CL_UCP_1_X + 0x0, // PA_CL_UCP_1_Y + 0x0, // PA_CL_UCP_1_Z + 0x0, // PA_CL_UCP_1_W + 0x0, // PA_CL_UCP_2_X + 0x0, // PA_CL_UCP_2_Y + 0x0, // PA_CL_UCP_2_Z + 0x0, // PA_CL_UCP_2_W + 0x0, // PA_CL_UCP_3_X + 0x0, // PA_CL_UCP_3_Y + 0x0, // PA_CL_UCP_3_Z + 0x0, // PA_CL_UCP_3_W + 0x0, // PA_CL_UCP_4_X + 0x0, // PA_CL_UCP_4_Y + 0x0, // PA_CL_UCP_4_Z + 0x0, // PA_CL_UCP_4_W + 0x0, // PA_CL_UCP_5_X + 0x0, // PA_CL_UCP_5_Y + 0x0, // PA_CL_UCP_5_Z + 0x0 // PA_CL_UCP_5_W }; static const uint32_t SpiPsInputCntl0Nv10[] = { - 0x0 , // SPI_PS_INPUT_CNTL_0 - 0x0 , // SPI_PS_INPUT_CNTL_1 - 0x0 , // SPI_PS_INPUT_CNTL_2 - 0x0 , // SPI_PS_INPUT_CNTL_3 - 0x0 , // SPI_PS_INPUT_CNTL_4 - 0x0 , // SPI_PS_INPUT_CNTL_5 - 0x0 , // SPI_PS_INPUT_CNTL_6 - 0x0 , // SPI_PS_INPUT_CNTL_7 - 0x0 , // SPI_PS_INPUT_CNTL_8 - 0x0 , // SPI_PS_INPUT_CNTL_9 - 0x0 , // SPI_PS_INPUT_CNTL_10 - 0x0 , // SPI_PS_INPUT_CNTL_11 - 0x0 , // SPI_PS_INPUT_CNTL_12 - 0x0 , // SPI_PS_INPUT_CNTL_13 - 0x0 , // SPI_PS_INPUT_CNTL_14 - 0x0 , // SPI_PS_INPUT_CNTL_15 - 0x0 , // SPI_PS_INPUT_CNTL_16 - 0x0 , // SPI_PS_INPUT_CNTL_17 - 0x0 , // SPI_PS_INPUT_CNTL_18 - 0x0 , // SPI_PS_INPUT_CNTL_19 - 0x0 , // SPI_PS_INPUT_CNTL_20 - 0x0 , // SPI_PS_INPUT_CNTL_21 - 0x0 , // SPI_PS_INPUT_CNTL_22 - 0x0 , // SPI_PS_INPUT_CNTL_23 - 0x0 , // SPI_PS_INPUT_CNTL_24 - 0x0 , // SPI_PS_INPUT_CNTL_25 - 0x0 , // SPI_PS_INPUT_CNTL_26 - 0x0 , // SPI_PS_INPUT_CNTL_27 - 0x0 , // SPI_PS_INPUT_CNTL_28 - 0x0 , // SPI_PS_INPUT_CNTL_29 - 0x0 , // SPI_PS_INPUT_CNTL_30 - 0x0 , // SPI_PS_INPUT_CNTL_31 - 0x0 , // SPI_VS_OUT_CONFIG - 0x0 , // - 0x0 , // SPI_PS_INPUT_ENA - 0x0 , // SPI_PS_INPUT_ADDR - 0x0 , // SPI_INTERP_CONTROL_0 - 0x2 , // SPI_PS_IN_CONTROL - 0x0 , // - 0x0 , // SPI_BARYC_CNTL - 0x0 , // - 0x0 , // SPI_TMPRING_SIZE - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // SPI_SHADER_IDX_FORMAT - 0x0 , // SPI_SHADER_POS_FORMAT - 0x0 , // SPI_SHADER_Z_FORMAT - 0x0 // SPI_SHADER_COL_FORMAT + 0x0, // SPI_PS_INPUT_CNTL_0 + 0x0, // SPI_PS_INPUT_CNTL_1 + 0x0, // SPI_PS_INPUT_CNTL_2 + 0x0, // SPI_PS_INPUT_CNTL_3 + 0x0, // SPI_PS_INPUT_CNTL_4 + 0x0, // SPI_PS_INPUT_CNTL_5 + 0x0, // SPI_PS_INPUT_CNTL_6 + 0x0, // SPI_PS_INPUT_CNTL_7 + 0x0, // SPI_PS_INPUT_CNTL_8 + 0x0, // SPI_PS_INPUT_CNTL_9 + 0x0, // SPI_PS_INPUT_CNTL_10 + 0x0, // SPI_PS_INPUT_CNTL_11 + 0x0, // SPI_PS_INPUT_CNTL_12 + 0x0, // SPI_PS_INPUT_CNTL_13 + 0x0, // SPI_PS_INPUT_CNTL_14 + 0x0, // SPI_PS_INPUT_CNTL_15 + 0x0, // SPI_PS_INPUT_CNTL_16 + 0x0, // SPI_PS_INPUT_CNTL_17 + 0x0, // SPI_PS_INPUT_CNTL_18 + 0x0, // SPI_PS_INPUT_CNTL_19 + 0x0, // SPI_PS_INPUT_CNTL_20 + 0x0, // SPI_PS_INPUT_CNTL_21 + 0x0, // SPI_PS_INPUT_CNTL_22 + 0x0, // SPI_PS_INPUT_CNTL_23 + 0x0, // SPI_PS_INPUT_CNTL_24 + 0x0, // SPI_PS_INPUT_CNTL_25 + 0x0, // SPI_PS_INPUT_CNTL_26 + 0x0, // SPI_PS_INPUT_CNTL_27 + 0x0, // SPI_PS_INPUT_CNTL_28 + 0x0, // SPI_PS_INPUT_CNTL_29 + 0x0, // SPI_PS_INPUT_CNTL_30 + 0x0, // SPI_PS_INPUT_CNTL_31 + 0x0, // SPI_VS_OUT_CONFIG + 0x0, // + 0x0, // SPI_PS_INPUT_ENA + 0x0, // SPI_PS_INPUT_ADDR + 0x0, // SPI_INTERP_CONTROL_0 + 0x2, // SPI_PS_IN_CONTROL + 0x0, // + 0x0, // SPI_BARYC_CNTL + 0x0, // + 0x0, // SPI_TMPRING_SIZE + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // SPI_SHADER_IDX_FORMAT + 0x0, // SPI_SHADER_POS_FORMAT + 
0x0, // SPI_SHADER_Z_FORMAT + 0x0 // SPI_SHADER_COL_FORMAT }; static const uint32_t SxPsDownconvertNv10[] = { - 0x0 , // SX_PS_DOWNCONVERT - 0x0 , // SX_BLEND_OPT_EPSILON - 0x0 , // SX_BLEND_OPT_CONTROL - 0x0 , // SX_MRT0_BLEND_OPT - 0x0 , // SX_MRT1_BLEND_OPT - 0x0 , // SX_MRT2_BLEND_OPT - 0x0 , // SX_MRT3_BLEND_OPT - 0x0 , // SX_MRT4_BLEND_OPT - 0x0 , // SX_MRT5_BLEND_OPT - 0x0 , // SX_MRT6_BLEND_OPT - 0x0 , // SX_MRT7_BLEND_OPT - 0x0 , // CB_BLEND0_CONTROL - 0x0 , // CB_BLEND1_CONTROL - 0x0 , // CB_BLEND2_CONTROL - 0x0 , // CB_BLEND3_CONTROL - 0x0 , // CB_BLEND4_CONTROL - 0x0 , // CB_BLEND5_CONTROL - 0x0 , // CB_BLEND6_CONTROL - 0x0 // CB_BLEND7_CONTROL + 0x0, // SX_PS_DOWNCONVERT + 0x0, // SX_BLEND_OPT_EPSILON + 0x0, // SX_BLEND_OPT_CONTROL + 0x0, // SX_MRT0_BLEND_OPT + 0x0, // SX_MRT1_BLEND_OPT + 0x0, // SX_MRT2_BLEND_OPT + 0x0, // SX_MRT3_BLEND_OPT + 0x0, // SX_MRT4_BLEND_OPT + 0x0, // SX_MRT5_BLEND_OPT + 0x0, // SX_MRT6_BLEND_OPT + 0x0, // SX_MRT7_BLEND_OPT + 0x0, // CB_BLEND0_CONTROL + 0x0, // CB_BLEND1_CONTROL + 0x0, // CB_BLEND2_CONTROL + 0x0, // CB_BLEND3_CONTROL + 0x0, // CB_BLEND4_CONTROL + 0x0, // CB_BLEND5_CONTROL + 0x0, // CB_BLEND6_CONTROL + 0x0 // CB_BLEND7_CONTROL }; static const uint32_t GeMaxOutputPerSubgroupNv10[] = { - 0x0 , // GE_MAX_OUTPUT_PER_SUBGROUP - 0x0 , // DB_DEPTH_CONTROL - 0x0 , // DB_EQAA - 0x0 , // CB_COLOR_CONTROL - 0x0 , // DB_SHADER_CONTROL - 0x90000 , // PA_CL_CLIP_CNTL - 0x4 , // PA_SU_SC_MODE_CNTL - 0x0 , // PA_CL_VTE_CNTL - 0x0 , // PA_CL_VS_OUT_CNTL - 0x0 // PA_CL_NANINF_CNTL + 0x0, // GE_MAX_OUTPUT_PER_SUBGROUP + 0x0, // DB_DEPTH_CONTROL + 0x0, // DB_EQAA + 0x0, // CB_COLOR_CONTROL + 0x0, // DB_SHADER_CONTROL + 0x90000, // PA_CL_CLIP_CNTL + 0x4, // PA_SU_SC_MODE_CNTL + 0x0, // PA_CL_VTE_CNTL + 0x0, // PA_CL_VS_OUT_CNTL + 0x0 // PA_CL_NANINF_CNTL }; static const uint32_t PaSuPrimFilterCntlNv10[] = { - 0x0 , // PA_SU_PRIM_FILTER_CNTL - 0x0 , // PA_SU_SMALL_PRIM_FILTER_CNTL - 0x0 , // PA_CL_OBJPRIM_ID_CNTL - 0x0 , // PA_CL_NGG_CNTL - 0x0 , // PA_SU_OVER_RASTERIZATION_CNTL - 0x0 , // PA_STEREO_CNTL - 0x0 // PA_STATE_STEREO_X + 0x0, // PA_SU_PRIM_FILTER_CNTL + 0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL + 0x0, // PA_CL_OBJPRIM_ID_CNTL + 0x0, // PA_CL_NGG_CNTL + 0x0, // PA_SU_OVER_RASTERIZATION_CNTL + 0x0, // PA_STEREO_CNTL + 0x0 // PA_STATE_STEREO_X }; static const uint32_t PaSuPointSizeNv10[] = { - 0x0 , // PA_SU_POINT_SIZE - 0x0 , // PA_SU_POINT_MINMAX - 0x0 , // PA_SU_LINE_CNTL - 0x0 // PA_SC_LINE_STIPPLE + 0x0, // PA_SU_POINT_SIZE + 0x0, // PA_SU_POINT_MINMAX + 0x0, // PA_SU_LINE_CNTL + 0x0 // PA_SC_LINE_STIPPLE }; static const uint32_t VgtHosMaxTessLevelNv10[] = { - 0x0 , // VGT_HOS_MAX_TESS_LEVEL - 0x0 // VGT_HOS_MIN_TESS_LEVEL + 0x0, // VGT_HOS_MAX_TESS_LEVEL + 0x0 // VGT_HOS_MIN_TESS_LEVEL }; static const uint32_t VgtGsModeNv10[] = { - 0x0 , // VGT_GS_MODE - 0x0 , // VGT_GS_ONCHIP_CNTL - 0x0 , // PA_SC_MODE_CNTL_0 - 0x0 , // PA_SC_MODE_CNTL_1 - 0x0 , // VGT_ENHANCE - 0x100 , // VGT_GS_PER_ES - 0x80 , // VGT_ES_PER_GS - 0x2 , // VGT_GS_PER_VS - 0x0 , // VGT_GSVS_RING_OFFSET_1 - 0x0 , // VGT_GSVS_RING_OFFSET_2 - 0x0 , // VGT_GSVS_RING_OFFSET_3 - 0x0 // VGT_GS_OUT_PRIM_TYPE + 0x0, // VGT_GS_MODE + 0x0, // VGT_GS_ONCHIP_CNTL + 0x0, // PA_SC_MODE_CNTL_0 + 0x0, // PA_SC_MODE_CNTL_1 + 0x0, // VGT_ENHANCE + 0x100, // VGT_GS_PER_ES + 0x80, // VGT_ES_PER_GS + 0x2, // VGT_GS_PER_VS + 0x0, // VGT_GSVS_RING_OFFSET_1 + 0x0, // VGT_GSVS_RING_OFFSET_2 + 0x0, // VGT_GSVS_RING_OFFSET_3 + 0x0 // VGT_GS_OUT_PRIM_TYPE }; static const uint32_t VgtPrimitiveidEnNv10[] = { 
- 0x0 // VGT_PRIMITIVEID_EN + 0x0 // VGT_PRIMITIVEID_EN }; static const uint32_t VgtPrimitiveidResetNv10[] = { - 0x0 // VGT_PRIMITIVEID_RESET + 0x0 // VGT_PRIMITIVEID_RESET }; static const uint32_t VgtDrawPayloadCntlNv10[] = { - 0x0 , // VGT_DRAW_PAYLOAD_CNTL - 0x0 , // - 0x0 , // VGT_INSTANCE_STEP_RATE_0 - 0x0 , // VGT_INSTANCE_STEP_RATE_1 - 0x0 , // IA_MULTI_VGT_PARAM - 0x0 , // VGT_ESGS_RING_ITEMSIZE - 0x0 , // VGT_GSVS_RING_ITEMSIZE - 0x0 , // VGT_REUSE_OFF - 0x0 , // VGT_VTX_CNT_EN - 0x0 , // DB_HTILE_SURFACE - 0x0 , // DB_SRESULTS_COMPARE_STATE0 - 0x0 , // DB_SRESULTS_COMPARE_STATE1 - 0x0 , // DB_PRELOAD_CONTROL - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_0 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_0 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_0 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_1 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_1 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_1 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_2 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_2 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_2 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_3 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_3 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_3 - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - 0x0 , // - 0x0 , // VGT_GS_MAX_VERT_OUT - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // GE_NGG_SUBGRP_CNTL - 0x0 , // VGT_TESS_DISTRIBUTION - 0x0 , // VGT_SHADER_STAGES_EN - 0x0 , // VGT_LS_HS_CONFIG - 0x0 , // VGT_GS_VERT_ITEMSIZE - 0x0 , // VGT_GS_VERT_ITEMSIZE_1 - 0x0 , // VGT_GS_VERT_ITEMSIZE_2 - 0x0 , // VGT_GS_VERT_ITEMSIZE_3 - 0x0 , // VGT_TF_PARAM - 0x0 , // DB_ALPHA_TO_MASK - 0x0 , // VGT_DISPATCH_DRAW_INDEX - 0x0 , // PA_SU_POLY_OFFSET_DB_FMT_CNTL - 0x0 , // PA_SU_POLY_OFFSET_CLAMP - 0x0 , // PA_SU_POLY_OFFSET_FRONT_SCALE - 0x0 , // PA_SU_POLY_OFFSET_FRONT_OFFSET - 0x0 , // PA_SU_POLY_OFFSET_BACK_SCALE - 0x0 , // PA_SU_POLY_OFFSET_BACK_OFFSET - 0x0 , // VGT_GS_INSTANCE_CNT - 0x0 , // VGT_STRMOUT_CONFIG - 0x0 // VGT_STRMOUT_BUFFER_CONFIG + 0x0, // VGT_DRAW_PAYLOAD_CNTL + 0x0, // + 0x0, // VGT_INSTANCE_STEP_RATE_0 + 0x0, // VGT_INSTANCE_STEP_RATE_1 + 0x0, // IA_MULTI_VGT_PARAM + 0x0, // VGT_ESGS_RING_ITEMSIZE + 0x0, // VGT_GSVS_RING_ITEMSIZE + 0x0, // VGT_REUSE_OFF + 0x0, // VGT_VTX_CNT_EN + 0x0, // DB_HTILE_SURFACE + 0x0, // DB_SRESULTS_COMPARE_STATE0 + 0x0, // DB_SRESULTS_COMPARE_STATE1 + 0x0, // DB_PRELOAD_CONTROL + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_SIZE_0 + 0x0, // VGT_STRMOUT_VTX_STRIDE_0 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_0 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_1 + 0x0, // VGT_STRMOUT_VTX_STRIDE_1 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_1 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_2 + 0x0, // VGT_STRMOUT_VTX_STRIDE_2 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_2 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_3 + 0x0, // VGT_STRMOUT_VTX_STRIDE_3 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_3 + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE + 0x0, // + 0x0, // VGT_GS_MAX_VERT_OUT + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // GE_NGG_SUBGRP_CNTL + 0x0, // VGT_TESS_DISTRIBUTION + 0x0, // VGT_SHADER_STAGES_EN + 0x0, // VGT_LS_HS_CONFIG + 0x0, // VGT_GS_VERT_ITEMSIZE + 0x0, // VGT_GS_VERT_ITEMSIZE_1 + 0x0, // VGT_GS_VERT_ITEMSIZE_2 + 0x0, // VGT_GS_VERT_ITEMSIZE_3 + 0x0, // VGT_TF_PARAM + 0x0, // 
DB_ALPHA_TO_MASK + 0x0, // VGT_DISPATCH_DRAW_INDEX + 0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL + 0x0, // PA_SU_POLY_OFFSET_CLAMP + 0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE + 0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET + 0x0, // PA_SU_POLY_OFFSET_BACK_SCALE + 0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET + 0x0, // VGT_GS_INSTANCE_CNT + 0x0, // VGT_STRMOUT_CONFIG + 0x0 // VGT_STRMOUT_BUFFER_CONFIG }; static const uint32_t PaScCentroidPriority0Nv10[] = { - 0x0 , // PA_SC_CENTROID_PRIORITY_0 - 0x0 , // PA_SC_CENTROID_PRIORITY_1 - 0x1000 , // PA_SC_LINE_CNTL - 0x0 , // PA_SC_AA_CONFIG - 0x5 , // PA_SU_VTX_CNTL + 0x0, // PA_SC_CENTROID_PRIORITY_0 + 0x0, // PA_SC_CENTROID_PRIORITY_1 + 0x1000, // PA_SC_LINE_CNTL + 0x0, // PA_SC_AA_CONFIG + 0x5, // PA_SU_VTX_CNTL 0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ 0x3f800000, // PA_CL_GB_VERT_DISC_ADJ 0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ 0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0 0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1 - 0x0 , // PA_SC_SHADER_CONTROL - 0x3 , // PA_SC_BINNER_CNTL_0 - 0x0 , // PA_SC_BINNER_CNTL_1 - 0x100000 , // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL - 0x0 , // PA_SC_NGG_MODE_CNTL - 0x0 , // - 0x1e , // VGT_VERTEX_REUSE_BLOCK_CNTL - 0x20 , // VGT_OUT_DEALLOC_CNTL - 0x0 , // CB_COLOR0_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR0_VIEW - 0x0 , // CB_COLOR0_INFO - 0x0 , // CB_COLOR0_ATTRIB - 0x0 , // CB_COLOR0_DCC_CONTROL - 0x0 , // CB_COLOR0_CMASK - 0x0 , // - 0x0 , // CB_COLOR0_FMASK - 0x0 , // - 0x0 , // CB_COLOR0_CLEAR_WORD0 - 0x0 , // CB_COLOR0_CLEAR_WORD1 - 0x0 , // CB_COLOR0_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR1_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR1_VIEW - 0x0 , // CB_COLOR1_INFO - 0x0 , // CB_COLOR1_ATTRIB - 0x0 , // CB_COLOR1_DCC_CONTROL - 0x0 , // CB_COLOR1_CMASK - 0x0 , // - 0x0 , // CB_COLOR1_FMASK - 0x0 , // - 0x0 , // CB_COLOR1_CLEAR_WORD0 - 0x0 , // CB_COLOR1_CLEAR_WORD1 - 0x0 , // CB_COLOR1_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR2_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR2_VIEW - 0x0 , // CB_COLOR2_INFO - 0x0 , // CB_COLOR2_ATTRIB - 0x0 , // CB_COLOR2_DCC_CONTROL - 0x0 , // CB_COLOR2_CMASK - 0x0 , // - 0x0 
, // CB_COLOR2_FMASK - 0x0 , // - 0x0 , // CB_COLOR2_CLEAR_WORD0 - 0x0 , // CB_COLOR2_CLEAR_WORD1 - 0x0 , // CB_COLOR2_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR3_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR3_VIEW - 0x0 , // CB_COLOR3_INFO - 0x0 , // CB_COLOR3_ATTRIB - 0x0 , // CB_COLOR3_DCC_CONTROL - 0x0 , // CB_COLOR3_CMASK - 0x0 , // - 0x0 , // CB_COLOR3_FMASK - 0x0 , // - 0x0 , // CB_COLOR3_CLEAR_WORD0 - 0x0 , // CB_COLOR3_CLEAR_WORD1 - 0x0 , // CB_COLOR3_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR4_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR4_VIEW - 0x0 , // CB_COLOR4_INFO - 0x0 , // CB_COLOR4_ATTRIB - 0x0 , // CB_COLOR4_DCC_CONTROL - 0x0 , // CB_COLOR4_CMASK - 0x0 , // - 0x0 , // CB_COLOR4_FMASK - 0x0 , // - 0x0 , // CB_COLOR4_CLEAR_WORD0 - 0x0 , // CB_COLOR4_CLEAR_WORD1 - 0x0 , // CB_COLOR4_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR5_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR5_VIEW - 0x0 , // CB_COLOR5_INFO - 0x0 , // CB_COLOR5_ATTRIB - 0x0 , // CB_COLOR5_DCC_CONTROL - 0x0 , // CB_COLOR5_CMASK - 0x0 , // - 0x0 , // CB_COLOR5_FMASK - 0x0 , // - 0x0 , // CB_COLOR5_CLEAR_WORD0 - 0x0 , // CB_COLOR5_CLEAR_WORD1 - 0x0 , // CB_COLOR5_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR6_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR6_VIEW - 0x0 , // CB_COLOR6_INFO - 0x0 , // CB_COLOR6_ATTRIB - 0x0 , // CB_COLOR6_DCC_CONTROL - 0x0 , // CB_COLOR6_CMASK - 0x0 , // - 0x0 , // CB_COLOR6_FMASK - 0x0 , // - 0x0 , // CB_COLOR6_CLEAR_WORD0 - 0x0 , // CB_COLOR6_CLEAR_WORD1 - 0x0 , // CB_COLOR6_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR7_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR7_VIEW - 0x0 , // CB_COLOR7_INFO - 0x0 , // CB_COLOR7_ATTRIB - 0x0 , // CB_COLOR7_DCC_CONTROL - 0x0 , // CB_COLOR7_CMASK - 0x0 , // - 0x0 , // CB_COLOR7_FMASK - 0x0 , // - 0x0 , // CB_COLOR7_CLEAR_WORD0 - 0x0 , // CB_COLOR7_CLEAR_WORD1 - 0x0 , // CB_COLOR7_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR0_BASE_EXT - 0x0 , // CB_COLOR1_BASE_EXT - 0x0 , // CB_COLOR2_BASE_EXT - 0x0 , // CB_COLOR3_BASE_EXT - 0x0 , // CB_COLOR4_BASE_EXT - 0x0 , // CB_COLOR5_BASE_EXT - 0x0 , // CB_COLOR6_BASE_EXT - 0x0 , // CB_COLOR7_BASE_EXT - 0x0 , // CB_COLOR0_CMASK_BASE_EXT - 0x0 , // CB_COLOR1_CMASK_BASE_EXT - 0x0 , // CB_COLOR2_CMASK_BASE_EXT - 0x0 , // CB_COLOR3_CMASK_BASE_EXT - 0x0 , // CB_COLOR4_CMASK_BASE_EXT - 0x0 , // CB_COLOR5_CMASK_BASE_EXT - 0x0 , // CB_COLOR6_CMASK_BASE_EXT - 0x0 , // CB_COLOR7_CMASK_BASE_EXT - 0x0 , // CB_COLOR0_FMASK_BASE_EXT - 0x0 , // CB_COLOR1_FMASK_BASE_EXT - 0x0 , // CB_COLOR2_FMASK_BASE_EXT - 0x0 , // CB_COLOR3_FMASK_BASE_EXT - 0x0 , // CB_COLOR4_FMASK_BASE_EXT - 0x0 , // CB_COLOR5_FMASK_BASE_EXT - 0x0 , // CB_COLOR6_FMASK_BASE_EXT - 0x0 , // CB_COLOR7_FMASK_BASE_EXT - 0x0 , // CB_COLOR0_DCC_BASE_EXT - 0x0 , // CB_COLOR1_DCC_BASE_EXT - 0x0 , // CB_COLOR2_DCC_BASE_EXT - 0x0 , // CB_COLOR3_DCC_BASE_EXT - 0x0 , // CB_COLOR4_DCC_BASE_EXT - 0x0 , // CB_COLOR5_DCC_BASE_EXT - 0x0 , // CB_COLOR6_DCC_BASE_EXT - 0x0 , // CB_COLOR7_DCC_BASE_EXT - 0x0 , // CB_COLOR0_ATTRIB2 - 0x0 , // CB_COLOR1_ATTRIB2 - 0x0 , // CB_COLOR2_ATTRIB2 - 0x0 , // CB_COLOR3_ATTRIB2 - 0x0 , // CB_COLOR4_ATTRIB2 - 0x0 , // CB_COLOR5_ATTRIB2 - 0x0 , // CB_COLOR6_ATTRIB2 - 0x0 , // CB_COLOR7_ATTRIB2 - 0x0 , // CB_COLOR0_ATTRIB3 - 0x0 , // CB_COLOR1_ATTRIB3 - 0x0 , // CB_COLOR2_ATTRIB3 - 0x0 , // CB_COLOR3_ATTRIB3 - 0x0 , // CB_COLOR4_ATTRIB3 - 0x0 , // CB_COLOR5_ATTRIB3 - 0x0 , // CB_COLOR6_ATTRIB3 + 0x0, // PA_SC_SHADER_CONTROL + 0x3, // PA_SC_BINNER_CNTL_0 + 0x0, // PA_SC_BINNER_CNTL_1 + 0x100000, // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL + 0x0, // 
PA_SC_NGG_MODE_CNTL + 0x0, // + 0x1e, // VGT_VERTEX_REUSE_BLOCK_CNTL + 0x20, // VGT_OUT_DEALLOC_CNTL + 0x0, // CB_COLOR0_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR0_VIEW + 0x0, // CB_COLOR0_INFO + 0x0, // CB_COLOR0_ATTRIB + 0x0, // CB_COLOR0_DCC_CONTROL + 0x0, // CB_COLOR0_CMASK + 0x0, // + 0x0, // CB_COLOR0_FMASK + 0x0, // + 0x0, // CB_COLOR0_CLEAR_WORD0 + 0x0, // CB_COLOR0_CLEAR_WORD1 + 0x0, // CB_COLOR0_DCC_BASE + 0x0, // + 0x0, // CB_COLOR1_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR1_VIEW + 0x0, // CB_COLOR1_INFO + 0x0, // CB_COLOR1_ATTRIB + 0x0, // CB_COLOR1_DCC_CONTROL + 0x0, // CB_COLOR1_CMASK + 0x0, // + 0x0, // CB_COLOR1_FMASK + 0x0, // + 0x0, // CB_COLOR1_CLEAR_WORD0 + 0x0, // CB_COLOR1_CLEAR_WORD1 + 0x0, // CB_COLOR1_DCC_BASE + 0x0, // + 0x0, // CB_COLOR2_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR2_VIEW + 0x0, // CB_COLOR2_INFO + 0x0, // CB_COLOR2_ATTRIB + 0x0, // CB_COLOR2_DCC_CONTROL + 0x0, // CB_COLOR2_CMASK + 0x0, // + 0x0, // CB_COLOR2_FMASK + 0x0, // + 0x0, // CB_COLOR2_CLEAR_WORD0 + 0x0, // CB_COLOR2_CLEAR_WORD1 + 0x0, // CB_COLOR2_DCC_BASE + 0x0, // + 0x0, // CB_COLOR3_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR3_VIEW + 0x0, // CB_COLOR3_INFO + 0x0, // CB_COLOR3_ATTRIB + 0x0, // CB_COLOR3_DCC_CONTROL + 0x0, // CB_COLOR3_CMASK + 0x0, // + 0x0, // CB_COLOR3_FMASK + 0x0, // + 0x0, // CB_COLOR3_CLEAR_WORD0 + 0x0, // CB_COLOR3_CLEAR_WORD1 + 0x0, // CB_COLOR3_DCC_BASE + 0x0, // + 0x0, // CB_COLOR4_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR4_VIEW + 0x0, // CB_COLOR4_INFO + 0x0, // CB_COLOR4_ATTRIB + 0x0, // CB_COLOR4_DCC_CONTROL + 0x0, // CB_COLOR4_CMASK + 0x0, // + 0x0, // CB_COLOR4_FMASK + 0x0, // + 0x0, // CB_COLOR4_CLEAR_WORD0 + 0x0, // CB_COLOR4_CLEAR_WORD1 + 0x0, // CB_COLOR4_DCC_BASE + 0x0, // + 0x0, // CB_COLOR5_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR5_VIEW + 0x0, // CB_COLOR5_INFO + 0x0, // CB_COLOR5_ATTRIB + 0x0, // CB_COLOR5_DCC_CONTROL + 0x0, // CB_COLOR5_CMASK + 0x0, // + 0x0, // CB_COLOR5_FMASK + 0x0, // + 0x0, // CB_COLOR5_CLEAR_WORD0 + 0x0, // CB_COLOR5_CLEAR_WORD1 + 0x0, // CB_COLOR5_DCC_BASE + 0x0, // + 0x0, // CB_COLOR6_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR6_VIEW + 0x0, // CB_COLOR6_INFO + 0x0, // CB_COLOR6_ATTRIB + 0x0, // CB_COLOR6_DCC_CONTROL + 0x0, // CB_COLOR6_CMASK + 0x0, // + 0x0, // CB_COLOR6_FMASK + 0x0, // + 0x0, // CB_COLOR6_CLEAR_WORD0 + 0x0, // CB_COLOR6_CLEAR_WORD1 + 0x0, // CB_COLOR6_DCC_BASE + 0x0, // + 0x0, // CB_COLOR7_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR7_VIEW + 0x0, // CB_COLOR7_INFO + 0x0, // CB_COLOR7_ATTRIB + 0x0, // CB_COLOR7_DCC_CONTROL + 0x0, // CB_COLOR7_CMASK + 0x0, // + 0x0, // CB_COLOR7_FMASK + 0x0, // + 0x0, // CB_COLOR7_CLEAR_WORD0 + 0x0, // CB_COLOR7_CLEAR_WORD1 + 0x0, // CB_COLOR7_DCC_BASE + 0x0, // + 0x0, // CB_COLOR0_BASE_EXT + 0x0, // CB_COLOR1_BASE_EXT + 0x0, // CB_COLOR2_BASE_EXT + 0x0, // CB_COLOR3_BASE_EXT + 0x0, // CB_COLOR4_BASE_EXT + 0x0, // CB_COLOR5_BASE_EXT + 0x0, // CB_COLOR6_BASE_EXT + 0x0, // CB_COLOR7_BASE_EXT + 0x0, // CB_COLOR0_CMASK_BASE_EXT + 0x0, // CB_COLOR1_CMASK_BASE_EXT + 0x0, // CB_COLOR2_CMASK_BASE_EXT + 0x0, // CB_COLOR3_CMASK_BASE_EXT + 0x0, // CB_COLOR4_CMASK_BASE_EXT + 0x0, // CB_COLOR5_CMASK_BASE_EXT + 0x0, // CB_COLOR6_CMASK_BASE_EXT + 0x0, // CB_COLOR7_CMASK_BASE_EXT + 0x0, // CB_COLOR0_FMASK_BASE_EXT + 0x0, // CB_COLOR1_FMASK_BASE_EXT + 0x0, // CB_COLOR2_FMASK_BASE_EXT + 0x0, // CB_COLOR3_FMASK_BASE_EXT + 0x0, // CB_COLOR4_FMASK_BASE_EXT + 0x0, // CB_COLOR5_FMASK_BASE_EXT + 0x0, // CB_COLOR6_FMASK_BASE_EXT + 0x0, // CB_COLOR7_FMASK_BASE_EXT + 0x0, // 
CB_COLOR0_DCC_BASE_EXT + 0x0, // CB_COLOR1_DCC_BASE_EXT + 0x0, // CB_COLOR2_DCC_BASE_EXT + 0x0, // CB_COLOR3_DCC_BASE_EXT + 0x0, // CB_COLOR4_DCC_BASE_EXT + 0x0, // CB_COLOR5_DCC_BASE_EXT + 0x0, // CB_COLOR6_DCC_BASE_EXT + 0x0, // CB_COLOR7_DCC_BASE_EXT + 0x0, // CB_COLOR0_ATTRIB2 + 0x0, // CB_COLOR1_ATTRIB2 + 0x0, // CB_COLOR2_ATTRIB2 + 0x0, // CB_COLOR3_ATTRIB2 + 0x0, // CB_COLOR4_ATTRIB2 + 0x0, // CB_COLOR5_ATTRIB2 + 0x0, // CB_COLOR6_ATTRIB2 + 0x0, // CB_COLOR7_ATTRIB2 + 0x0, // CB_COLOR0_ATTRIB3 + 0x0, // CB_COLOR1_ATTRIB3 + 0x0, // CB_COLOR2_ATTRIB3 + 0x0, // CB_COLOR3_ATTRIB3 + 0x0, // CB_COLOR4_ATTRIB3 + 0x0, // CB_COLOR5_ATTRIB3 + 0x0, // CB_COLOR6_ATTRIB3 0x0 // CB_COLOR7_ATTRIB3 }; set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlNv10)); set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Nv10)); - set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxNv10)); + set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, + SET(VgtMultiPrimIbResetIndxNv10)); set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Nv10)); set_context_reg_seq_array(cs, R_028754_SX_PS_DOWNCONVERT, SET(SxPsDownconvertNv10)); - set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SET(GeMaxOutputPerSubgroupNv10)); + set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SET(GeMaxOutputPerSubgroupNv10)); set_context_reg_seq_array(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, SET(PaSuPrimFilterCntlNv10)); set_context_reg_seq_array(cs, R_028A00_PA_SU_POINT_SIZE, SET(PaSuPointSizeNv10)); set_context_reg_seq_array(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, SET(VgtHosMaxTessLevelNv10)); @@ -2207,7 +2222,8 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnNv10)); set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetNv10)); set_context_reg_seq_array(cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, SET(VgtDrawPayloadCntlNv10)); - set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Nv10)); + set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, + SET(PaScCentroidPriority0Nv10)); for (unsigned i = 0; i < num_reg_pairs; i++) set_context_reg_seq_array(cs, reg_offsets[i], 1, &reg_values[i]); @@ -2222,68 +2238,68 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re set_context_reg_seq_array_fn set_context_reg_seq_array) { static const uint32_t DbRenderControlGfx103[] = { - 0x0 , // DB_RENDER_CONTROL - 0x0 , // DB_COUNT_CONTROL - 0x0 , // DB_DEPTH_VIEW - 0x0 , // DB_RENDER_OVERRIDE - 0x0 , // DB_RENDER_OVERRIDE2 - 0x0 , // DB_HTILE_DATA_BASE - 0x0 , // - 0x0 , // DB_DEPTH_SIZE_XY - 0x0 , // DB_DEPTH_BOUNDS_MIN - 0x0 , // DB_DEPTH_BOUNDS_MAX - 0x0 , // DB_STENCIL_CLEAR - 0x0 , // DB_DEPTH_CLEAR - 0x0 , // PA_SC_SCREEN_SCISSOR_TL + 0x0, // DB_RENDER_CONTROL + 0x0, // DB_COUNT_CONTROL + 0x0, // DB_DEPTH_VIEW + 0x0, // DB_RENDER_OVERRIDE + 0x0, // DB_RENDER_OVERRIDE2 + 0x0, // DB_HTILE_DATA_BASE + 0x0, // + 0x0, // DB_DEPTH_SIZE_XY + 0x0, // DB_DEPTH_BOUNDS_MIN + 0x0, // DB_DEPTH_BOUNDS_MAX + 0x0, // DB_STENCIL_CLEAR + 0x0, // DB_DEPTH_CLEAR + 0x0, // PA_SC_SCREEN_SCISSOR_TL 0x40004000, // PA_SC_SCREEN_SCISSOR_BR - 0x0 , // DB_DFSM_CONTROL - 0x0 , // DB_RESERVED_REG_2 - 0x0 , // DB_Z_INFO - 0x0 , // DB_STENCIL_INFO - 0x0 , // DB_Z_READ_BASE - 0x0 , // DB_STENCIL_READ_BASE - 
0x0 , // DB_Z_WRITE_BASE - 0x0 , // DB_STENCIL_WRITE_BASE - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // DB_Z_READ_BASE_HI - 0x0 , // DB_STENCIL_READ_BASE_HI - 0x0 , // DB_Z_WRITE_BASE_HI - 0x0 , // DB_STENCIL_WRITE_BASE_HI - 0x0 , // DB_HTILE_DATA_BASE_HI - 0x0 , // DB_RMI_L2_CACHE_CONTROL - 0x0 , // TA_BC_BASE_ADDR + 0x0, // DB_DFSM_CONTROL + 0x0, // DB_RESERVED_REG_2 + 0x0, // DB_Z_INFO + 0x0, // DB_STENCIL_INFO + 0x0, // DB_Z_READ_BASE + 0x0, // DB_STENCIL_READ_BASE + 0x0, // DB_Z_WRITE_BASE + 0x0, // DB_STENCIL_WRITE_BASE + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // DB_Z_READ_BASE_HI + 0x0, // DB_STENCIL_READ_BASE_HI + 0x0, // DB_Z_WRITE_BASE_HI + 0x0, // DB_STENCIL_WRITE_BASE_HI + 0x0, // DB_HTILE_DATA_BASE_HI + 0x0, // DB_RMI_L2_CACHE_CONTROL + 0x0, // TA_BC_BASE_ADDR 0x0 // TA_BC_BASE_ADDR_HI }; static const uint32_t CoherDestBaseHi0Gfx103[] = { - 0x0 , // COHER_DEST_BASE_HI_0 - 0x0 , // COHER_DEST_BASE_HI_1 - 0x0 , // COHER_DEST_BASE_HI_2 - 0x0 , // COHER_DEST_BASE_HI_3 - 0x0 , // COHER_DEST_BASE_2 - 0x0 , // COHER_DEST_BASE_3 - 0x0 , // PA_SC_WINDOW_OFFSET + 0x0, // COHER_DEST_BASE_HI_0 + 0x0, // COHER_DEST_BASE_HI_1 + 0x0, // COHER_DEST_BASE_HI_2 + 0x0, // COHER_DEST_BASE_HI_3 + 0x0, // COHER_DEST_BASE_2 + 0x0, // COHER_DEST_BASE_3 + 0x0, // PA_SC_WINDOW_OFFSET 0x80000000, // PA_SC_WINDOW_SCISSOR_TL 0x40004000, // PA_SC_WINDOW_SCISSOR_BR - 0xffff , // PA_SC_CLIPRECT_RULE - 0x0 , // PA_SC_CLIPRECT_0_TL + 0xffff, // PA_SC_CLIPRECT_RULE + 0x0, // PA_SC_CLIPRECT_0_TL 0x40004000, // PA_SC_CLIPRECT_0_BR - 0x0 , // PA_SC_CLIPRECT_1_TL + 0x0, // PA_SC_CLIPRECT_1_TL 0x40004000, // PA_SC_CLIPRECT_1_BR - 0x0 , // PA_SC_CLIPRECT_2_TL + 0x0, // PA_SC_CLIPRECT_2_TL 0x40004000, // PA_SC_CLIPRECT_2_BR - 0x0 , // PA_SC_CLIPRECT_3_TL + 0x0, // PA_SC_CLIPRECT_3_TL 0x40004000, // PA_SC_CLIPRECT_3_BR 0xaa99aaaa, // PA_SC_EDGERULE - 0x0 , // PA_SU_HARDWARE_SCREEN_OFFSET + 0x0, // PA_SU_HARDWARE_SCREEN_OFFSET 0xffffffff, // CB_TARGET_MASK 0xffffffff, // CB_SHADER_MASK 0x80000000, // PA_SC_GENERIC_SCISSOR_TL 0x40004000, // PA_SC_GENERIC_SCISSOR_BR - 0x0 , // COHER_DEST_BASE_0 - 0x0 , // COHER_DEST_BASE_1 + 0x0, // COHER_DEST_BASE_0 + 0x0, // COHER_DEST_BASE_1 0x80000000, // PA_SC_VPORT_SCISSOR_0_TL 0x40004000, // PA_SC_VPORT_SCISSOR_0_BR 0x80000000, // PA_SC_VPORT_SCISSOR_1_TL @@ -2316,585 +2332,588 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re 0x40004000, // PA_SC_VPORT_SCISSOR_14_BR 0x80000000, // PA_SC_VPORT_SCISSOR_15_TL 0x40004000, // PA_SC_VPORT_SCISSOR_15_BR - 0x0 , // PA_SC_VPORT_ZMIN_0 + 0x0, // PA_SC_VPORT_ZMIN_0 0x3f800000, // PA_SC_VPORT_ZMAX_0 - 0x0 , // PA_SC_VPORT_ZMIN_1 + 0x0, // PA_SC_VPORT_ZMIN_1 0x3f800000, // PA_SC_VPORT_ZMAX_1 - 0x0 , // PA_SC_VPORT_ZMIN_2 + 0x0, // PA_SC_VPORT_ZMIN_2 0x3f800000, // PA_SC_VPORT_ZMAX_2 - 0x0 , // PA_SC_VPORT_ZMIN_3 + 0x0, // PA_SC_VPORT_ZMIN_3 0x3f800000, // PA_SC_VPORT_ZMAX_3 - 0x0 , // PA_SC_VPORT_ZMIN_4 + 0x0, // PA_SC_VPORT_ZMIN_4 0x3f800000, // PA_SC_VPORT_ZMAX_4 - 0x0 , // PA_SC_VPORT_ZMIN_5 + 0x0, // PA_SC_VPORT_ZMIN_5 0x3f800000, // PA_SC_VPORT_ZMAX_5 - 0x0 , // PA_SC_VPORT_ZMIN_6 + 0x0, // PA_SC_VPORT_ZMIN_6 0x3f800000, // PA_SC_VPORT_ZMAX_6 - 0x0 , // PA_SC_VPORT_ZMIN_7 + 0x0, // PA_SC_VPORT_ZMIN_7 0x3f800000, // PA_SC_VPORT_ZMAX_7 - 0x0 , // PA_SC_VPORT_ZMIN_8 + 0x0, // PA_SC_VPORT_ZMIN_8 0x3f800000, // PA_SC_VPORT_ZMAX_8 - 0x0 , // PA_SC_VPORT_ZMIN_9 + 0x0, // PA_SC_VPORT_ZMIN_9 0x3f800000, // PA_SC_VPORT_ZMAX_9 - 0x0 , // PA_SC_VPORT_ZMIN_10 + 0x0, // PA_SC_VPORT_ZMIN_10 0x3f800000, // 
PA_SC_VPORT_ZMAX_10 - 0x0 , // PA_SC_VPORT_ZMIN_11 + 0x0, // PA_SC_VPORT_ZMIN_11 0x3f800000, // PA_SC_VPORT_ZMAX_11 - 0x0 , // PA_SC_VPORT_ZMIN_12 + 0x0, // PA_SC_VPORT_ZMIN_12 0x3f800000, // PA_SC_VPORT_ZMAX_12 - 0x0 , // PA_SC_VPORT_ZMIN_13 + 0x0, // PA_SC_VPORT_ZMIN_13 0x3f800000, // PA_SC_VPORT_ZMAX_13 - 0x0 , // PA_SC_VPORT_ZMIN_14 + 0x0, // PA_SC_VPORT_ZMIN_14 0x3f800000, // PA_SC_VPORT_ZMAX_14 - 0x0 , // PA_SC_VPORT_ZMIN_15 + 0x0, // PA_SC_VPORT_ZMIN_15 0x3f800000, // PA_SC_VPORT_ZMAX_15 - 0x0 , // PA_SC_RASTER_CONFIG - 0x0 , // PA_SC_RASTER_CONFIG_1 - 0x0 , // + 0x0, // PA_SC_RASTER_CONFIG + 0x0, // PA_SC_RASTER_CONFIG_1 + 0x0, // 0x0 // PA_SC_TILE_STEERING_OVERRIDE }; static const uint32_t VgtMultiPrimIbResetIndxGfx103[] = { - 0x0 , // VGT_MULTI_PRIM_IB_RESET_INDX - 0x0 , // CB_RMI_GL2_CACHE_CONTROL - 0x0 , // CB_BLEND_RED - 0x0 , // CB_BLEND_GREEN - 0x0 , // CB_BLEND_BLUE - 0x0 , // CB_BLEND_ALPHA - 0x0 , // CB_DCC_CONTROL - 0x0 , // CB_COVERAGE_OUT_CONTROL - 0x0 , // DB_STENCIL_CONTROL - 0x1000000 , // DB_STENCILREFMASK - 0x1000000 , // DB_STENCILREFMASK_BF - 0x0 , // - 0x0 , // PA_CL_VPORT_XSCALE - 0x0 , // PA_CL_VPORT_XOFFSET - 0x0 , // PA_CL_VPORT_YSCALE - 0x0 , // PA_CL_VPORT_YOFFSET - 0x0 , // PA_CL_VPORT_ZSCALE - 0x0 , // PA_CL_VPORT_ZOFFSET - 0x0 , // PA_CL_VPORT_XSCALE_1 - 0x0 , // PA_CL_VPORT_XOFFSET_1 - 0x0 , // PA_CL_VPORT_YSCALE_1 - 0x0 , // PA_CL_VPORT_YOFFSET_1 - 0x0 , // PA_CL_VPORT_ZSCALE_1 - 0x0 , // PA_CL_VPORT_ZOFFSET_1 - 0x0 , // PA_CL_VPORT_XSCALE_2 - 0x0 , // PA_CL_VPORT_XOFFSET_2 - 0x0 , // PA_CL_VPORT_YSCALE_2 - 0x0 , // PA_CL_VPORT_YOFFSET_2 - 0x0 , // PA_CL_VPORT_ZSCALE_2 - 0x0 , // PA_CL_VPORT_ZOFFSET_2 - 0x0 , // PA_CL_VPORT_XSCALE_3 - 0x0 , // PA_CL_VPORT_XOFFSET_3 - 0x0 , // PA_CL_VPORT_YSCALE_3 - 0x0 , // PA_CL_VPORT_YOFFSET_3 - 0x0 , // PA_CL_VPORT_ZSCALE_3 - 0x0 , // PA_CL_VPORT_ZOFFSET_3 - 0x0 , // PA_CL_VPORT_XSCALE_4 - 0x0 , // PA_CL_VPORT_XOFFSET_4 - 0x0 , // PA_CL_VPORT_YSCALE_4 - 0x0 , // PA_CL_VPORT_YOFFSET_4 - 0x0 , // PA_CL_VPORT_ZSCALE_4 - 0x0 , // PA_CL_VPORT_ZOFFSET_4 - 0x0 , // PA_CL_VPORT_XSCALE_5 - 0x0 , // PA_CL_VPORT_XOFFSET_5 - 0x0 , // PA_CL_VPORT_YSCALE_5 - 0x0 , // PA_CL_VPORT_YOFFSET_5 - 0x0 , // PA_CL_VPORT_ZSCALE_5 - 0x0 , // PA_CL_VPORT_ZOFFSET_5 - 0x0 , // PA_CL_VPORT_XSCALE_6 - 0x0 , // PA_CL_VPORT_XOFFSET_6 - 0x0 , // PA_CL_VPORT_YSCALE_6 - 0x0 , // PA_CL_VPORT_YOFFSET_6 - 0x0 , // PA_CL_VPORT_ZSCALE_6 - 0x0 , // PA_CL_VPORT_ZOFFSET_6 - 0x0 , // PA_CL_VPORT_XSCALE_7 - 0x0 , // PA_CL_VPORT_XOFFSET_7 - 0x0 , // PA_CL_VPORT_YSCALE_7 - 0x0 , // PA_CL_VPORT_YOFFSET_7 - 0x0 , // PA_CL_VPORT_ZSCALE_7 - 0x0 , // PA_CL_VPORT_ZOFFSET_7 - 0x0 , // PA_CL_VPORT_XSCALE_8 - 0x0 , // PA_CL_VPORT_XOFFSET_8 - 0x0 , // PA_CL_VPORT_YSCALE_8 - 0x0 , // PA_CL_VPORT_YOFFSET_8 - 0x0 , // PA_CL_VPORT_ZSCALE_8 - 0x0 , // PA_CL_VPORT_ZOFFSET_8 - 0x0 , // PA_CL_VPORT_XSCALE_9 - 0x0 , // PA_CL_VPORT_XOFFSET_9 - 0x0 , // PA_CL_VPORT_YSCALE_9 - 0x0 , // PA_CL_VPORT_YOFFSET_9 - 0x0 , // PA_CL_VPORT_ZSCALE_9 - 0x0 , // PA_CL_VPORT_ZOFFSET_9 - 0x0 , // PA_CL_VPORT_XSCALE_10 - 0x0 , // PA_CL_VPORT_XOFFSET_10 - 0x0 , // PA_CL_VPORT_YSCALE_10 - 0x0 , // PA_CL_VPORT_YOFFSET_10 - 0x0 , // PA_CL_VPORT_ZSCALE_10 - 0x0 , // PA_CL_VPORT_ZOFFSET_10 - 0x0 , // PA_CL_VPORT_XSCALE_11 - 0x0 , // PA_CL_VPORT_XOFFSET_11 - 0x0 , // PA_CL_VPORT_YSCALE_11 - 0x0 , // PA_CL_VPORT_YOFFSET_11 - 0x0 , // PA_CL_VPORT_ZSCALE_11 - 0x0 , // PA_CL_VPORT_ZOFFSET_11 - 0x0 , // PA_CL_VPORT_XSCALE_12 - 0x0 , // PA_CL_VPORT_XOFFSET_12 - 0x0 , // PA_CL_VPORT_YSCALE_12 - 0x0 , // 
PA_CL_VPORT_YOFFSET_12 - 0x0 , // PA_CL_VPORT_ZSCALE_12 - 0x0 , // PA_CL_VPORT_ZOFFSET_12 - 0x0 , // PA_CL_VPORT_XSCALE_13 - 0x0 , // PA_CL_VPORT_XOFFSET_13 - 0x0 , // PA_CL_VPORT_YSCALE_13 - 0x0 , // PA_CL_VPORT_YOFFSET_13 - 0x0 , // PA_CL_VPORT_ZSCALE_13 - 0x0 , // PA_CL_VPORT_ZOFFSET_13 - 0x0 , // PA_CL_VPORT_XSCALE_14 - 0x0 , // PA_CL_VPORT_XOFFSET_14 - 0x0 , // PA_CL_VPORT_YSCALE_14 - 0x0 , // PA_CL_VPORT_YOFFSET_14 - 0x0 , // PA_CL_VPORT_ZSCALE_14 - 0x0 , // PA_CL_VPORT_ZOFFSET_14 - 0x0 , // PA_CL_VPORT_XSCALE_15 - 0x0 , // PA_CL_VPORT_XOFFSET_15 - 0x0 , // PA_CL_VPORT_YSCALE_15 - 0x0 , // PA_CL_VPORT_YOFFSET_15 - 0x0 , // PA_CL_VPORT_ZSCALE_15 - 0x0 , // PA_CL_VPORT_ZOFFSET_15 - 0x0 , // PA_CL_UCP_0_X - 0x0 , // PA_CL_UCP_0_Y - 0x0 , // PA_CL_UCP_0_Z - 0x0 , // PA_CL_UCP_0_W - 0x0 , // PA_CL_UCP_1_X - 0x0 , // PA_CL_UCP_1_Y - 0x0 , // PA_CL_UCP_1_Z - 0x0 , // PA_CL_UCP_1_W - 0x0 , // PA_CL_UCP_2_X - 0x0 , // PA_CL_UCP_2_Y - 0x0 , // PA_CL_UCP_2_Z - 0x0 , // PA_CL_UCP_2_W - 0x0 , // PA_CL_UCP_3_X - 0x0 , // PA_CL_UCP_3_Y - 0x0 , // PA_CL_UCP_3_Z - 0x0 , // PA_CL_UCP_3_W - 0x0 , // PA_CL_UCP_4_X - 0x0 , // PA_CL_UCP_4_Y - 0x0 , // PA_CL_UCP_4_Z - 0x0 , // PA_CL_UCP_4_W - 0x0 , // PA_CL_UCP_5_X - 0x0 , // PA_CL_UCP_5_Y - 0x0 , // PA_CL_UCP_5_Z - 0x0 // PA_CL_UCP_5_W + 0x0, // VGT_MULTI_PRIM_IB_RESET_INDX + 0x0, // CB_RMI_GL2_CACHE_CONTROL + 0x0, // CB_BLEND_RED + 0x0, // CB_BLEND_GREEN + 0x0, // CB_BLEND_BLUE + 0x0, // CB_BLEND_ALPHA + 0x0, // CB_DCC_CONTROL + 0x0, // CB_COVERAGE_OUT_CONTROL + 0x0, // DB_STENCIL_CONTROL + 0x1000000, // DB_STENCILREFMASK + 0x1000000, // DB_STENCILREFMASK_BF + 0x0, // + 0x0, // PA_CL_VPORT_XSCALE + 0x0, // PA_CL_VPORT_XOFFSET + 0x0, // PA_CL_VPORT_YSCALE + 0x0, // PA_CL_VPORT_YOFFSET + 0x0, // PA_CL_VPORT_ZSCALE + 0x0, // PA_CL_VPORT_ZOFFSET + 0x0, // PA_CL_VPORT_XSCALE_1 + 0x0, // PA_CL_VPORT_XOFFSET_1 + 0x0, // PA_CL_VPORT_YSCALE_1 + 0x0, // PA_CL_VPORT_YOFFSET_1 + 0x0, // PA_CL_VPORT_ZSCALE_1 + 0x0, // PA_CL_VPORT_ZOFFSET_1 + 0x0, // PA_CL_VPORT_XSCALE_2 + 0x0, // PA_CL_VPORT_XOFFSET_2 + 0x0, // PA_CL_VPORT_YSCALE_2 + 0x0, // PA_CL_VPORT_YOFFSET_2 + 0x0, // PA_CL_VPORT_ZSCALE_2 + 0x0, // PA_CL_VPORT_ZOFFSET_2 + 0x0, // PA_CL_VPORT_XSCALE_3 + 0x0, // PA_CL_VPORT_XOFFSET_3 + 0x0, // PA_CL_VPORT_YSCALE_3 + 0x0, // PA_CL_VPORT_YOFFSET_3 + 0x0, // PA_CL_VPORT_ZSCALE_3 + 0x0, // PA_CL_VPORT_ZOFFSET_3 + 0x0, // PA_CL_VPORT_XSCALE_4 + 0x0, // PA_CL_VPORT_XOFFSET_4 + 0x0, // PA_CL_VPORT_YSCALE_4 + 0x0, // PA_CL_VPORT_YOFFSET_4 + 0x0, // PA_CL_VPORT_ZSCALE_4 + 0x0, // PA_CL_VPORT_ZOFFSET_4 + 0x0, // PA_CL_VPORT_XSCALE_5 + 0x0, // PA_CL_VPORT_XOFFSET_5 + 0x0, // PA_CL_VPORT_YSCALE_5 + 0x0, // PA_CL_VPORT_YOFFSET_5 + 0x0, // PA_CL_VPORT_ZSCALE_5 + 0x0, // PA_CL_VPORT_ZOFFSET_5 + 0x0, // PA_CL_VPORT_XSCALE_6 + 0x0, // PA_CL_VPORT_XOFFSET_6 + 0x0, // PA_CL_VPORT_YSCALE_6 + 0x0, // PA_CL_VPORT_YOFFSET_6 + 0x0, // PA_CL_VPORT_ZSCALE_6 + 0x0, // PA_CL_VPORT_ZOFFSET_6 + 0x0, // PA_CL_VPORT_XSCALE_7 + 0x0, // PA_CL_VPORT_XOFFSET_7 + 0x0, // PA_CL_VPORT_YSCALE_7 + 0x0, // PA_CL_VPORT_YOFFSET_7 + 0x0, // PA_CL_VPORT_ZSCALE_7 + 0x0, // PA_CL_VPORT_ZOFFSET_7 + 0x0, // PA_CL_VPORT_XSCALE_8 + 0x0, // PA_CL_VPORT_XOFFSET_8 + 0x0, // PA_CL_VPORT_YSCALE_8 + 0x0, // PA_CL_VPORT_YOFFSET_8 + 0x0, // PA_CL_VPORT_ZSCALE_8 + 0x0, // PA_CL_VPORT_ZOFFSET_8 + 0x0, // PA_CL_VPORT_XSCALE_9 + 0x0, // PA_CL_VPORT_XOFFSET_9 + 0x0, // PA_CL_VPORT_YSCALE_9 + 0x0, // PA_CL_VPORT_YOFFSET_9 + 0x0, // PA_CL_VPORT_ZSCALE_9 + 0x0, // PA_CL_VPORT_ZOFFSET_9 + 0x0, // PA_CL_VPORT_XSCALE_10 + 0x0, // 
PA_CL_VPORT_XOFFSET_10 + 0x0, // PA_CL_VPORT_YSCALE_10 + 0x0, // PA_CL_VPORT_YOFFSET_10 + 0x0, // PA_CL_VPORT_ZSCALE_10 + 0x0, // PA_CL_VPORT_ZOFFSET_10 + 0x0, // PA_CL_VPORT_XSCALE_11 + 0x0, // PA_CL_VPORT_XOFFSET_11 + 0x0, // PA_CL_VPORT_YSCALE_11 + 0x0, // PA_CL_VPORT_YOFFSET_11 + 0x0, // PA_CL_VPORT_ZSCALE_11 + 0x0, // PA_CL_VPORT_ZOFFSET_11 + 0x0, // PA_CL_VPORT_XSCALE_12 + 0x0, // PA_CL_VPORT_XOFFSET_12 + 0x0, // PA_CL_VPORT_YSCALE_12 + 0x0, // PA_CL_VPORT_YOFFSET_12 + 0x0, // PA_CL_VPORT_ZSCALE_12 + 0x0, // PA_CL_VPORT_ZOFFSET_12 + 0x0, // PA_CL_VPORT_XSCALE_13 + 0x0, // PA_CL_VPORT_XOFFSET_13 + 0x0, // PA_CL_VPORT_YSCALE_13 + 0x0, // PA_CL_VPORT_YOFFSET_13 + 0x0, // PA_CL_VPORT_ZSCALE_13 + 0x0, // PA_CL_VPORT_ZOFFSET_13 + 0x0, // PA_CL_VPORT_XSCALE_14 + 0x0, // PA_CL_VPORT_XOFFSET_14 + 0x0, // PA_CL_VPORT_YSCALE_14 + 0x0, // PA_CL_VPORT_YOFFSET_14 + 0x0, // PA_CL_VPORT_ZSCALE_14 + 0x0, // PA_CL_VPORT_ZOFFSET_14 + 0x0, // PA_CL_VPORT_XSCALE_15 + 0x0, // PA_CL_VPORT_XOFFSET_15 + 0x0, // PA_CL_VPORT_YSCALE_15 + 0x0, // PA_CL_VPORT_YOFFSET_15 + 0x0, // PA_CL_VPORT_ZSCALE_15 + 0x0, // PA_CL_VPORT_ZOFFSET_15 + 0x0, // PA_CL_UCP_0_X + 0x0, // PA_CL_UCP_0_Y + 0x0, // PA_CL_UCP_0_Z + 0x0, // PA_CL_UCP_0_W + 0x0, // PA_CL_UCP_1_X + 0x0, // PA_CL_UCP_1_Y + 0x0, // PA_CL_UCP_1_Z + 0x0, // PA_CL_UCP_1_W + 0x0, // PA_CL_UCP_2_X + 0x0, // PA_CL_UCP_2_Y + 0x0, // PA_CL_UCP_2_Z + 0x0, // PA_CL_UCP_2_W + 0x0, // PA_CL_UCP_3_X + 0x0, // PA_CL_UCP_3_Y + 0x0, // PA_CL_UCP_3_Z + 0x0, // PA_CL_UCP_3_W + 0x0, // PA_CL_UCP_4_X + 0x0, // PA_CL_UCP_4_Y + 0x0, // PA_CL_UCP_4_Z + 0x0, // PA_CL_UCP_4_W + 0x0, // PA_CL_UCP_5_X + 0x0, // PA_CL_UCP_5_Y + 0x0, // PA_CL_UCP_5_Z + 0x0 // PA_CL_UCP_5_W }; static const uint32_t SpiPsInputCntl0Gfx103[] = { - 0x0 , // SPI_PS_INPUT_CNTL_0 - 0x0 , // SPI_PS_INPUT_CNTL_1 - 0x0 , // SPI_PS_INPUT_CNTL_2 - 0x0 , // SPI_PS_INPUT_CNTL_3 - 0x0 , // SPI_PS_INPUT_CNTL_4 - 0x0 , // SPI_PS_INPUT_CNTL_5 - 0x0 , // SPI_PS_INPUT_CNTL_6 - 0x0 , // SPI_PS_INPUT_CNTL_7 - 0x0 , // SPI_PS_INPUT_CNTL_8 - 0x0 , // SPI_PS_INPUT_CNTL_9 - 0x0 , // SPI_PS_INPUT_CNTL_10 - 0x0 , // SPI_PS_INPUT_CNTL_11 - 0x0 , // SPI_PS_INPUT_CNTL_12 - 0x0 , // SPI_PS_INPUT_CNTL_13 - 0x0 , // SPI_PS_INPUT_CNTL_14 - 0x0 , // SPI_PS_INPUT_CNTL_15 - 0x0 , // SPI_PS_INPUT_CNTL_16 - 0x0 , // SPI_PS_INPUT_CNTL_17 - 0x0 , // SPI_PS_INPUT_CNTL_18 - 0x0 , // SPI_PS_INPUT_CNTL_19 - 0x0 , // SPI_PS_INPUT_CNTL_20 - 0x0 , // SPI_PS_INPUT_CNTL_21 - 0x0 , // SPI_PS_INPUT_CNTL_22 - 0x0 , // SPI_PS_INPUT_CNTL_23 - 0x0 , // SPI_PS_INPUT_CNTL_24 - 0x0 , // SPI_PS_INPUT_CNTL_25 - 0x0 , // SPI_PS_INPUT_CNTL_26 - 0x0 , // SPI_PS_INPUT_CNTL_27 - 0x0 , // SPI_PS_INPUT_CNTL_28 - 0x0 , // SPI_PS_INPUT_CNTL_29 - 0x0 , // SPI_PS_INPUT_CNTL_30 - 0x0 , // SPI_PS_INPUT_CNTL_31 - 0x0 , // SPI_VS_OUT_CONFIG - 0x0 , // - 0x0 , // SPI_PS_INPUT_ENA - 0x0 , // SPI_PS_INPUT_ADDR - 0x0 , // SPI_INTERP_CONTROL_0 - 0x2 , // SPI_PS_IN_CONTROL - 0x0 , // - 0x0 , // SPI_BARYC_CNTL - 0x0 , // - 0x0 , // SPI_TMPRING_SIZE - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // SPI_SHADER_IDX_FORMAT - 0x0 , // SPI_SHADER_POS_FORMAT - 0x0 , // SPI_SHADER_Z_FORMAT - 0x0 // SPI_SHADER_COL_FORMAT + 0x0, // SPI_PS_INPUT_CNTL_0 + 0x0, // SPI_PS_INPUT_CNTL_1 + 0x0, // SPI_PS_INPUT_CNTL_2 + 0x0, // SPI_PS_INPUT_CNTL_3 + 0x0, // SPI_PS_INPUT_CNTL_4 + 0x0, // SPI_PS_INPUT_CNTL_5 + 0x0, // SPI_PS_INPUT_CNTL_6 + 0x0, // SPI_PS_INPUT_CNTL_7 + 0x0, // SPI_PS_INPUT_CNTL_8 + 0x0, // SPI_PS_INPUT_CNTL_9 + 0x0, // SPI_PS_INPUT_CNTL_10 + 0x0, // 
SPI_PS_INPUT_CNTL_11 + 0x0, // SPI_PS_INPUT_CNTL_12 + 0x0, // SPI_PS_INPUT_CNTL_13 + 0x0, // SPI_PS_INPUT_CNTL_14 + 0x0, // SPI_PS_INPUT_CNTL_15 + 0x0, // SPI_PS_INPUT_CNTL_16 + 0x0, // SPI_PS_INPUT_CNTL_17 + 0x0, // SPI_PS_INPUT_CNTL_18 + 0x0, // SPI_PS_INPUT_CNTL_19 + 0x0, // SPI_PS_INPUT_CNTL_20 + 0x0, // SPI_PS_INPUT_CNTL_21 + 0x0, // SPI_PS_INPUT_CNTL_22 + 0x0, // SPI_PS_INPUT_CNTL_23 + 0x0, // SPI_PS_INPUT_CNTL_24 + 0x0, // SPI_PS_INPUT_CNTL_25 + 0x0, // SPI_PS_INPUT_CNTL_26 + 0x0, // SPI_PS_INPUT_CNTL_27 + 0x0, // SPI_PS_INPUT_CNTL_28 + 0x0, // SPI_PS_INPUT_CNTL_29 + 0x0, // SPI_PS_INPUT_CNTL_30 + 0x0, // SPI_PS_INPUT_CNTL_31 + 0x0, // SPI_VS_OUT_CONFIG + 0x0, // + 0x0, // SPI_PS_INPUT_ENA + 0x0, // SPI_PS_INPUT_ADDR + 0x0, // SPI_INTERP_CONTROL_0 + 0x2, // SPI_PS_IN_CONTROL + 0x0, // + 0x0, // SPI_BARYC_CNTL + 0x0, // + 0x0, // SPI_TMPRING_SIZE + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // SPI_SHADER_IDX_FORMAT + 0x0, // SPI_SHADER_POS_FORMAT + 0x0, // SPI_SHADER_Z_FORMAT + 0x0 // SPI_SHADER_COL_FORMAT }; static const uint32_t SxPsDownconvertControlGfx103[] = { - 0x0 , // SX_PS_DOWNCONVERT_CONTROL - 0x0 , // SX_PS_DOWNCONVERT - 0x0 , // SX_BLEND_OPT_EPSILON - 0x0 , // SX_BLEND_OPT_CONTROL - 0x0 , // SX_MRT0_BLEND_OPT - 0x0 , // SX_MRT1_BLEND_OPT - 0x0 , // SX_MRT2_BLEND_OPT - 0x0 , // SX_MRT3_BLEND_OPT - 0x0 , // SX_MRT4_BLEND_OPT - 0x0 , // SX_MRT5_BLEND_OPT - 0x0 , // SX_MRT6_BLEND_OPT - 0x0 , // SX_MRT7_BLEND_OPT - 0x0 , // CB_BLEND0_CONTROL - 0x0 , // CB_BLEND1_CONTROL - 0x0 , // CB_BLEND2_CONTROL - 0x0 , // CB_BLEND3_CONTROL - 0x0 , // CB_BLEND4_CONTROL - 0x0 , // CB_BLEND5_CONTROL - 0x0 , // CB_BLEND6_CONTROL - 0x0 // CB_BLEND7_CONTROL + 0x0, // SX_PS_DOWNCONVERT_CONTROL + 0x0, // SX_PS_DOWNCONVERT + 0x0, // SX_BLEND_OPT_EPSILON + 0x0, // SX_BLEND_OPT_CONTROL + 0x0, // SX_MRT0_BLEND_OPT + 0x0, // SX_MRT1_BLEND_OPT + 0x0, // SX_MRT2_BLEND_OPT + 0x0, // SX_MRT3_BLEND_OPT + 0x0, // SX_MRT4_BLEND_OPT + 0x0, // SX_MRT5_BLEND_OPT + 0x0, // SX_MRT6_BLEND_OPT + 0x0, // SX_MRT7_BLEND_OPT + 0x0, // CB_BLEND0_CONTROL + 0x0, // CB_BLEND1_CONTROL + 0x0, // CB_BLEND2_CONTROL + 0x0, // CB_BLEND3_CONTROL + 0x0, // CB_BLEND4_CONTROL + 0x0, // CB_BLEND5_CONTROL + 0x0, // CB_BLEND6_CONTROL + 0x0 // CB_BLEND7_CONTROL }; static const uint32_t GeMaxOutputPerSubgroupGfx103[] = { - 0x0 , // GE_MAX_OUTPUT_PER_SUBGROUP - 0x0 , // DB_DEPTH_CONTROL - 0x0 , // DB_EQAA - 0x0 , // CB_COLOR_CONTROL - 0x0 , // DB_SHADER_CONTROL - 0x90000 , // PA_CL_CLIP_CNTL - 0x4 , // PA_SU_SC_MODE_CNTL - 0x0 , // PA_CL_VTE_CNTL - 0x0 , // PA_CL_VS_OUT_CNTL - 0x0 // PA_CL_NANINF_CNTL + 0x0, // GE_MAX_OUTPUT_PER_SUBGROUP + 0x0, // DB_DEPTH_CONTROL + 0x0, // DB_EQAA + 0x0, // CB_COLOR_CONTROL + 0x0, // DB_SHADER_CONTROL + 0x90000, // PA_CL_CLIP_CNTL + 0x4, // PA_SU_SC_MODE_CNTL + 0x0, // PA_CL_VTE_CNTL + 0x0, // PA_CL_VS_OUT_CNTL + 0x0 // PA_CL_NANINF_CNTL }; static const uint32_t PaSuPrimFilterCntlGfx103[] = { - 0x0 , // PA_SU_PRIM_FILTER_CNTL - 0x0 , // PA_SU_SMALL_PRIM_FILTER_CNTL - 0x0 , // - 0x0 , // PA_CL_NGG_CNTL - 0x0 , // PA_SU_OVER_RASTERIZATION_CNTL - 0x0 , // PA_STEREO_CNTL - 0x0 , // PA_STATE_STEREO_X - 0x0 // + 0x0, // PA_SU_PRIM_FILTER_CNTL + 0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL + 0x0, // + 0x0, // PA_CL_NGG_CNTL + 0x0, // PA_SU_OVER_RASTERIZATION_CNTL + 0x0, // PA_STEREO_CNTL + 0x0, // PA_STATE_STEREO_X + 0x0 // }; static const uint32_t PaSuPointSizeGfx103[] = { - 0x0 , // PA_SU_POINT_SIZE - 0x0 , // PA_SU_POINT_MINMAX - 0x0 , // PA_SU_LINE_CNTL - 0x0 // PA_SC_LINE_STIPPLE + 
0x0, // PA_SU_POINT_SIZE + 0x0, // PA_SU_POINT_MINMAX + 0x0, // PA_SU_LINE_CNTL + 0x0 // PA_SC_LINE_STIPPLE }; static const uint32_t VgtHosMaxTessLevelGfx103[] = { - 0x0 , // VGT_HOS_MAX_TESS_LEVEL - 0x0 // VGT_HOS_MIN_TESS_LEVEL + 0x0, // VGT_HOS_MAX_TESS_LEVEL + 0x0 // VGT_HOS_MIN_TESS_LEVEL }; static const uint32_t VgtGsModeGfx103[] = { - 0x0 , // VGT_GS_MODE - 0x0 , // VGT_GS_ONCHIP_CNTL - 0x0 , // PA_SC_MODE_CNTL_0 - 0x0 , // PA_SC_MODE_CNTL_1 - 0x0 , // VGT_ENHANCE - 0x100 , // VGT_GS_PER_ES - 0x80 , // VGT_ES_PER_GS - 0x2 , // VGT_GS_PER_VS - 0x0 , // VGT_GSVS_RING_OFFSET_1 - 0x0 , // VGT_GSVS_RING_OFFSET_2 - 0x0 , // VGT_GSVS_RING_OFFSET_3 - 0x0 // VGT_GS_OUT_PRIM_TYPE + 0x0, // VGT_GS_MODE + 0x0, // VGT_GS_ONCHIP_CNTL + 0x0, // PA_SC_MODE_CNTL_0 + 0x0, // PA_SC_MODE_CNTL_1 + 0x0, // VGT_ENHANCE + 0x100, // VGT_GS_PER_ES + 0x80, // VGT_ES_PER_GS + 0x2, // VGT_GS_PER_VS + 0x0, // VGT_GSVS_RING_OFFSET_1 + 0x0, // VGT_GSVS_RING_OFFSET_2 + 0x0, // VGT_GSVS_RING_OFFSET_3 + 0x0 // VGT_GS_OUT_PRIM_TYPE }; static const uint32_t VgtPrimitiveidEnGfx103[] = { - 0x0 // VGT_PRIMITIVEID_EN + 0x0 // VGT_PRIMITIVEID_EN }; static const uint32_t VgtPrimitiveidResetGfx103[] = { - 0x0 // VGT_PRIMITIVEID_RESET + 0x0 // VGT_PRIMITIVEID_RESET }; static const uint32_t VgtDrawPayloadCntlGfx103[] = { - 0x0 , // VGT_DRAW_PAYLOAD_CNTL - 0x0 , // - 0x0 , // VGT_INSTANCE_STEP_RATE_0 - 0x0 , // VGT_INSTANCE_STEP_RATE_1 - 0x0 , // IA_MULTI_VGT_PARAM - 0x0 , // VGT_ESGS_RING_ITEMSIZE - 0x0 , // VGT_GSVS_RING_ITEMSIZE - 0x0 , // VGT_REUSE_OFF - 0x0 , // VGT_VTX_CNT_EN - 0x0 , // DB_HTILE_SURFACE - 0x0 , // DB_SRESULTS_COMPARE_STATE0 - 0x0 , // DB_SRESULTS_COMPARE_STATE1 - 0x0 , // DB_PRELOAD_CONTROL - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_0 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_0 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_0 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_1 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_1 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_1 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_2 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_2 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_2 - 0x0 , // VGT_STRMOUT_BUFFER_SIZE_3 - 0x0 , // VGT_STRMOUT_VTX_STRIDE_3 - 0x0 , // - 0x0 , // VGT_STRMOUT_BUFFER_OFFSET_3 - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - 0x0 , // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - 0x0 , // - 0x0 , // VGT_GS_MAX_VERT_OUT - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // - 0x0 , // GE_NGG_SUBGRP_CNTL - 0x0 , // VGT_TESS_DISTRIBUTION - 0x0 , // VGT_SHADER_STAGES_EN - 0x0 , // VGT_LS_HS_CONFIG - 0x0 , // VGT_GS_VERT_ITEMSIZE - 0x0 , // VGT_GS_VERT_ITEMSIZE_1 - 0x0 , // VGT_GS_VERT_ITEMSIZE_2 - 0x0 , // VGT_GS_VERT_ITEMSIZE_3 - 0x0 , // VGT_TF_PARAM - 0x0 , // DB_ALPHA_TO_MASK - 0x0 , // - 0x0 , // PA_SU_POLY_OFFSET_DB_FMT_CNTL - 0x0 , // PA_SU_POLY_OFFSET_CLAMP - 0x0 , // PA_SU_POLY_OFFSET_FRONT_SCALE - 0x0 , // PA_SU_POLY_OFFSET_FRONT_OFFSET - 0x0 , // PA_SU_POLY_OFFSET_BACK_SCALE - 0x0 , // PA_SU_POLY_OFFSET_BACK_OFFSET - 0x0 , // VGT_GS_INSTANCE_CNT - 0x0 , // VGT_STRMOUT_CONFIG - 0x0 // VGT_STRMOUT_BUFFER_CONFIG + 0x0, // VGT_DRAW_PAYLOAD_CNTL + 0x0, // + 0x0, // VGT_INSTANCE_STEP_RATE_0 + 0x0, // VGT_INSTANCE_STEP_RATE_1 + 0x0, // IA_MULTI_VGT_PARAM + 0x0, // VGT_ESGS_RING_ITEMSIZE + 0x0, // VGT_GSVS_RING_ITEMSIZE + 0x0, // VGT_REUSE_OFF + 0x0, // VGT_VTX_CNT_EN + 0x0, // DB_HTILE_SURFACE + 0x0, // DB_SRESULTS_COMPARE_STATE0 + 0x0, // DB_SRESULTS_COMPARE_STATE1 + 0x0, // DB_PRELOAD_CONTROL + 0x0, // 
+ 0x0, // VGT_STRMOUT_BUFFER_SIZE_0 + 0x0, // VGT_STRMOUT_VTX_STRIDE_0 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_0 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_1 + 0x0, // VGT_STRMOUT_VTX_STRIDE_1 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_1 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_2 + 0x0, // VGT_STRMOUT_VTX_STRIDE_2 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_2 + 0x0, // VGT_STRMOUT_BUFFER_SIZE_3 + 0x0, // VGT_STRMOUT_VTX_STRIDE_3 + 0x0, // + 0x0, // VGT_STRMOUT_BUFFER_OFFSET_3 + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE + 0x0, // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE + 0x0, // + 0x0, // VGT_GS_MAX_VERT_OUT + 0x0, // + 0x0, // + 0x0, // + 0x0, // + 0x0, // GE_NGG_SUBGRP_CNTL + 0x0, // VGT_TESS_DISTRIBUTION + 0x0, // VGT_SHADER_STAGES_EN + 0x0, // VGT_LS_HS_CONFIG + 0x0, // VGT_GS_VERT_ITEMSIZE + 0x0, // VGT_GS_VERT_ITEMSIZE_1 + 0x0, // VGT_GS_VERT_ITEMSIZE_2 + 0x0, // VGT_GS_VERT_ITEMSIZE_3 + 0x0, // VGT_TF_PARAM + 0x0, // DB_ALPHA_TO_MASK + 0x0, // + 0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL + 0x0, // PA_SU_POLY_OFFSET_CLAMP + 0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE + 0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET + 0x0, // PA_SU_POLY_OFFSET_BACK_SCALE + 0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET + 0x0, // VGT_GS_INSTANCE_CNT + 0x0, // VGT_STRMOUT_CONFIG + 0x0 // VGT_STRMOUT_BUFFER_CONFIG }; static const uint32_t PaScCentroidPriority0Gfx103[] = { - 0x0 , // PA_SC_CENTROID_PRIORITY_0 - 0x0 , // PA_SC_CENTROID_PRIORITY_1 - 0x1000 , // PA_SC_LINE_CNTL - 0x0 , // PA_SC_AA_CONFIG - 0x5 , // PA_SU_VTX_CNTL + 0x0, // PA_SC_CENTROID_PRIORITY_0 + 0x0, // PA_SC_CENTROID_PRIORITY_1 + 0x1000, // PA_SC_LINE_CNTL + 0x0, // PA_SC_AA_CONFIG + 0x5, // PA_SU_VTX_CNTL 0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ 0x3f800000, // PA_CL_GB_VERT_DISC_ADJ 0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ 0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 - 0x0 , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 + 0x0, // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0 0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1 - 0x0 , // PA_SC_SHADER_CONTROL - 0x3 , // PA_SC_BINNER_CNTL_0 - 0x0 , // PA_SC_BINNER_CNTL_1 - 0x100000 , // 
PA_SC_CONSERVATIVE_RASTERIZATION_CNTL - 0x0 , // PA_SC_NGG_MODE_CNTL - 0x0 , // - 0x1e , // VGT_VERTEX_REUSE_BLOCK_CNTL - 0x20 , // VGT_OUT_DEALLOC_CNTL - 0x0 , // CB_COLOR0_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR0_VIEW - 0x0 , // CB_COLOR0_INFO - 0x0 , // CB_COLOR0_ATTRIB - 0x0 , // CB_COLOR0_DCC_CONTROL - 0x0 , // CB_COLOR0_CMASK - 0x0 , // - 0x0 , // CB_COLOR0_FMASK - 0x0 , // - 0x0 , // CB_COLOR0_CLEAR_WORD0 - 0x0 , // CB_COLOR0_CLEAR_WORD1 - 0x0 , // CB_COLOR0_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR1_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR1_VIEW - 0x0 , // CB_COLOR1_INFO - 0x0 , // CB_COLOR1_ATTRIB - 0x0 , // CB_COLOR1_DCC_CONTROL - 0x0 , // CB_COLOR1_CMASK - 0x0 , // - 0x0 , // CB_COLOR1_FMASK - 0x0 , // - 0x0 , // CB_COLOR1_CLEAR_WORD0 - 0x0 , // CB_COLOR1_CLEAR_WORD1 - 0x0 , // CB_COLOR1_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR2_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR2_VIEW - 0x0 , // CB_COLOR2_INFO - 0x0 , // CB_COLOR2_ATTRIB - 0x0 , // CB_COLOR2_DCC_CONTROL - 0x0 , // CB_COLOR2_CMASK - 0x0 , // - 0x0 , // CB_COLOR2_FMASK - 0x0 , // - 0x0 , // CB_COLOR2_CLEAR_WORD0 - 0x0 , // CB_COLOR2_CLEAR_WORD1 - 0x0 , // CB_COLOR2_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR3_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR3_VIEW - 0x0 , // CB_COLOR3_INFO - 0x0 , // CB_COLOR3_ATTRIB - 0x0 , // CB_COLOR3_DCC_CONTROL - 0x0 , // CB_COLOR3_CMASK - 0x0 , // - 0x0 , // CB_COLOR3_FMASK - 0x0 , // - 0x0 , // CB_COLOR3_CLEAR_WORD0 - 0x0 , // CB_COLOR3_CLEAR_WORD1 - 0x0 , // CB_COLOR3_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR4_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR4_VIEW - 0x0 , // CB_COLOR4_INFO - 0x0 , // CB_COLOR4_ATTRIB - 0x0 , // CB_COLOR4_DCC_CONTROL - 0x0 , // CB_COLOR4_CMASK - 0x0 , // - 0x0 , // CB_COLOR4_FMASK - 0x0 , // - 0x0 , // CB_COLOR4_CLEAR_WORD0 - 0x0 , // CB_COLOR4_CLEAR_WORD1 - 0x0 , // CB_COLOR4_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR5_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR5_VIEW - 0x0 , // CB_COLOR5_INFO - 0x0 , // CB_COLOR5_ATTRIB - 0x0 , // CB_COLOR5_DCC_CONTROL - 0x0 , // CB_COLOR5_CMASK - 0x0 , // - 0x0 , // CB_COLOR5_FMASK - 0x0 , // - 0x0 , // CB_COLOR5_CLEAR_WORD0 - 0x0 , // CB_COLOR5_CLEAR_WORD1 - 0x0 , // CB_COLOR5_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR6_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR6_VIEW - 0x0 , // CB_COLOR6_INFO - 0x0 , // CB_COLOR6_ATTRIB - 0x0 , // CB_COLOR6_DCC_CONTROL - 0x0 , // CB_COLOR6_CMASK - 0x0 , // - 0x0 , // CB_COLOR6_FMASK - 0x0 , // - 0x0 , // CB_COLOR6_CLEAR_WORD0 - 0x0 , // CB_COLOR6_CLEAR_WORD1 - 0x0 , // CB_COLOR6_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR7_BASE - 0x0 , // - 0x0 , // - 0x0 , // CB_COLOR7_VIEW - 0x0 , // CB_COLOR7_INFO - 0x0 , // CB_COLOR7_ATTRIB - 0x0 , // CB_COLOR7_DCC_CONTROL - 0x0 , // CB_COLOR7_CMASK - 0x0 , // - 0x0 , // CB_COLOR7_FMASK - 0x0 , // - 0x0 , // CB_COLOR7_CLEAR_WORD0 - 0x0 , // CB_COLOR7_CLEAR_WORD1 - 0x0 , // CB_COLOR7_DCC_BASE - 0x0 , // - 0x0 , // CB_COLOR0_BASE_EXT - 0x0 , // CB_COLOR1_BASE_EXT - 0x0 , // CB_COLOR2_BASE_EXT - 0x0 , // CB_COLOR3_BASE_EXT - 0x0 , // CB_COLOR4_BASE_EXT - 0x0 , // CB_COLOR5_BASE_EXT - 0x0 , // CB_COLOR6_BASE_EXT - 0x0 , // CB_COLOR7_BASE_EXT - 0x0 , // CB_COLOR0_CMASK_BASE_EXT - 0x0 , // CB_COLOR1_CMASK_BASE_EXT - 0x0 , // CB_COLOR2_CMASK_BASE_EXT - 0x0 , // CB_COLOR3_CMASK_BASE_EXT - 0x0 , // CB_COLOR4_CMASK_BASE_EXT - 0x0 , // CB_COLOR5_CMASK_BASE_EXT - 0x0 , // CB_COLOR6_CMASK_BASE_EXT - 0x0 , // CB_COLOR7_CMASK_BASE_EXT - 0x0 , // CB_COLOR0_FMASK_BASE_EXT - 0x0 , // CB_COLOR1_FMASK_BASE_EXT - 0x0 , // CB_COLOR2_FMASK_BASE_EXT - 0x0 , // 
CB_COLOR3_FMASK_BASE_EXT - 0x0 , // CB_COLOR4_FMASK_BASE_EXT - 0x0 , // CB_COLOR5_FMASK_BASE_EXT - 0x0 , // CB_COLOR6_FMASK_BASE_EXT - 0x0 , // CB_COLOR7_FMASK_BASE_EXT - 0x0 , // CB_COLOR0_DCC_BASE_EXT - 0x0 , // CB_COLOR1_DCC_BASE_EXT - 0x0 , // CB_COLOR2_DCC_BASE_EXT - 0x0 , // CB_COLOR3_DCC_BASE_EXT - 0x0 , // CB_COLOR4_DCC_BASE_EXT - 0x0 , // CB_COLOR5_DCC_BASE_EXT - 0x0 , // CB_COLOR6_DCC_BASE_EXT - 0x0 , // CB_COLOR7_DCC_BASE_EXT - 0x0 , // CB_COLOR0_ATTRIB2 - 0x0 , // CB_COLOR1_ATTRIB2 - 0x0 , // CB_COLOR2_ATTRIB2 - 0x0 , // CB_COLOR3_ATTRIB2 - 0x0 , // CB_COLOR4_ATTRIB2 - 0x0 , // CB_COLOR5_ATTRIB2 - 0x0 , // CB_COLOR6_ATTRIB2 - 0x0 , // CB_COLOR7_ATTRIB2 - 0x0 , // CB_COLOR0_ATTRIB3 - 0x0 , // CB_COLOR1_ATTRIB3 - 0x0 , // CB_COLOR2_ATTRIB3 - 0x0 , // CB_COLOR3_ATTRIB3 - 0x0 , // CB_COLOR4_ATTRIB3 - 0x0 , // CB_COLOR5_ATTRIB3 - 0x0 , // CB_COLOR6_ATTRIB3 + 0x0, // PA_SC_SHADER_CONTROL + 0x3, // PA_SC_BINNER_CNTL_0 + 0x0, // PA_SC_BINNER_CNTL_1 + 0x100000, // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL + 0x0, // PA_SC_NGG_MODE_CNTL + 0x0, // + 0x1e, // VGT_VERTEX_REUSE_BLOCK_CNTL + 0x20, // VGT_OUT_DEALLOC_CNTL + 0x0, // CB_COLOR0_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR0_VIEW + 0x0, // CB_COLOR0_INFO + 0x0, // CB_COLOR0_ATTRIB + 0x0, // CB_COLOR0_DCC_CONTROL + 0x0, // CB_COLOR0_CMASK + 0x0, // + 0x0, // CB_COLOR0_FMASK + 0x0, // + 0x0, // CB_COLOR0_CLEAR_WORD0 + 0x0, // CB_COLOR0_CLEAR_WORD1 + 0x0, // CB_COLOR0_DCC_BASE + 0x0, // + 0x0, // CB_COLOR1_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR1_VIEW + 0x0, // CB_COLOR1_INFO + 0x0, // CB_COLOR1_ATTRIB + 0x0, // CB_COLOR1_DCC_CONTROL + 0x0, // CB_COLOR1_CMASK + 0x0, // + 0x0, // CB_COLOR1_FMASK + 0x0, // + 0x0, // CB_COLOR1_CLEAR_WORD0 + 0x0, // CB_COLOR1_CLEAR_WORD1 + 0x0, // CB_COLOR1_DCC_BASE + 0x0, // + 0x0, // CB_COLOR2_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR2_VIEW + 0x0, // CB_COLOR2_INFO + 0x0, // CB_COLOR2_ATTRIB + 0x0, // CB_COLOR2_DCC_CONTROL + 0x0, // CB_COLOR2_CMASK + 0x0, // + 0x0, // CB_COLOR2_FMASK + 0x0, // + 0x0, // CB_COLOR2_CLEAR_WORD0 + 0x0, // CB_COLOR2_CLEAR_WORD1 + 0x0, // CB_COLOR2_DCC_BASE + 0x0, // + 0x0, // CB_COLOR3_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR3_VIEW + 0x0, // CB_COLOR3_INFO + 0x0, // CB_COLOR3_ATTRIB + 0x0, // CB_COLOR3_DCC_CONTROL + 0x0, // CB_COLOR3_CMASK + 0x0, // + 0x0, // CB_COLOR3_FMASK + 0x0, // + 0x0, // CB_COLOR3_CLEAR_WORD0 + 0x0, // CB_COLOR3_CLEAR_WORD1 + 0x0, // CB_COLOR3_DCC_BASE + 0x0, // + 0x0, // CB_COLOR4_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR4_VIEW + 0x0, // CB_COLOR4_INFO + 0x0, // CB_COLOR4_ATTRIB + 0x0, // CB_COLOR4_DCC_CONTROL + 0x0, // CB_COLOR4_CMASK + 0x0, // + 0x0, // CB_COLOR4_FMASK + 0x0, // + 0x0, // CB_COLOR4_CLEAR_WORD0 + 0x0, // CB_COLOR4_CLEAR_WORD1 + 0x0, // CB_COLOR4_DCC_BASE + 0x0, // + 0x0, // CB_COLOR5_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR5_VIEW + 0x0, // CB_COLOR5_INFO + 0x0, // CB_COLOR5_ATTRIB + 0x0, // CB_COLOR5_DCC_CONTROL + 0x0, // CB_COLOR5_CMASK + 0x0, // + 0x0, // CB_COLOR5_FMASK + 0x0, // + 0x0, // CB_COLOR5_CLEAR_WORD0 + 0x0, // CB_COLOR5_CLEAR_WORD1 + 0x0, // CB_COLOR5_DCC_BASE + 0x0, // + 0x0, // CB_COLOR6_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR6_VIEW + 0x0, // CB_COLOR6_INFO + 0x0, // CB_COLOR6_ATTRIB + 0x0, // CB_COLOR6_DCC_CONTROL + 0x0, // CB_COLOR6_CMASK + 0x0, // + 0x0, // CB_COLOR6_FMASK + 0x0, // + 0x0, // CB_COLOR6_CLEAR_WORD0 + 0x0, // CB_COLOR6_CLEAR_WORD1 + 0x0, // CB_COLOR6_DCC_BASE + 0x0, // + 0x0, // CB_COLOR7_BASE + 0x0, // + 0x0, // + 0x0, // CB_COLOR7_VIEW + 0x0, // CB_COLOR7_INFO + 0x0, // 
CB_COLOR7_ATTRIB + 0x0, // CB_COLOR7_DCC_CONTROL + 0x0, // CB_COLOR7_CMASK + 0x0, // + 0x0, // CB_COLOR7_FMASK + 0x0, // + 0x0, // CB_COLOR7_CLEAR_WORD0 + 0x0, // CB_COLOR7_CLEAR_WORD1 + 0x0, // CB_COLOR7_DCC_BASE + 0x0, // + 0x0, // CB_COLOR0_BASE_EXT + 0x0, // CB_COLOR1_BASE_EXT + 0x0, // CB_COLOR2_BASE_EXT + 0x0, // CB_COLOR3_BASE_EXT + 0x0, // CB_COLOR4_BASE_EXT + 0x0, // CB_COLOR5_BASE_EXT + 0x0, // CB_COLOR6_BASE_EXT + 0x0, // CB_COLOR7_BASE_EXT + 0x0, // CB_COLOR0_CMASK_BASE_EXT + 0x0, // CB_COLOR1_CMASK_BASE_EXT + 0x0, // CB_COLOR2_CMASK_BASE_EXT + 0x0, // CB_COLOR3_CMASK_BASE_EXT + 0x0, // CB_COLOR4_CMASK_BASE_EXT + 0x0, // CB_COLOR5_CMASK_BASE_EXT + 0x0, // CB_COLOR6_CMASK_BASE_EXT + 0x0, // CB_COLOR7_CMASK_BASE_EXT + 0x0, // CB_COLOR0_FMASK_BASE_EXT + 0x0, // CB_COLOR1_FMASK_BASE_EXT + 0x0, // CB_COLOR2_FMASK_BASE_EXT + 0x0, // CB_COLOR3_FMASK_BASE_EXT + 0x0, // CB_COLOR4_FMASK_BASE_EXT + 0x0, // CB_COLOR5_FMASK_BASE_EXT + 0x0, // CB_COLOR6_FMASK_BASE_EXT + 0x0, // CB_COLOR7_FMASK_BASE_EXT + 0x0, // CB_COLOR0_DCC_BASE_EXT + 0x0, // CB_COLOR1_DCC_BASE_EXT + 0x0, // CB_COLOR2_DCC_BASE_EXT + 0x0, // CB_COLOR3_DCC_BASE_EXT + 0x0, // CB_COLOR4_DCC_BASE_EXT + 0x0, // CB_COLOR5_DCC_BASE_EXT + 0x0, // CB_COLOR6_DCC_BASE_EXT + 0x0, // CB_COLOR7_DCC_BASE_EXT + 0x0, // CB_COLOR0_ATTRIB2 + 0x0, // CB_COLOR1_ATTRIB2 + 0x0, // CB_COLOR2_ATTRIB2 + 0x0, // CB_COLOR3_ATTRIB2 + 0x0, // CB_COLOR4_ATTRIB2 + 0x0, // CB_COLOR5_ATTRIB2 + 0x0, // CB_COLOR6_ATTRIB2 + 0x0, // CB_COLOR7_ATTRIB2 + 0x0, // CB_COLOR0_ATTRIB3 + 0x0, // CB_COLOR1_ATTRIB3 + 0x0, // CB_COLOR2_ATTRIB3 + 0x0, // CB_COLOR3_ATTRIB3 + 0x0, // CB_COLOR4_ATTRIB3 + 0x0, // CB_COLOR5_ATTRIB3 + 0x0, // CB_COLOR6_ATTRIB3 0x0 // CB_COLOR7_ATTRIB3 }; set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlGfx103)); set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Gfx103)); - set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxGfx103)); + set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, + SET(VgtMultiPrimIbResetIndxGfx103)); set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Gfx103)); - set_context_reg_seq_array(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL, SET(SxPsDownconvertControlGfx103)); - set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SET(GeMaxOutputPerSubgroupGfx103)); + set_context_reg_seq_array(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL, + SET(SxPsDownconvertControlGfx103)); + set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SET(GeMaxOutputPerSubgroupGfx103)); set_context_reg_seq_array(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, SET(PaSuPrimFilterCntlGfx103)); set_context_reg_seq_array(cs, R_028A00_PA_SU_POINT_SIZE, SET(PaSuPointSizeGfx103)); set_context_reg_seq_array(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, SET(VgtHosMaxTessLevelGfx103)); @@ -2902,14 +2921,14 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnGfx103)); set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetGfx103)); set_context_reg_seq_array(cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, SET(VgtDrawPayloadCntlGfx103)); - set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Gfx103)); + set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, + SET(PaScCentroidPriority0Gfx103)); for (unsigned i = 0; i < num_reg_pairs; i++) 
set_context_reg_seq_array(cs, reg_offsets[i], 1, &reg_values[i]); } -void ac_emulate_clear_state(const struct radeon_info *info, - struct radeon_cmdbuf *cs, +void ac_emulate_clear_state(const struct radeon_info *info, struct radeon_cmdbuf *cs, set_context_reg_seq_array_fn set_context_reg_seq_array) { /* Set context registers same as CLEAR_STATE to initialize shadow memory. */ @@ -2917,11 +2936,9 @@ void ac_emulate_clear_state(const struct radeon_info *info, uint32_t reg_value = info->pa_sc_tile_steering_override; if (info->chip_class == GFX10_3) { - gfx103_emulate_clear_state(cs, 1, &reg_offset, &reg_value, - set_context_reg_seq_array); + gfx103_emulate_clear_state(cs, 1, &reg_offset, &reg_value, set_context_reg_seq_array); } else if (info->chip_class == GFX10) { - gfx10_emulate_clear_state(cs, 1, &reg_offset, &reg_value, - set_context_reg_seq_array); + gfx10_emulate_clear_state(cs, 1, &reg_offset, &reg_value, set_context_reg_seq_array); } else if (info->chip_class == GFX9) { gfx9_emulate_clear_state(cs, set_context_reg_seq_array); } else { @@ -2949,8 +2966,7 @@ void ac_check_shadowed_regs(enum chip_class chip_class, enum radeon_family famil unsigned end_range_offset = ranges[i].offset + ranges[i].size; /* Test if the ranges intersect. */ - if (MAX2(ranges[i].offset, reg_offset) < - MIN2(end_range_offset, end_reg_offset)) { + if (MAX2(ranges[i].offset, reg_offset) < MIN2(end_range_offset, end_reg_offset)) { /* Assertion: A register can be listed only once. */ assert(!found); found = true; @@ -2992,7 +3008,7 @@ void ac_print_shadowed_regs(const struct radeon_info *info) for (unsigned i = 0; i < num_ranges; i++) { for (unsigned j = 0; j < ranges[i].size / 4; j++) { - unsigned offset = ranges[i].offset + j*4; + unsigned offset = ranges[i].offset + j * 4; const char *name = ac_get_register_name(info->chip_class, offset); unsigned value = -1; diff --git a/src/amd/common/ac_shadowed_regs.h b/src/amd/common/ac_shadowed_regs.h index b3f61db..df2a7b7 100644 --- a/src/amd/common/ac_shadowed_regs.h +++ b/src/amd/common/ac_shadowed_regs.h @@ -35,7 +35,8 @@ struct ac_reg_range { unsigned size; }; -enum ac_reg_range_type { +enum ac_reg_range_type +{ SI_REG_RANGE_UCONFIG, SI_REG_RANGE_CONTEXT, SI_REG_RANGE_SH, @@ -46,14 +47,13 @@ enum ac_reg_range_type { SI_NUM_ALL_REG_RANGES, }; -typedef void (*set_context_reg_seq_array_fn)(struct radeon_cmdbuf *cs, unsigned reg, - unsigned num, const uint32_t *values); +typedef void (*set_context_reg_seq_array_fn)(struct radeon_cmdbuf *cs, unsigned reg, unsigned num, + const uint32_t *values); void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family, enum ac_reg_range_type type, unsigned *num_ranges, const struct ac_reg_range **ranges); -void ac_emulate_clear_state(const struct radeon_info *info, - struct radeon_cmdbuf *cs, +void ac_emulate_clear_state(const struct radeon_info *info, struct radeon_cmdbuf *cs, set_context_reg_seq_array_fn set_context_reg_seq_array); void ac_check_shadowed_regs(enum chip_class chip_class, enum radeon_family family, unsigned reg_offset, unsigned count); diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c index 461fd50..223a61e 100644 --- a/src/amd/common/ac_surface.c +++ b/src/amd/common/ac_surface.c @@ -26,24 +26,24 @@ */ #include "ac_surface.h" -#include "amd_family.h" -#include "addrlib/src/amdgpu_asic_addr.h" + #include "ac_gpu_info.h" +#include "addrlib/inc/addrinterface.h" +#include "addrlib/src/amdgpu_asic_addr.h" +#include "amd_family.h" +#include "drm-uapi/amdgpu_drm.h" +#include "sid.h" #include "util/hash_table.h"
#include "util/macros.h" #include "util/simple_mtx.h" #include "util/u_atomic.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "sid.h" +#include #include #include #include -#include -#include "drm-uapi/amdgpu_drm.h" - -#include "addrlib/inc/addrinterface.h" #ifndef CIASICIDGFXENGINE_SOUTHERNISLAND #define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A @@ -54,689 +54,660 @@ #endif struct ac_addrlib { - ADDR_HANDLE handle; - - /* The cache of DCC retile maps for reuse when allocating images of - * similar sizes. - */ - simple_mtx_t dcc_retile_map_lock; - struct hash_table *dcc_retile_maps; - struct hash_table *dcc_retile_tile_indices; + ADDR_HANDLE handle; + + /* The cache of DCC retile maps for reuse when allocating images of + * similar sizes. + */ + simple_mtx_t dcc_retile_map_lock; + struct hash_table *dcc_retile_maps; + struct hash_table *dcc_retile_tile_indices; }; struct dcc_retile_map_key { - enum radeon_family family; - unsigned retile_width; - unsigned retile_height; - bool rb_aligned; - bool pipe_aligned; - unsigned dcc_retile_num_elements; - ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT input; + enum radeon_family family; + unsigned retile_width; + unsigned retile_height; + bool rb_aligned; + bool pipe_aligned; + unsigned dcc_retile_num_elements; + ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT input; }; static uint32_t dcc_retile_map_hash_key(const void *key) { - return _mesa_hash_data(key, sizeof(struct dcc_retile_map_key)); + return _mesa_hash_data(key, sizeof(struct dcc_retile_map_key)); } static bool dcc_retile_map_keys_equal(const void *a, const void *b) { - return memcmp(a, b, sizeof(struct dcc_retile_map_key)) == 0; + return memcmp(a, b, sizeof(struct dcc_retile_map_key)) == 0; } static void dcc_retile_map_free(struct hash_entry *entry) { - free((void*)entry->key); - free(entry->data); + free((void *)entry->key); + free(entry->data); } struct dcc_retile_tile_key { - enum radeon_family family; - unsigned bpp; - unsigned swizzle_mode; - bool rb_aligned; - bool pipe_aligned; + enum radeon_family family; + unsigned bpp; + unsigned swizzle_mode; + bool rb_aligned; + bool pipe_aligned; }; struct dcc_retile_tile_data { - unsigned tile_width_log2; - unsigned tile_height_log2; - uint16_t *data; + unsigned tile_width_log2; + unsigned tile_height_log2; + uint16_t *data; }; static uint32_t dcc_retile_tile_hash_key(const void *key) { - return _mesa_hash_data(key, sizeof(struct dcc_retile_tile_key)); + return _mesa_hash_data(key, sizeof(struct dcc_retile_tile_key)); } static bool dcc_retile_tile_keys_equal(const void *a, const void *b) { - return memcmp(a, b, sizeof(struct dcc_retile_tile_key)) == 0; + return memcmp(a, b, sizeof(struct dcc_retile_tile_key)) == 0; } static void dcc_retile_tile_free(struct hash_entry *entry) { - free((void*)entry->key); - free(((struct dcc_retile_tile_data*)entry->data)->data); - free(entry->data); + free((void *)entry->key); + free(((struct dcc_retile_tile_data *)entry->data)->data); + free(entry->data); } /* Assumes dcc_retile_map_lock is taken. 
*/ static const struct dcc_retile_tile_data * -ac_compute_dcc_retile_tile_indices(struct ac_addrlib *addrlib, - const struct radeon_info *info, - unsigned bpp, unsigned swizzle_mode, - bool rb_aligned, bool pipe_aligned) +ac_compute_dcc_retile_tile_indices(struct ac_addrlib *addrlib, const struct radeon_info *info, + unsigned bpp, unsigned swizzle_mode, bool rb_aligned, + bool pipe_aligned) { - struct dcc_retile_tile_key key = (struct dcc_retile_tile_key) { - .family = info->family, - .bpp = bpp, - .swizzle_mode = swizzle_mode, - .rb_aligned = rb_aligned, - .pipe_aligned = pipe_aligned - }; - - struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_tile_indices, &key); - if (entry) - return entry->data; - - ADDR2_COMPUTE_DCCINFO_INPUT din = {0}; - ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0}; - din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT); - dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT); - - din.dccKeyFlags.pipeAligned = pipe_aligned; - din.dccKeyFlags.rbAligned = rb_aligned; - din.resourceType = ADDR_RSRC_TEX_2D; - din.swizzleMode = swizzle_mode; - din.bpp = bpp; - din.unalignedWidth = 1; - din.unalignedHeight = 1; - din.numSlices = 1; - din.numFrags = 1; - din.numMipLevels = 1; - - ADDR_E_RETURNCODE ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); - if (ret != ADDR_OK) - return NULL; - - ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {0}; - addrin.size = sizeof(addrin); - addrin.swizzleMode = swizzle_mode; - addrin.resourceType = ADDR_RSRC_TEX_2D; - addrin.bpp = bpp; - addrin.numSlices = 1; - addrin.numMipLevels = 1; - addrin.numFrags = 1; - addrin.pitch = dout.pitch; - addrin.height = dout.height; - addrin.compressBlkWidth = dout.compressBlkWidth; - addrin.compressBlkHeight = dout.compressBlkHeight; - addrin.compressBlkDepth = dout.compressBlkDepth; - addrin.metaBlkWidth = dout.metaBlkWidth; - addrin.metaBlkHeight = dout.metaBlkHeight; - addrin.metaBlkDepth = dout.metaBlkDepth; - addrin.dccKeyFlags.pipeAligned = pipe_aligned; - addrin.dccKeyFlags.rbAligned = rb_aligned; - - unsigned w = dout.metaBlkWidth / dout.compressBlkWidth; - unsigned h = dout.metaBlkHeight / dout.compressBlkHeight; - uint16_t *indices = malloc(w * h * sizeof (uint16_t)); - if (!indices) - return NULL; - - ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {}; - addrout.size = sizeof(addrout); - - for (unsigned y = 0; y < h; ++y) { - addrin.y = y * dout.compressBlkHeight; - for (unsigned x = 0; x < w; ++x) { - addrin.x = x * dout.compressBlkWidth; - addrout.addr = 0; - - if (Addr2ComputeDccAddrFromCoord(addrlib->handle, &addrin, &addrout) != ADDR_OK) { - free(indices); - return NULL; - } - indices[y * w + x] = addrout.addr; - } - } - - struct dcc_retile_tile_data *data = calloc(1, sizeof(*data)); - if (!data) { - free(indices); - return NULL; - } - - data->tile_width_log2 = util_logbase2(w); - data->tile_height_log2 = util_logbase2(h); - data->data = indices; - - struct dcc_retile_tile_key *heap_key = mem_dup(&key, sizeof(key)); - if (!heap_key) { - free(data); - free(indices); - return NULL; - } - - entry = _mesa_hash_table_insert(addrlib->dcc_retile_tile_indices, heap_key, data); - if (!entry) { - free(heap_key); - free(data); - free(indices); - } - return data; + struct dcc_retile_tile_key key = (struct dcc_retile_tile_key){.family = info->family, + .bpp = bpp, + .swizzle_mode = swizzle_mode, + .rb_aligned = rb_aligned, + .pipe_aligned = pipe_aligned}; + + struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_tile_indices, &key); + if (entry) + return entry->data; + + 
ADDR2_COMPUTE_DCCINFO_INPUT din = {0}; + ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0}; + din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT); + dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT); + + din.dccKeyFlags.pipeAligned = pipe_aligned; + din.dccKeyFlags.rbAligned = rb_aligned; + din.resourceType = ADDR_RSRC_TEX_2D; + din.swizzleMode = swizzle_mode; + din.bpp = bpp; + din.unalignedWidth = 1; + din.unalignedHeight = 1; + din.numSlices = 1; + din.numFrags = 1; + din.numMipLevels = 1; + + ADDR_E_RETURNCODE ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); + if (ret != ADDR_OK) + return NULL; + + ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {0}; + addrin.size = sizeof(addrin); + addrin.swizzleMode = swizzle_mode; + addrin.resourceType = ADDR_RSRC_TEX_2D; + addrin.bpp = bpp; + addrin.numSlices = 1; + addrin.numMipLevels = 1; + addrin.numFrags = 1; + addrin.pitch = dout.pitch; + addrin.height = dout.height; + addrin.compressBlkWidth = dout.compressBlkWidth; + addrin.compressBlkHeight = dout.compressBlkHeight; + addrin.compressBlkDepth = dout.compressBlkDepth; + addrin.metaBlkWidth = dout.metaBlkWidth; + addrin.metaBlkHeight = dout.metaBlkHeight; + addrin.metaBlkDepth = dout.metaBlkDepth; + addrin.dccKeyFlags.pipeAligned = pipe_aligned; + addrin.dccKeyFlags.rbAligned = rb_aligned; + + unsigned w = dout.metaBlkWidth / dout.compressBlkWidth; + unsigned h = dout.metaBlkHeight / dout.compressBlkHeight; + uint16_t *indices = malloc(w * h * sizeof(uint16_t)); + if (!indices) + return NULL; + + ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {}; + addrout.size = sizeof(addrout); + + for (unsigned y = 0; y < h; ++y) { + addrin.y = y * dout.compressBlkHeight; + for (unsigned x = 0; x < w; ++x) { + addrin.x = x * dout.compressBlkWidth; + addrout.addr = 0; + + if (Addr2ComputeDccAddrFromCoord(addrlib->handle, &addrin, &addrout) != ADDR_OK) { + free(indices); + return NULL; + } + indices[y * w + x] = addrout.addr; + } + } + + struct dcc_retile_tile_data *data = calloc(1, sizeof(*data)); + if (!data) { + free(indices); + return NULL; + } + + data->tile_width_log2 = util_logbase2(w); + data->tile_height_log2 = util_logbase2(h); + data->data = indices; + + struct dcc_retile_tile_key *heap_key = mem_dup(&key, sizeof(key)); + if (!heap_key) { + free(data); + free(indices); + return NULL; + } + + entry = _mesa_hash_table_insert(addrlib->dcc_retile_tile_indices, heap_key, data); + if (!entry) { + free(heap_key); + free(data); + free(indices); + } + return data; } static uint32_t ac_compute_retile_tile_addr(const struct dcc_retile_tile_data *tile, unsigned stride, unsigned x, unsigned y) { - unsigned x_mask = (1u << tile->tile_width_log2) - 1; - unsigned y_mask = (1u << tile->tile_height_log2) - 1; - unsigned tile_size_log2 = tile->tile_width_log2 + tile->tile_height_log2; - - unsigned base = ((y >> tile->tile_height_log2) * stride + (x >> tile->tile_width_log2)) << tile_size_log2; - unsigned offset_in_tile = tile->data[((y & y_mask) << tile->tile_width_log2) + (x & x_mask)]; - return base + offset_in_tile; + unsigned x_mask = (1u << tile->tile_width_log2) - 1; + unsigned y_mask = (1u << tile->tile_height_log2) - 1; + unsigned tile_size_log2 = tile->tile_width_log2 + tile->tile_height_log2; + + unsigned base = ((y >> tile->tile_height_log2) * stride + (x >> tile->tile_width_log2)) + << tile_size_log2; + unsigned offset_in_tile = tile->data[((y & y_mask) << tile->tile_width_log2) + (x & x_mask)]; + return base + offset_in_tile; } static uint32_t *ac_compute_dcc_retile_map(struct ac_addrlib *addrlib, - const 
struct radeon_info *info, - unsigned retile_width, unsigned retile_height, - bool rb_aligned, bool pipe_aligned, bool use_uint16, - unsigned dcc_retile_num_elements, - const ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT *in) + const struct radeon_info *info, unsigned retile_width, + unsigned retile_height, bool rb_aligned, + bool pipe_aligned, bool use_uint16, + unsigned dcc_retile_num_elements, + const ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT *in) { - unsigned dcc_retile_map_size = dcc_retile_num_elements * (use_uint16 ? 2 : 4); - struct dcc_retile_map_key key; - - assert(in->numFrags == 1 && in->numSlices == 1 && in->numMipLevels == 1); - - memset(&key, 0, sizeof(key)); - key.family = info->family; - key.retile_width = retile_width; - key.retile_height = retile_height; - key.rb_aligned = rb_aligned; - key.pipe_aligned = pipe_aligned; - key.dcc_retile_num_elements = dcc_retile_num_elements; - memcpy(&key.input, in, sizeof(*in)); - - simple_mtx_lock(&addrlib->dcc_retile_map_lock); - - /* If we have already computed this retile map, get it from the hash table. */ - struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_maps, &key); - if (entry) { - uint32_t *map = entry->data; - simple_mtx_unlock(&addrlib->dcc_retile_map_lock); - return map; - } - - const struct dcc_retile_tile_data *src_tile = - ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp, - in->swizzleMode, - rb_aligned, pipe_aligned); - const struct dcc_retile_tile_data *dst_tile = - ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp, - in->swizzleMode, false, false); - if (!src_tile || !dst_tile) { - simple_mtx_unlock(&addrlib->dcc_retile_map_lock); - return NULL; - } - - void *dcc_retile_map = malloc(dcc_retile_map_size); - if (!dcc_retile_map) { - simple_mtx_unlock(&addrlib->dcc_retile_map_lock); - return NULL; - } - - unsigned index = 0; - unsigned w = DIV_ROUND_UP(retile_width, in->compressBlkWidth); - unsigned h = DIV_ROUND_UP(retile_height, in->compressBlkHeight); - unsigned src_stride = DIV_ROUND_UP(w, 1u << src_tile->tile_width_log2); - unsigned dst_stride = DIV_ROUND_UP(w, 1u << dst_tile->tile_width_log2); - - for (unsigned y = 0; y < h; ++y) { - for (unsigned x = 0; x < w; ++x) { - unsigned src_addr = ac_compute_retile_tile_addr(src_tile, src_stride, x, y); - unsigned dst_addr = ac_compute_retile_tile_addr(dst_tile, dst_stride, x, y); - - if (use_uint16) { - ((uint16_t*)dcc_retile_map)[2 * index] = src_addr; - ((uint16_t*)dcc_retile_map)[2 * index + 1] = dst_addr; - } else { - ((uint32_t*)dcc_retile_map)[2 * index] = src_addr; - ((uint32_t*)dcc_retile_map)[2 * index + 1] = dst_addr; - } - ++index; - } - } - - /* Fill the remaining pairs with the last one (for the compute shader). */ - for (unsigned i = index * 2; i < dcc_retile_num_elements; i++) { - if (use_uint16) - ((uint16_t*)dcc_retile_map)[i] = ((uint16_t*)dcc_retile_map)[i - 2]; - else - ((uint32_t*)dcc_retile_map)[i] = ((uint32_t*)dcc_retile_map)[i - 2]; - } - - /* Insert the retile map into the hash table, so that it can be reused and - * the computation can be skipped for similar image sizes. - */ - _mesa_hash_table_insert(addrlib->dcc_retile_maps, - mem_dup(&key, sizeof(key)), dcc_retile_map); - - simple_mtx_unlock(&addrlib->dcc_retile_map_lock); - return dcc_retile_map; + unsigned dcc_retile_map_size = dcc_retile_num_elements * (use_uint16 ? 
2 : 4); + struct dcc_retile_map_key key; + + assert(in->numFrags == 1 && in->numSlices == 1 && in->numMipLevels == 1); + + memset(&key, 0, sizeof(key)); + key.family = info->family; + key.retile_width = retile_width; + key.retile_height = retile_height; + key.rb_aligned = rb_aligned; + key.pipe_aligned = pipe_aligned; + key.dcc_retile_num_elements = dcc_retile_num_elements; + memcpy(&key.input, in, sizeof(*in)); + + simple_mtx_lock(&addrlib->dcc_retile_map_lock); + + /* If we have already computed this retile map, get it from the hash table. */ + struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_maps, &key); + if (entry) { + uint32_t *map = entry->data; + simple_mtx_unlock(&addrlib->dcc_retile_map_lock); + return map; + } + + const struct dcc_retile_tile_data *src_tile = ac_compute_dcc_retile_tile_indices( + addrlib, info, in->bpp, in->swizzleMode, rb_aligned, pipe_aligned); + const struct dcc_retile_tile_data *dst_tile = + ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp, in->swizzleMode, false, false); + if (!src_tile || !dst_tile) { + simple_mtx_unlock(&addrlib->dcc_retile_map_lock); + return NULL; + } + + void *dcc_retile_map = malloc(dcc_retile_map_size); + if (!dcc_retile_map) { + simple_mtx_unlock(&addrlib->dcc_retile_map_lock); + return NULL; + } + + unsigned index = 0; + unsigned w = DIV_ROUND_UP(retile_width, in->compressBlkWidth); + unsigned h = DIV_ROUND_UP(retile_height, in->compressBlkHeight); + unsigned src_stride = DIV_ROUND_UP(w, 1u << src_tile->tile_width_log2); + unsigned dst_stride = DIV_ROUND_UP(w, 1u << dst_tile->tile_width_log2); + + for (unsigned y = 0; y < h; ++y) { + for (unsigned x = 0; x < w; ++x) { + unsigned src_addr = ac_compute_retile_tile_addr(src_tile, src_stride, x, y); + unsigned dst_addr = ac_compute_retile_tile_addr(dst_tile, dst_stride, x, y); + + if (use_uint16) { + ((uint16_t *)dcc_retile_map)[2 * index] = src_addr; + ((uint16_t *)dcc_retile_map)[2 * index + 1] = dst_addr; + } else { + ((uint32_t *)dcc_retile_map)[2 * index] = src_addr; + ((uint32_t *)dcc_retile_map)[2 * index + 1] = dst_addr; + } + ++index; + } + } + + /* Fill the remaining pairs with the last one (for the compute shader). */ + for (unsigned i = index * 2; i < dcc_retile_num_elements; i++) { + if (use_uint16) + ((uint16_t *)dcc_retile_map)[i] = ((uint16_t *)dcc_retile_map)[i - 2]; + else + ((uint32_t *)dcc_retile_map)[i] = ((uint32_t *)dcc_retile_map)[i - 2]; + } + + /* Insert the retile map into the hash table, so that it can be reused and + * the computation can be skipped for similar image sizes. 
+ */ + _mesa_hash_table_insert(addrlib->dcc_retile_maps, mem_dup(&key, sizeof(key)), dcc_retile_map); + + simple_mtx_unlock(&addrlib->dcc_retile_map_lock); + return dcc_retile_map; } -static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput) +static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT *pInput) { - return malloc(pInput->sizeInBytes); + return malloc(pInput->sizeInBytes); } -static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput) +static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT *pInput) { - free(pInput->pVirtAddr); - return ADDR_OK; + free(pInput->pVirtAddr); + return ADDR_OK; } struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info, - const struct amdgpu_gpu_info *amdinfo, - uint64_t *max_alignment) + const struct amdgpu_gpu_info *amdinfo, uint64_t *max_alignment) { - ADDR_CREATE_INPUT addrCreateInput = {0}; - ADDR_CREATE_OUTPUT addrCreateOutput = {0}; - ADDR_REGISTER_VALUE regValue = {0}; - ADDR_CREATE_FLAGS createFlags = {{0}}; - ADDR_GET_MAX_ALIGNMENTS_OUTPUT addrGetMaxAlignmentsOutput = {0}; - ADDR_E_RETURNCODE addrRet; - - addrCreateInput.size = sizeof(ADDR_CREATE_INPUT); - addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT); - - regValue.gbAddrConfig = amdinfo->gb_addr_cfg; - createFlags.value = 0; - - addrCreateInput.chipFamily = info->family_id; - addrCreateInput.chipRevision = info->chip_external_rev; - - if (addrCreateInput.chipFamily == FAMILY_UNKNOWN) - return NULL; - - if (addrCreateInput.chipFamily >= FAMILY_AI) { - addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND; - } else { - regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3; - regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2; - - regValue.backendDisables = amdinfo->enabled_rb_pipes_mask; - regValue.pTileConfig = amdinfo->gb_tile_mode; - regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode); - if (addrCreateInput.chipFamily == FAMILY_SI) { - regValue.pMacroTileConfig = NULL; - regValue.noOfMacroEntries = 0; - } else { - regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode; - regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode); - } - - createFlags.useTileIndex = 1; - createFlags.useHtileSliceAlign = 1; - - addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; - } - - addrCreateInput.callbacks.allocSysMem = allocSysMem; - addrCreateInput.callbacks.freeSysMem = freeSysMem; - addrCreateInput.callbacks.debugPrint = 0; - addrCreateInput.createFlags = createFlags; - addrCreateInput.regValue = regValue; - - addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput); - if (addrRet != ADDR_OK) - return NULL; - - if (max_alignment) { - addrRet = AddrGetMaxAlignments(addrCreateOutput.hLib, &addrGetMaxAlignmentsOutput); - if (addrRet == ADDR_OK){ - *max_alignment = addrGetMaxAlignmentsOutput.baseAlign; - } - } - - struct ac_addrlib *addrlib = calloc(1, sizeof(struct ac_addrlib)); - if (!addrlib) { - AddrDestroy(addrCreateOutput.hLib); - return NULL; - } - - addrlib->handle = addrCreateOutput.hLib; - simple_mtx_init(&addrlib->dcc_retile_map_lock, mtx_plain); - addrlib->dcc_retile_maps = _mesa_hash_table_create(NULL, dcc_retile_map_hash_key, - dcc_retile_map_keys_equal); - addrlib->dcc_retile_tile_indices = _mesa_hash_table_create(NULL, dcc_retile_tile_hash_key, - dcc_retile_tile_keys_equal); - return addrlib; + ADDR_CREATE_INPUT addrCreateInput = {0}; + ADDR_CREATE_OUTPUT addrCreateOutput = {0}; + ADDR_REGISTER_VALUE regValue = {0}; + ADDR_CREATE_FLAGS createFlags = {{0}}; + 
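/* ac_compute_dcc_retile_map above emits interleaved pairs: element 2*i is
 * an offset in the pipe/RB-aligned DCC surface and element 2*i+1 the
 * matching offset in the unaligned, displayable copy. In the driver the
 * map is consumed by a compute shader; the CPU loop below is only an
 * illustration of the remap it performs.
 */
#include <stdint.h>

static void apply_retile_map_u16(const uint16_t *map, unsigned num_elements,
                                 const uint8_t *aligned_dcc, uint8_t *display_dcc)
{
   for (unsigned i = 0; i + 1 < num_elements; i += 2)
      display_dcc[map[i + 1]] = aligned_dcc[map[i]];
}

int main(void)
{
   const uint16_t map[4] = {0, 3, 1, 2}; /* two made-up pairs: 0->3, 1->2 */
   const uint8_t aligned[4] = {10, 20, 30, 40};
   uint8_t display[4] = {0};
   apply_retile_map_u16(map, 4, aligned, display);
   return (display[3] == 10 && display[2] == 20) ? 0 : 1;
}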
ADDR_GET_MAX_ALIGNMENTS_OUTPUT addrGetMaxAlignmentsOutput = {0}; + ADDR_E_RETURNCODE addrRet; + + addrCreateInput.size = sizeof(ADDR_CREATE_INPUT); + addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT); + + regValue.gbAddrConfig = amdinfo->gb_addr_cfg; + createFlags.value = 0; + + addrCreateInput.chipFamily = info->family_id; + addrCreateInput.chipRevision = info->chip_external_rev; + + if (addrCreateInput.chipFamily == FAMILY_UNKNOWN) + return NULL; + + if (addrCreateInput.chipFamily >= FAMILY_AI) { + addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND; + } else { + regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3; + regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2; + + regValue.backendDisables = amdinfo->enabled_rb_pipes_mask; + regValue.pTileConfig = amdinfo->gb_tile_mode; + regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode); + if (addrCreateInput.chipFamily == FAMILY_SI) { + regValue.pMacroTileConfig = NULL; + regValue.noOfMacroEntries = 0; + } else { + regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode; + regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode); + } + + createFlags.useTileIndex = 1; + createFlags.useHtileSliceAlign = 1; + + addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; + } + + addrCreateInput.callbacks.allocSysMem = allocSysMem; + addrCreateInput.callbacks.freeSysMem = freeSysMem; + addrCreateInput.callbacks.debugPrint = 0; + addrCreateInput.createFlags = createFlags; + addrCreateInput.regValue = regValue; + + addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput); + if (addrRet != ADDR_OK) + return NULL; + + if (max_alignment) { + addrRet = AddrGetMaxAlignments(addrCreateOutput.hLib, &addrGetMaxAlignmentsOutput); + if (addrRet == ADDR_OK) { + *max_alignment = addrGetMaxAlignmentsOutput.baseAlign; + } + } + + struct ac_addrlib *addrlib = calloc(1, sizeof(struct ac_addrlib)); + if (!addrlib) { + AddrDestroy(addrCreateOutput.hLib); + return NULL; + } + + addrlib->handle = addrCreateOutput.hLib; + simple_mtx_init(&addrlib->dcc_retile_map_lock, mtx_plain); + addrlib->dcc_retile_maps = + _mesa_hash_table_create(NULL, dcc_retile_map_hash_key, dcc_retile_map_keys_equal); + addrlib->dcc_retile_tile_indices = + _mesa_hash_table_create(NULL, dcc_retile_tile_hash_key, dcc_retile_tile_keys_equal); + return addrlib; } void ac_addrlib_destroy(struct ac_addrlib *addrlib) { - AddrDestroy(addrlib->handle); - simple_mtx_destroy(&addrlib->dcc_retile_map_lock); - _mesa_hash_table_destroy(addrlib->dcc_retile_maps, dcc_retile_map_free); - _mesa_hash_table_destroy(addrlib->dcc_retile_tile_indices, dcc_retile_tile_free); - free(addrlib); + AddrDestroy(addrlib->handle); + simple_mtx_destroy(&addrlib->dcc_retile_map_lock); + _mesa_hash_table_destroy(addrlib->dcc_retile_maps, dcc_retile_map_free); + _mesa_hash_table_destroy(addrlib->dcc_retile_tile_indices, dcc_retile_tile_free); + free(addrlib); } -static int surf_config_sanity(const struct ac_surf_config *config, - unsigned flags) +static int surf_config_sanity(const struct ac_surf_config *config, unsigned flags) { - /* FMASK is allocated together with the color surface and can't be - * allocated separately. - */ - assert(!(flags & RADEON_SURF_FMASK)); - if (flags & RADEON_SURF_FMASK) - return -EINVAL; - - /* all dimension must be at least 1 ! 
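/* Typical lifetime of the object created above, sketched assuming a
 * radeon_info/amdgpu_gpu_info pair already filled in by the winsys and the
 * declarations from ac_surface.h; error handling trimmed to the minimum.
 */
static int example_layout_pass(const struct radeon_info *info,
                               const struct amdgpu_gpu_info *amdinfo)
{
   uint64_t max_alignment = 0;
   struct ac_addrlib *addrlib = ac_addrlib_create(info, amdinfo, &max_alignment);
   if (!addrlib)
      return -1;
   /* ... ac_compute_surface() calls for each texture would go here ... */
   ac_addrlib_destroy(addrlib);
   return 0;
}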
*/
-	if (!config->info.width || !config->info.height || !config->info.depth ||
-	    !config->info.array_size || !config->info.levels)
-		return -EINVAL;
-
-	switch (config->info.samples) {
-	case 0:
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		break;
-	case 16:
-		if (flags & RADEON_SURF_Z_OR_SBUFFER)
-			return -EINVAL;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
-		switch (config->info.storage_samples) {
-		case 0:
-		case 1:
-		case 2:
-		case 4:
-		case 8:
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-
-	if (config->is_3d && config->info.array_size > 1)
-		return -EINVAL;
-	if (config->is_cube && config->info.depth > 1)
-		return -EINVAL;
-
-	return 0;
+   /* FMASK is allocated together with the color surface and can't be
+    * allocated separately.
+    */
+   assert(!(flags & RADEON_SURF_FMASK));
+   if (flags & RADEON_SURF_FMASK)
+      return -EINVAL;
+
+   /* All dimensions must be at least 1! */
+   if (!config->info.width || !config->info.height || !config->info.depth ||
+       !config->info.array_size || !config->info.levels)
+      return -EINVAL;
+
+   switch (config->info.samples) {
+   case 0:
+   case 1:
+   case 2:
+   case 4:
+   case 8:
+      break;
+   case 16:
+      if (flags & RADEON_SURF_Z_OR_SBUFFER)
+         return -EINVAL;
+      break;
+   default:
+      return -EINVAL;
+   }
+
+   if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
+      switch (config->info.storage_samples) {
+      case 0:
+      case 1:
+      case 2:
+      case 4:
+      case 8:
+         break;
+      default:
+         return -EINVAL;
+      }
+   }
+
+   if (config->is_3d && config->info.array_size > 1)
+      return -EINVAL;
+   if (config->is_cube && config->info.depth > 1)
+      return -EINVAL;
+
+   return 0;
 }
 
-static int gfx6_compute_level(ADDR_HANDLE addrlib,
-			      const struct ac_surf_config *config,
-			      struct radeon_surf *surf, bool is_stencil,
-			      unsigned level, bool compressed,
-			      ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
-			      ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
-			      ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-			      ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
-			      ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
-			      ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
+static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *config,
+                              struct radeon_surf *surf, bool is_stencil, unsigned level,
+                              bool compressed, ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
+                              ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
+                              ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
+                              ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+                              ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+                              ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
 {
-	struct legacy_surf_level *surf_level;
-	ADDR_E_RETURNCODE ret;
-
-	AddrSurfInfoIn->mipLevel = level;
-	AddrSurfInfoIn->width = u_minify(config->info.width, level);
-	AddrSurfInfoIn->height = u_minify(config->info.height, level);
-
-	/* Make GFX6 linear surfaces compatible with GFX9 for hybrid graphics,
-	 * because GFX9 needs linear alignment of 256 bytes.
-	 */
-	if (config->info.levels == 1 &&
-	    AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
-	    AddrSurfInfoIn->bpp &&
-	    util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) {
-		unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8);
-
-		AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
-	}
-
-	/* addrlib assumes the bytes/pixel is a divisor of 64, which is not
-	 * true for r32g32b32 formats. */
-	if (AddrSurfInfoIn->bpp == 96) {
-		assert(config->info.levels == 1);
-		assert(AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED);
-
-		/* The least common multiple of 64 bytes and 12 bytes/pixel is
-		 * 192 bytes, or 16 pixels. 
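/* The sample-count rule enforced by surf_config_sanity above, restated as
 * a standalone predicate with spot checks (a sketch mirroring the switch,
 * not driver code): 0/1/2/4/8 are always accepted, 16 only when the
 * surface is not a Z/stencil buffer.
 */
#include <stdbool.h>

static bool samples_ok(unsigned samples, bool z_or_sbuffer)
{
   switch (samples) {
   case 0: case 1: case 2: case 4: case 8:
      return true;
   case 16:
      return !z_or_sbuffer;
   default:
      return false;
   }
}

int main(void)
{
   return (samples_ok(8, true) && samples_ok(16, false) &&
           !samples_ok(16, true) && !samples_ok(3, false)) ? 0 : 1;
}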
*/ - AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, 16); - } - - if (config->is_3d) - AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level); - else if (config->is_cube) - AddrSurfInfoIn->numSlices = 6; - else - AddrSurfInfoIn->numSlices = config->info.array_size; - - if (level > 0) { - /* Set the base level pitch. This is needed for calculation - * of non-zero levels. */ - if (is_stencil) - AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x; - else - AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x; - - /* Convert blocks to pixels for compressed formats. */ - if (compressed) - AddrSurfInfoIn->basePitch *= surf->blk_w; - } - - ret = AddrComputeSurfaceInfo(addrlib, - AddrSurfInfoIn, - AddrSurfInfoOut); - if (ret != ADDR_OK) { - return ret; - } - - surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level]; - surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign); - surf_level->slice_size_dw = AddrSurfInfoOut->sliceSize / 4; - surf_level->nblk_x = AddrSurfInfoOut->pitch; - surf_level->nblk_y = AddrSurfInfoOut->height; - - switch (AddrSurfInfoOut->tileMode) { - case ADDR_TM_LINEAR_ALIGNED: - surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - break; - case ADDR_TM_1D_TILED_THIN1: - surf_level->mode = RADEON_SURF_MODE_1D; - break; - case ADDR_TM_2D_TILED_THIN1: - surf_level->mode = RADEON_SURF_MODE_2D; - break; - default: - assert(0); - } - - if (is_stencil) - surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex; - else - surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex; - - surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize; - - /* Clear DCC fields at the beginning. */ - surf_level->dcc_offset = 0; - - /* The previous level's flag tells us if we can use DCC for this level. */ - if (AddrSurfInfoIn->flags.dccCompatible && - (level == 0 || AddrDccOut->subLvlCompressible)) { - bool prev_level_clearable = level == 0 || - AddrDccOut->dccRamSizeAligned; - - AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; - AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; - AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; - AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; - AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; - - ret = AddrComputeDccInfo(addrlib, - AddrDccIn, - AddrDccOut); - - if (ret == ADDR_OK) { - surf_level->dcc_offset = surf->dcc_size; - surf->num_dcc_levels = level + 1; - surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; - surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); - - /* If the DCC size of a subresource (1 mip level or 1 slice) - * is not aligned, the DCC memory layout is not contiguous for - * that subresource, which means we can't use fast clear. - * - * We only do fast clears for whole mipmap levels. If we did - * per-slice fast clears, the same restriction would apply. - * (i.e. only compute the slice size and see if it's aligned) - * - * The last level can be non-contiguous and still be clearable - * if it's interleaved with the next level that doesn't exist. - */ - if (AddrDccOut->dccRamSizeAligned || - (prev_level_clearable && level == config->info.levels - 1)) - surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; - else - surf_level->dcc_fast_clear_size = 0; - - /* Compute the DCC slice size because addrlib doesn't - * provide this info. As DCC memory is linear (each - * slice is the same size) it's easy to compute. 
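/* The 16-pixel alignment applied above can be double-checked: pitches must
 * land on 64-byte multiples, and at 12 bytes/pixel the first one is
 * lcm(64, 12) = 192 bytes, i.e. 16 pixels. A throwaway verification:
 */
#include <assert.h>

static unsigned gcd(unsigned a, unsigned b)
{
   while (b) {
      unsigned t = a % b;
      a = b;
      b = t;
   }
   return a;
}

int main(void)
{
   unsigned bpe = 12; /* r32g32b32 */
   unsigned lcm = 64 / gcd(64, bpe) * bpe;
   assert(lcm == 192 && lcm / bpe == 16);
   return 0;
}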
- */ - surf->dcc_slice_size = AddrDccOut->dccRamSize / config->info.array_size; - - /* For arrays, we have to compute the DCC info again - * with one slice size to get a correct fast clear - * size. - */ - if (config->info.array_size > 1) { - AddrDccIn->colorSurfSize = AddrSurfInfoOut->sliceSize; - AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; - AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; - AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; - AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; - - ret = AddrComputeDccInfo(addrlib, - AddrDccIn, AddrDccOut); - if (ret == ADDR_OK) { - /* If the DCC memory isn't properly - * aligned, the data are interleaved - * accross slices. - */ - if (AddrDccOut->dccRamSizeAligned) - surf_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize; - else - surf_level->dcc_slice_fast_clear_size = 0; - } - - if (surf->flags & RADEON_SURF_CONTIGUOUS_DCC_LAYERS && - surf->dcc_slice_size != surf_level->dcc_slice_fast_clear_size) { - surf->dcc_size = 0; - surf->num_dcc_levels = 0; - AddrDccOut->subLvlCompressible = false; - } - } else { - surf_level->dcc_slice_fast_clear_size = surf_level->dcc_fast_clear_size; - } - } - } - - /* HTILE. */ - if (!is_stencil && - AddrSurfInfoIn->flags.depth && - surf_level->mode == RADEON_SURF_MODE_2D && - level == 0 && - !(surf->flags & RADEON_SURF_NO_HTILE)) { - AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible; - AddrHtileIn->pitch = AddrSurfInfoOut->pitch; - AddrHtileIn->height = AddrSurfInfoOut->height; - AddrHtileIn->numSlices = AddrSurfInfoOut->depth; - AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8; - AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8; - AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo; - AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex; - AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; - - ret = AddrComputeHtileInfo(addrlib, - AddrHtileIn, - AddrHtileOut); - - if (ret == ADDR_OK) { - surf->htile_size = AddrHtileOut->htileBytes; - surf->htile_slice_size = AddrHtileOut->sliceSize; - surf->htile_alignment = AddrHtileOut->baseAlign; - } - } - - return 0; + struct legacy_surf_level *surf_level; + ADDR_E_RETURNCODE ret; + + AddrSurfInfoIn->mipLevel = level; + AddrSurfInfoIn->width = u_minify(config->info.width, level); + AddrSurfInfoIn->height = u_minify(config->info.height, level); + + /* Make GFX6 linear surfaces compatible with GFX9 for hybrid graphics, + * because GFX9 needs linear alignment of 256 bytes. + */ + if (config->info.levels == 1 && AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED && + AddrSurfInfoIn->bpp && util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) { + unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8); + + AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment); + } + + /* addrlib assumes the bytes/pixel is a divisor of 64, which is not + * true for r32g32b32 formats. */ + if (AddrSurfInfoIn->bpp == 96) { + assert(config->info.levels == 1); + assert(AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED); + + /* The least common multiple of 64 bytes and 12 bytes/pixel is + * 192 bytes, or 16 pixels. */ + AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, 16); + } + + if (config->is_3d) + AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level); + else if (config->is_cube) + AddrSurfInfoIn->numSlices = 6; + else + AddrSurfInfoIn->numSlices = config->info.array_size; + + if (level > 0) { + /* Set the base level pitch. This is needed for calculation + * of non-zero levels. 
*/ + if (is_stencil) + AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x; + else + AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x; + + /* Convert blocks to pixels for compressed formats. */ + if (compressed) + AddrSurfInfoIn->basePitch *= surf->blk_w; + } + + ret = AddrComputeSurfaceInfo(addrlib, AddrSurfInfoIn, AddrSurfInfoOut); + if (ret != ADDR_OK) { + return ret; + } + + surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level]; + surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign); + surf_level->slice_size_dw = AddrSurfInfoOut->sliceSize / 4; + surf_level->nblk_x = AddrSurfInfoOut->pitch; + surf_level->nblk_y = AddrSurfInfoOut->height; + + switch (AddrSurfInfoOut->tileMode) { + case ADDR_TM_LINEAR_ALIGNED: + surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + break; + case ADDR_TM_1D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_1D; + break; + case ADDR_TM_2D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_2D; + break; + default: + assert(0); + } + + if (is_stencil) + surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex; + else + surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex; + + surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize; + + /* Clear DCC fields at the beginning. */ + surf_level->dcc_offset = 0; + + /* The previous level's flag tells us if we can use DCC for this level. */ + if (AddrSurfInfoIn->flags.dccCompatible && (level == 0 || AddrDccOut->subLvlCompressible)) { + bool prev_level_clearable = level == 0 || AddrDccOut->dccRamSizeAligned; + + AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; + AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; + AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; + AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeDccInfo(addrlib, AddrDccIn, AddrDccOut); + + if (ret == ADDR_OK) { + surf_level->dcc_offset = surf->dcc_size; + surf->num_dcc_levels = level + 1; + surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; + surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); + + /* If the DCC size of a subresource (1 mip level or 1 slice) + * is not aligned, the DCC memory layout is not contiguous for + * that subresource, which means we can't use fast clear. + * + * We only do fast clears for whole mipmap levels. If we did + * per-slice fast clears, the same restriction would apply. + * (i.e. only compute the slice size and see if it's aligned) + * + * The last level can be non-contiguous and still be clearable + * if it's interleaved with the next level that doesn't exist. + */ + if (AddrDccOut->dccRamSizeAligned || + (prev_level_clearable && level == config->info.levels - 1)) + surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; + else + surf_level->dcc_fast_clear_size = 0; + + /* Compute the DCC slice size because addrlib doesn't + * provide this info. As DCC memory is linear (each + * slice is the same size) it's easy to compute. + */ + surf->dcc_slice_size = AddrDccOut->dccRamSize / config->info.array_size; + + /* For arrays, we have to compute the DCC info again + * with one slice size to get a correct fast clear + * size. 
+          */
+         if (config->info.array_size > 1) {
+            AddrDccIn->colorSurfSize = AddrSurfInfoOut->sliceSize;
+            AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
+            AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
+            AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
+            AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+            ret = AddrComputeDccInfo(addrlib, AddrDccIn, AddrDccOut);
+            if (ret == ADDR_OK) {
+               /* If the DCC memory isn't properly
+                * aligned, the data are interleaved
+                * across slices.
+                */
+               if (AddrDccOut->dccRamSizeAligned)
+                  surf_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
+               else
+                  surf_level->dcc_slice_fast_clear_size = 0;
+            }
+
+            if (surf->flags & RADEON_SURF_CONTIGUOUS_DCC_LAYERS &&
+                surf->dcc_slice_size != surf_level->dcc_slice_fast_clear_size) {
+               surf->dcc_size = 0;
+               surf->num_dcc_levels = 0;
+               AddrDccOut->subLvlCompressible = false;
+            }
+         } else {
+            surf_level->dcc_slice_fast_clear_size = surf_level->dcc_fast_clear_size;
+         }
+      }
+   }
+
+   /* HTILE. */
+   if (!is_stencil && AddrSurfInfoIn->flags.depth && surf_level->mode == RADEON_SURF_MODE_2D &&
+       level == 0 && !(surf->flags & RADEON_SURF_NO_HTILE)) {
+      AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible;
+      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+      AddrHtileIn->height = AddrSurfInfoOut->height;
+      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeHtileInfo(addrlib, AddrHtileIn, AddrHtileOut);
+
+      if (ret == ADDR_OK) {
+         surf->htile_size = AddrHtileOut->htileBytes;
+         surf->htile_slice_size = AddrHtileOut->sliceSize;
+         surf->htile_alignment = AddrHtileOut->baseAlign;
+      }
+   }
+
+   return 0;
 }
 
-static void gfx6_set_micro_tile_mode(struct radeon_surf *surf,
-				     const struct radeon_info *info)
+static void gfx6_set_micro_tile_mode(struct radeon_surf *surf, const struct radeon_info *info)
 {
-	uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
+   uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
 
-	if (info->chip_class >= GFX7)
-		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
-	else
-		surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
+   if (info->chip_class >= GFX7)
+      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
+   else
+      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
 }
 
 static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
 {
-	unsigned index, tileb;
+   unsigned index, tileb;
 
-	tileb = 8 * 8 * surf->bpe;
-	tileb = MIN2(surf->u.legacy.tile_split, tileb);
+   tileb = 8 * 8 * surf->bpe;
+   tileb = MIN2(surf->u.legacy.tile_split, tileb);
 
-	for (index = 0; tileb > 64; index++)
-		tileb >>= 1;
+   for (index = 0; tileb > 64; index++)
+      tileb >>= 1;
 
-	assert(index < 16);
-	return index;
+   assert(index < 16);
+   return index;
 }
 
-static bool get_display_flag(const struct ac_surf_config *config,
-			     const struct radeon_surf *surf)
+static bool get_display_flag(const struct ac_surf_config *config, const struct radeon_surf *surf)
 {
-	unsigned num_channels = config->info.num_channels;
-	unsigned bpe = surf->bpe;
-
-	if (!config->is_3d &&
-	    !config->is_cube &&
-	    !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
-	    surf->flags & RADEON_SURF_SCANOUT &&
-	    config->info.samples <= 1 &&
-	    surf->blk_w <= 2 && 
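/* The clearability rule implemented above, condensed: a level keeps a
 * nonzero fast-clear size only when its DCC range is contiguous
 * (dccRamSizeAligned), or when it is the last level and every level before
 * it was clearable. A sketch; the real inputs come from addrlib.
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t level_fast_clear_size(bool ram_size_aligned, bool prev_clearable,
                                      unsigned level, unsigned num_levels,
                                      uint64_t fast_clear_size)
{
   if (ram_size_aligned || (prev_clearable && level == num_levels - 1))
      return fast_clear_size;
   return 0;
}

int main(void)
{
   /* A middle level with unaligned DCC is not fast-clearable; the last
    * level may be, if everything before it was. */
   return (level_fast_clear_size(false, true, 3, 8, 4096) == 0 &&
           level_fast_clear_size(false, true, 7, 8, 4096) == 4096) ? 0 : 1;
}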
surf->blk_h == 1) { - /* subsampled */ - if (surf->blk_w == 2 && surf->blk_h == 1) - return true; - - if (/* RGBA8 or RGBA16F */ - (bpe >= 4 && bpe <= 8 && num_channels == 4) || - /* R5G6B5 or R5G5B5A1 */ - (bpe == 2 && num_channels >= 3) || - /* C8 palette */ - (bpe == 1 && num_channels == 1)) - return true; - } - return false; + unsigned num_channels = config->info.num_channels; + unsigned bpe = surf->bpe; + + if (!config->is_3d && !config->is_cube && !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && + surf->flags & RADEON_SURF_SCANOUT && config->info.samples <= 1 && surf->blk_w <= 2 && + surf->blk_h == 1) { + /* subsampled */ + if (surf->blk_w == 2 && surf->blk_h == 1) + return true; + + if (/* RGBA8 or RGBA16F */ + (bpe >= 4 && bpe <= 8 && num_channels == 4) || + /* R5G6B5 or R5G5B5A1 */ + (bpe == 2 && num_channels >= 3) || + /* C8 palette */ + (bpe == 1 && num_channels == 1)) + return true; + } + return false; } /** @@ -745,119 +716,114 @@ static bool get_display_flag(const struct ac_surf_config *config, * Copy surface-global settings like pipe/bank config from level 0 surface * computation, and compute tile swizzle. */ -static int gfx6_surface_settings(ADDR_HANDLE addrlib, - const struct radeon_info *info, - const struct ac_surf_config *config, - ADDR_COMPUTE_SURFACE_INFO_OUTPUT* csio, - struct radeon_surf *surf) +static int gfx6_surface_settings(ADDR_HANDLE addrlib, const struct radeon_info *info, + const struct ac_surf_config *config, + ADDR_COMPUTE_SURFACE_INFO_OUTPUT *csio, struct radeon_surf *surf) { - surf->surf_alignment = csio->baseAlign; - surf->u.legacy.pipe_config = csio->pTileInfo->pipeConfig - 1; - gfx6_set_micro_tile_mode(surf, info); - - /* For 2D modes only. */ - if (csio->tileMode >= ADDR_TM_2D_TILED_THIN1) { - surf->u.legacy.bankw = csio->pTileInfo->bankWidth; - surf->u.legacy.bankh = csio->pTileInfo->bankHeight; - surf->u.legacy.mtilea = csio->pTileInfo->macroAspectRatio; - surf->u.legacy.tile_split = csio->pTileInfo->tileSplitBytes; - surf->u.legacy.num_banks = csio->pTileInfo->banks; - surf->u.legacy.macro_tile_index = csio->macroModeIndex; - } else { - surf->u.legacy.macro_tile_index = 0; - } - - /* Compute tile swizzle. */ - /* TODO: fix tile swizzle with mipmapping for GFX6 */ - if ((info->chip_class >= GFX7 || config->info.levels == 1) && - config->info.surf_index && - surf->u.legacy.level[0].mode == RADEON_SURF_MODE_2D && - !(surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_SHAREABLE)) && - !get_display_flag(config, surf)) { - ADDR_COMPUTE_BASE_SWIZZLE_INPUT AddrBaseSwizzleIn = {0}; - ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT AddrBaseSwizzleOut = {0}; - - AddrBaseSwizzleIn.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT); - AddrBaseSwizzleOut.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT); - - AddrBaseSwizzleIn.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1; - AddrBaseSwizzleIn.tileIndex = csio->tileIndex; - AddrBaseSwizzleIn.macroModeIndex = csio->macroModeIndex; - AddrBaseSwizzleIn.pTileInfo = csio->pTileInfo; - AddrBaseSwizzleIn.tileMode = csio->tileMode; - - int r = AddrComputeBaseSwizzle(addrlib, &AddrBaseSwizzleIn, - &AddrBaseSwizzleOut); - if (r != ADDR_OK) - return r; - - assert(AddrBaseSwizzleOut.tileSwizzle <= - u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); - surf->tile_swizzle = AddrBaseSwizzleOut.tileSwizzle; - } - return 0; + surf->surf_alignment = csio->baseAlign; + surf->u.legacy.pipe_config = csio->pTileInfo->pipeConfig - 1; + gfx6_set_micro_tile_mode(surf, info); + + /* For 2D modes only. 
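/* The format cases accepted by get_display_flag above, restated as a
 * standalone predicate with spot checks. This assumes the other
 * preconditions (scanout, single-sampled, plain 2D color) already passed.
 */
#include <assert.h>
#include <stdbool.h>

static bool displayable_format(unsigned bpe, unsigned num_channels,
                               unsigned blk_w, unsigned blk_h)
{
   if (blk_w == 2 && blk_h == 1) /* subsampled, e.g. packed YUV */
      return true;
   return (bpe >= 4 && bpe <= 8 && num_channels == 4) || /* RGBA8 .. RGBA16F */
          (bpe == 2 && num_channels >= 3) ||             /* R5G6B5, R5G5B5A1 */
          (bpe == 1 && num_channels == 1);               /* C8 palette */
}

int main(void)
{
   assert(displayable_format(4, 4, 1, 1));  /* RGBA8: displayable */
   assert(!displayable_format(4, 2, 1, 1)); /* two-channel 32bpp: not */
   return 0;
}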
*/ + if (csio->tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->u.legacy.bankw = csio->pTileInfo->bankWidth; + surf->u.legacy.bankh = csio->pTileInfo->bankHeight; + surf->u.legacy.mtilea = csio->pTileInfo->macroAspectRatio; + surf->u.legacy.tile_split = csio->pTileInfo->tileSplitBytes; + surf->u.legacy.num_banks = csio->pTileInfo->banks; + surf->u.legacy.macro_tile_index = csio->macroModeIndex; + } else { + surf->u.legacy.macro_tile_index = 0; + } + + /* Compute tile swizzle. */ + /* TODO: fix tile swizzle with mipmapping for GFX6 */ + if ((info->chip_class >= GFX7 || config->info.levels == 1) && config->info.surf_index && + surf->u.legacy.level[0].mode == RADEON_SURF_MODE_2D && + !(surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_SHAREABLE)) && + !get_display_flag(config, surf)) { + ADDR_COMPUTE_BASE_SWIZZLE_INPUT AddrBaseSwizzleIn = {0}; + ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT AddrBaseSwizzleOut = {0}; + + AddrBaseSwizzleIn.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT); + AddrBaseSwizzleOut.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT); + + AddrBaseSwizzleIn.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1; + AddrBaseSwizzleIn.tileIndex = csio->tileIndex; + AddrBaseSwizzleIn.macroModeIndex = csio->macroModeIndex; + AddrBaseSwizzleIn.pTileInfo = csio->pTileInfo; + AddrBaseSwizzleIn.tileMode = csio->tileMode; + + int r = AddrComputeBaseSwizzle(addrlib, &AddrBaseSwizzleIn, &AddrBaseSwizzleOut); + if (r != ADDR_OK) + return r; + + assert(AddrBaseSwizzleOut.tileSwizzle <= + u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); + surf->tile_swizzle = AddrBaseSwizzleOut.tileSwizzle; + } + return 0; } -static void ac_compute_cmask(const struct radeon_info *info, - const struct ac_surf_config *config, - struct radeon_surf *surf) +static void ac_compute_cmask(const struct radeon_info *info, const struct ac_surf_config *config, + struct radeon_surf *surf) { - unsigned pipe_interleave_bytes = info->pipe_interleave_bytes; - unsigned num_pipes = info->num_tile_pipes; - unsigned cl_width, cl_height; - - if (surf->flags & RADEON_SURF_Z_OR_SBUFFER || surf->is_linear || - (config->info.samples >= 2 && !surf->fmask_size)) - return; - - assert(info->chip_class <= GFX8); - - switch (num_pipes) { - case 2: - cl_width = 32; - cl_height = 16; - break; - case 4: - cl_width = 32; - cl_height = 32; - break; - case 8: - cl_width = 64; - cl_height = 32; - break; - case 16: /* Hawaii */ - cl_width = 64; - cl_height = 64; - break; - default: - assert(0); - return; - } - - unsigned base_align = num_pipes * pipe_interleave_bytes; - - unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width*8); - unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height*8); - unsigned slice_elements = (width * height) / (8*8); - - /* Each element of CMASK is a nibble. 
*/ - unsigned slice_bytes = slice_elements / 2; - - surf->u.legacy.cmask_slice_tile_max = (width * height) / (128*128); - if (surf->u.legacy.cmask_slice_tile_max) - surf->u.legacy.cmask_slice_tile_max -= 1; - - unsigned num_layers; - if (config->is_3d) - num_layers = config->info.depth; - else if (config->is_cube) - num_layers = 6; - else - num_layers = config->info.array_size; - - surf->cmask_alignment = MAX2(256, base_align); - surf->cmask_slice_size = align(slice_bytes, base_align); - surf->cmask_size = surf->cmask_slice_size * num_layers; + unsigned pipe_interleave_bytes = info->pipe_interleave_bytes; + unsigned num_pipes = info->num_tile_pipes; + unsigned cl_width, cl_height; + + if (surf->flags & RADEON_SURF_Z_OR_SBUFFER || surf->is_linear || + (config->info.samples >= 2 && !surf->fmask_size)) + return; + + assert(info->chip_class <= GFX8); + + switch (num_pipes) { + case 2: + cl_width = 32; + cl_height = 16; + break; + case 4: + cl_width = 32; + cl_height = 32; + break; + case 8: + cl_width = 64; + cl_height = 32; + break; + case 16: /* Hawaii */ + cl_width = 64; + cl_height = 64; + break; + default: + assert(0); + return; + } + + unsigned base_align = num_pipes * pipe_interleave_bytes; + + unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width * 8); + unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height * 8); + unsigned slice_elements = (width * height) / (8 * 8); + + /* Each element of CMASK is a nibble. */ + unsigned slice_bytes = slice_elements / 2; + + surf->u.legacy.cmask_slice_tile_max = (width * height) / (128 * 128); + if (surf->u.legacy.cmask_slice_tile_max) + surf->u.legacy.cmask_slice_tile_max -= 1; + + unsigned num_layers; + if (config->is_3d) + num_layers = config->info.depth; + else if (config->is_cube) + num_layers = 6; + else + num_layers = config->info.array_size; + + surf->cmask_alignment = MAX2(256, base_align); + surf->cmask_slice_size = align(slice_bytes, base_align); + surf->cmask_size = surf->cmask_slice_size * num_layers; } /** @@ -866,1416 +832,1327 @@ static void ac_compute_cmask(const struct radeon_info *info, * The following fields of \p surf must be initialized by the caller: * blk_w, blk_h, bpe, flags. */ -static int gfx6_compute_surface(ADDR_HANDLE addrlib, - const struct radeon_info *info, - const struct ac_surf_config *config, - enum radeon_surf_mode mode, - struct radeon_surf *surf) +static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info, + const struct ac_surf_config *config, enum radeon_surf_mode mode, + struct radeon_surf *surf) { - unsigned level; - bool compressed; - ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; - ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; - ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; - ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; - ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0}; - ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0}; - ADDR_TILEINFO AddrTileInfoIn = {0}; - ADDR_TILEINFO AddrTileInfoOut = {0}; - int r; - - AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); - AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); - AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); - AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); - AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT); - AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT); - AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; - - compressed = surf->blk_w == 4 && surf->blk_h == 4; - - /* MSAA requires 2D tiling. 
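/* A worked instance of the CMASK sizing above, assuming a hypothetical
 * 4-pipe part (cl_width = cl_height = 32) and a 1000x800-block level 0.
 * One nibble covers one 8x8 tile, hence the divide by 64 and then by 2.
 */
#include <assert.h>

static unsigned align_pot(unsigned value, unsigned alignment) /* alignment: power of two */
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
   unsigned width = align_pot(1000, 32 * 8);             /* -> 1024 */
   unsigned height = align_pot(800, 32 * 8);             /* -> 1024 */
   unsigned slice_elements = (width * height) / (8 * 8); /* 16384 tiles */
   unsigned slice_bytes = slice_elements / 2;            /* 8192 bytes */
   assert(slice_bytes == 8192);
   return 0;
}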
*/ - if (config->info.samples > 1) - mode = RADEON_SURF_MODE_2D; - - /* DB doesn't support linear layouts. */ - if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && - mode < RADEON_SURF_MODE_1D) - mode = RADEON_SURF_MODE_1D; - - /* Set the requested tiling mode. */ - switch (mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; - break; - case RADEON_SURF_MODE_1D: - AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; - break; - case RADEON_SURF_MODE_2D: - AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1; - break; - default: - assert(0); - } - - /* The format must be set correctly for the allocation of compressed - * textures to work. In other cases, setting the bpp is sufficient. - */ - if (compressed) { - switch (surf->bpe) { - case 8: - AddrSurfInfoIn.format = ADDR_FMT_BC1; - break; - case 16: - AddrSurfInfoIn.format = ADDR_FMT_BC3; - break; - default: - assert(0); - } - } - else { - AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; - } - - AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = - MAX2(1, config->info.samples); - AddrSurfInfoIn.tileIndex = -1; - - if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) { - AddrDccIn.numSamples = AddrSurfInfoIn.numFrags = - MAX2(1, config->info.storage_samples); - } - - /* Set the micro tile type. */ - if (surf->flags & RADEON_SURF_SCANOUT) - AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE; - else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) - AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER; - else - AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE; - - AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); - AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; - AddrSurfInfoIn.flags.cube = config->is_cube; - AddrSurfInfoIn.flags.display = get_display_flag(config, surf); - AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1; - AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0; - - /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been - * requested, because TC-compatible HTILE requires 2D tiling. - */ - AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible && - !AddrSurfInfoIn.flags.fmask && - config->info.samples <= 1 && - !(surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE); - - /* DCC notes: - * - If we add MSAA support, keep in mind that CB can't decompress 8bpp - * with samples >= 4. - * - Mipmapped array textures have low performance (discovered by a closed - * driver team). - */ - AddrSurfInfoIn.flags.dccCompatible = - info->chip_class >= GFX8 && - info->has_graphics && /* disable DCC on compute-only chips */ - !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && - !(surf->flags & RADEON_SURF_DISABLE_DCC) && - !compressed && - ((config->info.array_size == 1 && config->info.depth == 1) || - config->info.levels == 1); - - AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; - AddrSurfInfoIn.flags.compressZ = !!(surf->flags & RADEON_SURF_Z_OR_SBUFFER); - - /* On GFX7-GFX8, the DB uses the same pitch and tile mode (except tilesplit) - * for Z and stencil. This can cause a number of problems which we work - * around here: - * - * - a depth part that is incompatible with mipmapped texturing - * - at least on Stoney, entirely incompatible Z/S aspects (e.g. 
- * incorrect tiling applied to the stencil part, stencil buffer - * memory accesses that go out of bounds) even without mipmapping - * - * Some piglit tests that are prone to different types of related - * failures: - * ./bin/ext_framebuffer_multisample-upsample 2 stencil - * ./bin/framebuffer-blit-levels {draw,read} stencil - * ./bin/ext_framebuffer_multisample-unaligned-blit N {depth,stencil} {msaa,upsample,downsample} - * ./bin/fbo-depth-array fs-writes-{depth,stencil} / {depth,stencil}-{clear,layered-clear,draw} - * ./bin/depthstencil-render-miplevels 1024 d=s=z24_s8 - */ - int stencil_tile_idx = -1; - - if (AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.noStencil && - (config->info.levels > 1 || info->family == CHIP_STONEY)) { - /* Compute stencilTileIdx that is compatible with the (depth) - * tileIdx. This degrades the depth surface if necessary to - * ensure that a matching stencilTileIdx exists. */ - AddrSurfInfoIn.flags.matchStencilTileCfg = 1; - - /* Keep the depth mip-tail compatible with texturing. */ - AddrSurfInfoIn.flags.noStencil = 1; - } - - /* Set preferred macrotile parameters. This is usually required - * for shared resources. This is for 2D tiling only. */ - if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && - surf->u.legacy.bankw && surf->u.legacy.bankh && - surf->u.legacy.mtilea && surf->u.legacy.tile_split) { - /* If any of these parameters are incorrect, the calculation - * will fail. */ - AddrTileInfoIn.banks = surf->u.legacy.num_banks; - AddrTileInfoIn.bankWidth = surf->u.legacy.bankw; - AddrTileInfoIn.bankHeight = surf->u.legacy.bankh; - AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea; - AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split; - AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */ - AddrSurfInfoIn.flags.opt4Space = 0; - AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; - - /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set - * the tile index, because we are expected to know it if - * we know the other parameters. - * - * This is something that can easily be fixed in Addrlib. - * For now, just figure it out here. - * Note that only 2D_TILE_THIN1 is handled here. - */ - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); - - if (info->chip_class == GFX6) { - if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { - if (surf->bpe == 2) - AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ - else - AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ - } else { - if (surf->bpe == 1) - AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ - else if (surf->bpe == 2) - AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ - else if (surf->bpe == 4) - AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ - else - AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ - } - } else { - /* GFX7 - GFX8 */ - if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) - AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ - else - AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ - - /* Addrlib doesn't set this if tileIndex is forced like above. 
*/ - AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf); - } - } - - surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER); - surf->num_dcc_levels = 0; - surf->surf_size = 0; - surf->dcc_size = 0; - surf->dcc_alignment = 1; - surf->htile_size = 0; - surf->htile_slice_size = 0; - surf->htile_alignment = 1; - - const bool only_stencil = (surf->flags & RADEON_SURF_SBUFFER) && - !(surf->flags & RADEON_SURF_ZBUFFER); - - /* Calculate texture layout information. */ - if (!only_stencil) { - for (level = 0; level < config->info.levels; level++) { - r = gfx6_compute_level(addrlib, config, surf, false, level, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut, - &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut); - if (r) - return r; - - if (level > 0) - continue; - - if (!AddrSurfInfoOut.tcCompatible) { - AddrSurfInfoIn.flags.tcCompatible = 0; - surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - if (AddrSurfInfoIn.flags.matchStencilTileCfg) { - AddrSurfInfoIn.flags.matchStencilTileCfg = 0; - AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex; - stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx; - - assert(stencil_tile_idx >= 0); - } - - r = gfx6_surface_settings(addrlib, info, config, - &AddrSurfInfoOut, surf); - if (r) - return r; - } - } - - /* Calculate texture layout information for stencil. */ - if (surf->flags & RADEON_SURF_SBUFFER) { - AddrSurfInfoIn.tileIndex = stencil_tile_idx; - AddrSurfInfoIn.bpp = 8; - AddrSurfInfoIn.flags.depth = 0; - AddrSurfInfoIn.flags.stencil = 1; - AddrSurfInfoIn.flags.tcCompatible = 0; - /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */ - AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split; - - for (level = 0; level < config->info.levels; level++) { - r = gfx6_compute_level(addrlib, config, surf, true, level, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut, - &AddrDccIn, &AddrDccOut, - NULL, NULL); - if (r) - return r; - - /* DB uses the depth pitch for both stencil and depth. */ - if (!only_stencil) { - if (surf->u.legacy.stencil_level[level].nblk_x != - surf->u.legacy.level[level].nblk_x) - surf->u.legacy.stencil_adjusted = true; - } else { - surf->u.legacy.level[level].nblk_x = - surf->u.legacy.stencil_level[level].nblk_x; - } - - if (level == 0) { - if (only_stencil) { - r = gfx6_surface_settings(addrlib, info, config, - &AddrSurfInfoOut, surf); - if (r) - return r; - } - - /* For 2D modes only. */ - if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { - surf->u.legacy.stencil_tile_split = - AddrSurfInfoOut.pTileInfo->tileSplitBytes; - } - } - } - } - - /* Compute FMASK. 
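/* The GFX6 and GFX7-GFX8 tile indices forced above, condensed into one
 * helper for readability (a sketch mirroring the code; the values in the
 * code are authoritative):
 */
#include <stdbool.h>

static int forced_2d_tile_index(bool gfx6, bool displayable, unsigned bpe)
{
   if (!gfx6) /* GFX7 - GFX8 */
      return displayable ? 10 : 14;
   if (displayable)
      return bpe == 2 ? 11 : 12; /* 16bpp vs 32bpp */
   switch (bpe) {
   case 1:  return 14;           /* 8bpp */
   case 2:  return 15;           /* 16bpp */
   case 4:  return 16;           /* 32bpp */
   default: return 17;           /* 64bpp and 128bpp */
   }
}

int main(void)
{
   return (forced_2d_tile_index(true, true, 2) == 11 &&
           forced_2d_tile_index(false, false, 4) == 14) ? 0 : 1;
}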
*/ - if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color && - info->has_graphics && !(surf->flags & RADEON_SURF_NO_FMASK)) { - ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0}; - ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; - ADDR_TILEINFO fmask_tile_info = {}; - - fin.size = sizeof(fin); - fout.size = sizeof(fout); - - fin.tileMode = AddrSurfInfoOut.tileMode; - fin.pitch = AddrSurfInfoOut.pitch; - fin.height = config->info.height; - fin.numSlices = AddrSurfInfoIn.numSlices; - fin.numSamples = AddrSurfInfoIn.numSamples; - fin.numFrags = AddrSurfInfoIn.numFrags; - fin.tileIndex = -1; - fout.pTileInfo = &fmask_tile_info; - - r = AddrComputeFmaskInfo(addrlib, &fin, &fout); - if (r) - return r; - - surf->fmask_size = fout.fmaskBytes; - surf->fmask_alignment = fout.baseAlign; - surf->fmask_tile_swizzle = 0; - - surf->u.legacy.fmask.slice_tile_max = - (fout.pitch * fout.height) / 64; - if (surf->u.legacy.fmask.slice_tile_max) - surf->u.legacy.fmask.slice_tile_max -= 1; - - surf->u.legacy.fmask.tiling_index = fout.tileIndex; - surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight; - surf->u.legacy.fmask.pitch_in_pixels = fout.pitch; - surf->u.legacy.fmask.slice_size = fout.sliceSize; - - /* Compute tile swizzle for FMASK. */ - if (config->info.fmask_surf_index && - !(surf->flags & RADEON_SURF_SHAREABLE)) { - ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0}; - ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0}; - - xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT); - xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT); - - /* This counter starts from 1 instead of 0. */ - xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index); - xin.tileIndex = fout.tileIndex; - xin.macroModeIndex = fout.macroModeIndex; - xin.pTileInfo = fout.pTileInfo; - xin.tileMode = fin.tileMode; - - int r = AddrComputeBaseSwizzle(addrlib, &xin, &xout); - if (r != ADDR_OK) - return r; - - assert(xout.tileSwizzle <= - u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); - surf->fmask_tile_swizzle = xout.tileSwizzle; - } - } - - /* Recalculate the whole DCC miptree size including disabled levels. - * This is what addrlib does, but calling addrlib would be a lot more - * complicated. - */ - if (surf->dcc_size && config->info.levels > 1) { - /* The smallest miplevels that are never compressed by DCC - * still read the DCC buffer via TC if the base level uses DCC, - * and for some reason the DCC buffer needs to be larger if - * the miptree uses non-zero tile_swizzle. Otherwise there are - * VM faults. - * - * "dcc_alignment * 4" was determined by trial and error. - */ - surf->dcc_size = align64(surf->surf_size >> 8, - surf->dcc_alignment * 4); - } - - /* Make sure HTILE covers the whole miptree, because the shader reads - * TC-compatible HTILE even for levels where it's disabled by DB. - */ - if (surf->htile_size && config->info.levels > 1 && - surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) { - /* MSAA can't occur with levels > 1, so ignore the sample count. */ - const unsigned total_pixels = surf->surf_size / surf->bpe; - const unsigned htile_block_size = 8 * 8; - const unsigned htile_element_size = 4; - - surf->htile_size = (total_pixels / htile_block_size) * - htile_element_size; - surf->htile_size = align(surf->htile_size, surf->htile_alignment); - } else if (!surf->htile_size) { - /* Unset this if HTILE is not present. 
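/* A worked instance of the TC-compatible HTILE over-allocation above: one
 * 4-byte HTILE element per 8x8-pixel block over the whole miptree. The
 * surf_size, bpe and alignment values below are hypothetical.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t align64_pot(uint64_t value, uint64_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
   uint64_t surf_size = 1400000; /* bytes across all levels */
   unsigned bpe = 4;
   uint64_t total_pixels = surf_size / bpe;            /* 350000 */
   uint64_t htile_size = (total_pixels / (8 * 8)) * 4; /* 21872 */
   htile_size = align64_pot(htile_size, 2048);         /* 22528 */
   assert(htile_size == 22528);
   return 0;
}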
*/ - surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED; - surf->is_displayable = surf->is_linear || - surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY || - surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER; - - /* The rotated micro tile mode doesn't work if both CMASK and RB+ are - * used at the same time. This case is not currently expected to occur - * because we don't use rotated. Enforce this restriction on all chips - * to facilitate testing. - */ - if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) { - assert(!"rotate micro tile mode is unsupported"); - return ADDR_ERROR; - } - - ac_compute_cmask(info, config, surf); - return 0; + unsigned level; + bool compressed; + ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; + ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; + ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; + ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; + ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0}; + ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0}; + ADDR_TILEINFO AddrTileInfoIn = {0}; + ADDR_TILEINFO AddrTileInfoOut = {0}; + int r; + + AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); + AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); + AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); + AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); + AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT); + AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT); + AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; + + compressed = surf->blk_w == 4 && surf->blk_h == 4; + + /* MSAA requires 2D tiling. */ + if (config->info.samples > 1) + mode = RADEON_SURF_MODE_2D; + + /* DB doesn't support linear layouts. */ + if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && mode < RADEON_SURF_MODE_1D) + mode = RADEON_SURF_MODE_1D; + + /* Set the requested tiling mode. */ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; + break; + case RADEON_SURF_MODE_1D: + AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; + break; + case RADEON_SURF_MODE_2D: + AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1; + break; + default: + assert(0); + } + + /* The format must be set correctly for the allocation of compressed + * textures to work. In other cases, setting the bpp is sufficient. + */ + if (compressed) { + switch (surf->bpe) { + case 8: + AddrSurfInfoIn.format = ADDR_FMT_BC1; + break; + case 16: + AddrSurfInfoIn.format = ADDR_FMT_BC3; + break; + default: + assert(0); + } + } else { + AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; + } + + AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples); + AddrSurfInfoIn.tileIndex = -1; + + if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) { + AddrDccIn.numSamples = AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples); + } + + /* Set the micro tile type. 
*/ + if (surf->flags & RADEON_SURF_SCANOUT) + AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE; + else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) + AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER; + else + AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE; + + AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); + AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; + AddrSurfInfoIn.flags.cube = config->is_cube; + AddrSurfInfoIn.flags.display = get_display_flag(config, surf); + AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1; + AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0; + + /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been + * requested, because TC-compatible HTILE requires 2D tiling. + */ + AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible && + !AddrSurfInfoIn.flags.fmask && config->info.samples <= 1 && + !(surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE); + + /* DCC notes: + * - If we add MSAA support, keep in mind that CB can't decompress 8bpp + * with samples >= 4. + * - Mipmapped array textures have low performance (discovered by a closed + * driver team). + */ + AddrSurfInfoIn.flags.dccCompatible = + info->chip_class >= GFX8 && info->has_graphics && /* disable DCC on compute-only chips */ + !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && !(surf->flags & RADEON_SURF_DISABLE_DCC) && + !compressed && + ((config->info.array_size == 1 && config->info.depth == 1) || config->info.levels == 1); + + AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; + AddrSurfInfoIn.flags.compressZ = !!(surf->flags & RADEON_SURF_Z_OR_SBUFFER); + + /* On GFX7-GFX8, the DB uses the same pitch and tile mode (except tilesplit) + * for Z and stencil. This can cause a number of problems which we work + * around here: + * + * - a depth part that is incompatible with mipmapped texturing + * - at least on Stoney, entirely incompatible Z/S aspects (e.g. + * incorrect tiling applied to the stencil part, stencil buffer + * memory accesses that go out of bounds) even without mipmapping + * + * Some piglit tests that are prone to different types of related + * failures: + * ./bin/ext_framebuffer_multisample-upsample 2 stencil + * ./bin/framebuffer-blit-levels {draw,read} stencil + * ./bin/ext_framebuffer_multisample-unaligned-blit N {depth,stencil} {msaa,upsample,downsample} + * ./bin/fbo-depth-array fs-writes-{depth,stencil} / {depth,stencil}-{clear,layered-clear,draw} + * ./bin/depthstencil-render-miplevels 1024 d=s=z24_s8 + */ + int stencil_tile_idx = -1; + + if (AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.noStencil && + (config->info.levels > 1 || info->family == CHIP_STONEY)) { + /* Compute stencilTileIdx that is compatible with the (depth) + * tileIdx. This degrades the depth surface if necessary to + * ensure that a matching stencilTileIdx exists. */ + AddrSurfInfoIn.flags.matchStencilTileCfg = 1; + + /* Keep the depth mip-tail compatible with texturing. */ + AddrSurfInfoIn.flags.noStencil = 1; + } + + /* Set preferred macrotile parameters. This is usually required + * for shared resources. This is for 2D tiling only. */ + if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && surf->u.legacy.bankw && + surf->u.legacy.bankh && surf->u.legacy.mtilea && surf->u.legacy.tile_split) { + /* If any of these parameters are incorrect, the calculation + * will fail. 
*/ + AddrTileInfoIn.banks = surf->u.legacy.num_banks; + AddrTileInfoIn.bankWidth = surf->u.legacy.bankw; + AddrTileInfoIn.bankHeight = surf->u.legacy.bankh; + AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea; + AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split; + AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */ + AddrSurfInfoIn.flags.opt4Space = 0; + AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; + + /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set + * the tile index, because we are expected to know it if + * we know the other parameters. + * + * This is something that can easily be fixed in Addrlib. + * For now, just figure it out here. + * Note that only 2D_TILE_THIN1 is handled here. + */ + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); + + if (info->chip_class == GFX6) { + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { + if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ + else + AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ + } else { + if (surf->bpe == 1) + AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ + else if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ + else if (surf->bpe == 4) + AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ + else + AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ + } + } else { + /* GFX7 - GFX8 */ + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) + AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ + else + AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ + + /* Addrlib doesn't set this if tileIndex is forced like above. */ + AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf); + } + } + + surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER); + surf->num_dcc_levels = 0; + surf->surf_size = 0; + surf->dcc_size = 0; + surf->dcc_alignment = 1; + surf->htile_size = 0; + surf->htile_slice_size = 0; + surf->htile_alignment = 1; + + const bool only_stencil = + (surf->flags & RADEON_SURF_SBUFFER) && !(surf->flags & RADEON_SURF_ZBUFFER); + + /* Calculate texture layout information. */ + if (!only_stencil) { + for (level = 0; level < config->info.levels; level++) { + r = gfx6_compute_level(addrlib, config, surf, false, level, compressed, &AddrSurfInfoIn, + &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, &AddrHtileIn, + &AddrHtileOut); + if (r) + return r; + + if (level > 0) + continue; + + if (!AddrSurfInfoOut.tcCompatible) { + AddrSurfInfoIn.flags.tcCompatible = 0; + surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + if (AddrSurfInfoIn.flags.matchStencilTileCfg) { + AddrSurfInfoIn.flags.matchStencilTileCfg = 0; + AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex; + stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx; + + assert(stencil_tile_idx >= 0); + } + + r = gfx6_surface_settings(addrlib, info, config, &AddrSurfInfoOut, surf); + if (r) + return r; + } + } + + /* Calculate texture layout information for stencil. */ + if (surf->flags & RADEON_SURF_SBUFFER) { + AddrSurfInfoIn.tileIndex = stencil_tile_idx; + AddrSurfInfoIn.bpp = 8; + AddrSurfInfoIn.flags.depth = 0; + AddrSurfInfoIn.flags.stencil = 1; + AddrSurfInfoIn.flags.tcCompatible = 0; + /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. 
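For reference, the forced GFX6 tile indices chosen in the hunk above collapse to a small lookup. A sketch with a hypothetical helper name, indices copied from the switch above:

static int gfx6_forced_tile_index(int displayable, unsigned bpe)
{
   if (displayable)
      return bpe == 2 ? 11 : 12; /* 16bpp : 32bpp */
   if (bpe == 1)
      return 14;                 /* 8bpp */
   if (bpe == 2)
      return 15;                 /* 16bpp */
   if (bpe == 4)
      return 16;                 /* 32bpp */
   return 17;                    /* 64bpp (and 128bpp) */
}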
*/ + AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split; + + for (level = 0; level < config->info.levels; level++) { + r = gfx6_compute_level(addrlib, config, surf, true, level, compressed, &AddrSurfInfoIn, + &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, NULL, NULL); + if (r) + return r; + + /* DB uses the depth pitch for both stencil and depth. */ + if (!only_stencil) { + if (surf->u.legacy.stencil_level[level].nblk_x != surf->u.legacy.level[level].nblk_x) + surf->u.legacy.stencil_adjusted = true; + } else { + surf->u.legacy.level[level].nblk_x = surf->u.legacy.stencil_level[level].nblk_x; + } + + if (level == 0) { + if (only_stencil) { + r = gfx6_surface_settings(addrlib, info, config, &AddrSurfInfoOut, surf); + if (r) + return r; + } + + /* For 2D modes only. */ + if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->u.legacy.stencil_tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes; + } + } + } + } + + /* Compute FMASK. */ + if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color && info->has_graphics && + !(surf->flags & RADEON_SURF_NO_FMASK)) { + ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0}; + ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; + ADDR_TILEINFO fmask_tile_info = {}; + + fin.size = sizeof(fin); + fout.size = sizeof(fout); + + fin.tileMode = AddrSurfInfoOut.tileMode; + fin.pitch = AddrSurfInfoOut.pitch; + fin.height = config->info.height; + fin.numSlices = AddrSurfInfoIn.numSlices; + fin.numSamples = AddrSurfInfoIn.numSamples; + fin.numFrags = AddrSurfInfoIn.numFrags; + fin.tileIndex = -1; + fout.pTileInfo = &fmask_tile_info; + + r = AddrComputeFmaskInfo(addrlib, &fin, &fout); + if (r) + return r; + + surf->fmask_size = fout.fmaskBytes; + surf->fmask_alignment = fout.baseAlign; + surf->fmask_tile_swizzle = 0; + + surf->u.legacy.fmask.slice_tile_max = (fout.pitch * fout.height) / 64; + if (surf->u.legacy.fmask.slice_tile_max) + surf->u.legacy.fmask.slice_tile_max -= 1; + + surf->u.legacy.fmask.tiling_index = fout.tileIndex; + surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight; + surf->u.legacy.fmask.pitch_in_pixels = fout.pitch; + surf->u.legacy.fmask.slice_size = fout.sliceSize; + + /* Compute tile swizzle for FMASK. */ + if (config->info.fmask_surf_index && !(surf->flags & RADEON_SURF_SHAREABLE)) { + ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0}; + ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0}; + + xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT); + xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT); + + /* This counter starts from 1 instead of 0. */ + xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index); + xin.tileIndex = fout.tileIndex; + xin.macroModeIndex = fout.macroModeIndex; + xin.pTileInfo = fout.pTileInfo; + xin.tileMode = fin.tileMode; + + int r = AddrComputeBaseSwizzle(addrlib, &xin, &xout); + if (r != ADDR_OK) + return r; + + assert(xout.tileSwizzle <= u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); + surf->fmask_tile_swizzle = xout.tileSwizzle; + } + } + + /* Recalculate the whole DCC miptree size including disabled levels. + * This is what addrlib does, but calling addrlib would be a lot more + * complicated. + */ + if (surf->dcc_size && config->info.levels > 1) { + /* The smallest miplevels that are never compressed by DCC + * still read the DCC buffer via TC if the base level uses DCC, + * and for some reason the DCC buffer needs to be larger if + * the miptree uses non-zero tile_swizzle. Otherwise there are + * VM faults. + * + * "dcc_alignment * 4" was determined by trial and error. 
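Spelled out, the whole-miptree DCC size above is one metadata byte per 256 data bytes, padded to four times the DCC base alignment. A self-contained sketch, assuming align64 rounds up to a power-of-two boundary as in util:

#include <stdint.h>

static uint64_t align64(uint64_t v, uint64_t a) /* 'a' must be a power of two */
{
   return (v + a - 1) & ~(a - 1);
}

static uint64_t whole_miptree_dcc_size(uint64_t surf_size, uint64_t dcc_alignment)
{
   /* 1 DCC byte per 256 surface bytes; "dcc_alignment * 4" is empirical. */
   return align64(surf_size >> 8, dcc_alignment * 4);
}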
+ */ + surf->dcc_size = align64(surf->surf_size >> 8, surf->dcc_alignment * 4); + } + + /* Make sure HTILE covers the whole miptree, because the shader reads + * TC-compatible HTILE even for levels where it's disabled by DB. + */ + if (surf->htile_size && config->info.levels > 1 && + surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) { + /* MSAA can't occur with levels > 1, so ignore the sample count. */ + const unsigned total_pixels = surf->surf_size / surf->bpe; + const unsigned htile_block_size = 8 * 8; + const unsigned htile_element_size = 4; + + surf->htile_size = (total_pixels / htile_block_size) * htile_element_size; + surf->htile_size = align(surf->htile_size, surf->htile_alignment); + } else if (!surf->htile_size) { + /* Unset this if HTILE is not present. */ + surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED; + surf->is_displayable = surf->is_linear || surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY || + surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER; + + /* The rotated micro tile mode doesn't work if both CMASK and RB+ are + * used at the same time. This case is not currently expected to occur + * because we don't use rotated. Enforce this restriction on all chips + * to facilitate testing. + */ + if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) { + assert(!"rotate micro tile mode is unsupported"); + return ADDR_ERROR; + } + + ac_compute_cmask(info, config, surf); + return 0; } /* This is only called when expecting a tiled layout. */ -static int -gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib, - struct radeon_surf *surf, - ADDR2_COMPUTE_SURFACE_INFO_INPUT *in, - bool is_fmask, AddrSwizzleMode *swizzle_mode) +static int gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib, struct radeon_surf *surf, + ADDR2_COMPUTE_SURFACE_INFO_INPUT *in, bool is_fmask, + AddrSwizzleMode *swizzle_mode) { - ADDR_E_RETURNCODE ret; - ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0}; - ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0}; - - sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT); - sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT); - - sin.flags = in->flags; - sin.resourceType = in->resourceType; - sin.format = in->format; - sin.resourceLoction = ADDR_RSRC_LOC_INVIS; - /* TODO: We could allow some of these: */ - sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */ - sin.forbiddenBlock.var = 1; /* don't allow the variable-sized swizzle modes */ - sin.bpp = in->bpp; - sin.width = in->width; - sin.height = in->height; - sin.numSlices = in->numSlices; - sin.numMipLevels = in->numMipLevels; - sin.numSamples = in->numSamples; - sin.numFrags = in->numFrags; - - if (is_fmask) { - sin.flags.display = 0; - sin.flags.color = 0; - sin.flags.fmask = 1; - } - - if (surf->flags & RADEON_SURF_FORCE_MICRO_TILE_MODE) { - sin.forbiddenBlock.linear = 1; - - if (surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) - sin.preferredSwSet.sw_D = 1; - else if (surf->micro_tile_mode == RADEON_MICRO_MODE_STANDARD) - sin.preferredSwSet.sw_S = 1; - else if (surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH) - sin.preferredSwSet.sw_Z = 1; - else if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) - sin.preferredSwSet.sw_R = 1; - } - - ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout); - if (ret != ADDR_OK) - return ret; - - *swizzle_mode = sout.swizzleMode; - return 0; + ADDR_E_RETURNCODE ret; + ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0}; + 
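The TC-compatible HTILE sizing in the hunk above covers every level of the miptree: one 4-byte HTILE element per 8x8 pixel block, rounded up to the HTILE base alignment. A standalone sketch (power-of-two alignment assumed, hypothetical helper name):

#include <stdint.h>

static unsigned whole_miptree_htile_size(uint64_t surf_size, unsigned bpe,
                                         unsigned htile_alignment)
{
   const uint64_t total_pixels = surf_size / bpe; /* levels > 1 implies no MSAA */
   const unsigned htile_block_size = 8 * 8;
   const unsigned htile_element_size = 4;
   unsigned size = (total_pixels / htile_block_size) * htile_element_size;
   return (size + htile_alignment - 1) & ~(htile_alignment - 1);
}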
ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0}; + + sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT); + sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT); + + sin.flags = in->flags; + sin.resourceType = in->resourceType; + sin.format = in->format; + sin.resourceLoction = ADDR_RSRC_LOC_INVIS; + /* TODO: We could allow some of these: */ + sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */ + sin.forbiddenBlock.var = 1; /* don't allow the variable-sized swizzle modes */ + sin.bpp = in->bpp; + sin.width = in->width; + sin.height = in->height; + sin.numSlices = in->numSlices; + sin.numMipLevels = in->numMipLevels; + sin.numSamples = in->numSamples; + sin.numFrags = in->numFrags; + + if (is_fmask) { + sin.flags.display = 0; + sin.flags.color = 0; + sin.flags.fmask = 1; + } + + if (surf->flags & RADEON_SURF_FORCE_MICRO_TILE_MODE) { + sin.forbiddenBlock.linear = 1; + + if (surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY) + sin.preferredSwSet.sw_D = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_STANDARD) + sin.preferredSwSet.sw_S = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH) + sin.preferredSwSet.sw_Z = 1; + else if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) + sin.preferredSwSet.sw_R = 1; + } + + ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout); + if (ret != ADDR_OK) + return ret; + + *swizzle_mode = sout.swizzleMode; + return 0; } static bool is_dcc_supported_by_CB(const struct radeon_info *info, unsigned sw_mode) { - if (info->chip_class >= GFX10) - return sw_mode == ADDR_SW_64KB_Z_X || sw_mode == ADDR_SW_64KB_R_X; + if (info->chip_class >= GFX10) + return sw_mode == ADDR_SW_64KB_Z_X || sw_mode == ADDR_SW_64KB_R_X; - return sw_mode != ADDR_SW_LINEAR; + return sw_mode != ADDR_SW_LINEAR; } ASSERTED static bool is_dcc_supported_by_L2(const struct radeon_info *info, - const struct radeon_surf *surf) + const struct radeon_surf *surf) { - if (info->chip_class <= GFX9) { - /* Only independent 64B blocks are supported. */ - return surf->u.gfx9.dcc.independent_64B_blocks && - !surf->u.gfx9.dcc.independent_128B_blocks && - surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B; - } - - if (info->family == CHIP_NAVI10) { - /* Only independent 128B blocks are supported. */ - return !surf->u.gfx9.dcc.independent_64B_blocks && - surf->u.gfx9.dcc.independent_128B_blocks && - surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B; - } - - if (info->family == CHIP_NAVI12 || - info->family == CHIP_NAVI14) { - /* Either 64B or 128B can be used, but not both. - * If 64B is used, DCC image stores are unsupported. - */ - return surf->u.gfx9.dcc.independent_64B_blocks != - surf->u.gfx9.dcc.independent_128B_blocks && - (!surf->u.gfx9.dcc.independent_64B_blocks || - surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B) && - (!surf->u.gfx9.dcc.independent_128B_blocks || - surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B); - } - - /* 128B is recommended, but 64B can be set too if needed for 4K by DCN. - * Since there is no reason to ever disable 128B, require it. - * DCC image stores are always supported. - */ - return surf->u.gfx9.dcc.independent_128B_blocks && - surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B; + if (info->chip_class <= GFX9) { + /* Only independent 64B blocks are supported. 
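That GFX9 L2 rule reduces to a one-line predicate. A sketch carrying the max block size in bytes rather than as the V_028C78_* enum (which encodes 64B here):

static int gfx9_l2_supports_dcc(int indep_64B, int indep_128B,
                                unsigned max_compressed_block_bytes)
{
   /* GFX9 L2 can only decompress independent 64B blocks. */
   return indep_64B && !indep_128B && max_compressed_block_bytes == 64;
}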
*/ + return surf->u.gfx9.dcc.independent_64B_blocks && !surf->u.gfx9.dcc.independent_128B_blocks && + surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B; + } + + if (info->family == CHIP_NAVI10) { + /* Only independent 128B blocks are supported. */ + return !surf->u.gfx9.dcc.independent_64B_blocks && surf->u.gfx9.dcc.independent_128B_blocks && + surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B; + } + + if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14) { + /* Either 64B or 128B can be used, but not both. + * If 64B is used, DCC image stores are unsupported. + */ + return surf->u.gfx9.dcc.independent_64B_blocks != surf->u.gfx9.dcc.independent_128B_blocks && + (!surf->u.gfx9.dcc.independent_64B_blocks || + surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B) && + (!surf->u.gfx9.dcc.independent_128B_blocks || + surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B); + } + + /* 128B is recommended, but 64B can be set too if needed for 4K by DCN. + * Since there is no reason to ever disable 128B, require it. + * DCC image stores are always supported. + */ + return surf->u.gfx9.dcc.independent_128B_blocks && + surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B; } static bool is_dcc_supported_by_DCN(const struct radeon_info *info, - const struct ac_surf_config *config, - const struct radeon_surf *surf, - bool rb_aligned, bool pipe_aligned) + const struct ac_surf_config *config, + const struct radeon_surf *surf, bool rb_aligned, + bool pipe_aligned) { - if (!info->use_display_dcc_unaligned && - !info->use_display_dcc_with_retile_blit) - return false; - - /* 16bpp and 64bpp are more complicated, so they are disallowed for now. */ - if (surf->bpe != 4) - return false; - - /* Handle unaligned DCC. */ - if (info->use_display_dcc_unaligned && - (rb_aligned || pipe_aligned)) - return false; - - switch (info->chip_class) { - case GFX9: - /* There are more constraints, but we always set - * INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B, - * which always works. - */ - assert(surf->u.gfx9.dcc.independent_64B_blocks && - surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B); - return true; - case GFX10: - case GFX10_3: - /* DCN requires INDEPENDENT_128B_BLOCKS = 0 only on Navi1x. */ - if (info->chip_class == GFX10 && - surf->u.gfx9.dcc.independent_128B_blocks) - return false; - - /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1. */ - return ((config->info.width <= 2560 && - config->info.height <= 2560) || - (surf->u.gfx9.dcc.independent_64B_blocks && - surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B)); - default: - unreachable("unhandled chip"); - return false; - } + if (!info->use_display_dcc_unaligned && !info->use_display_dcc_with_retile_blit) + return false; + + /* 16bpp and 64bpp are more complicated, so they are disallowed for now. */ + if (surf->bpe != 4) + return false; + + /* Handle unaligned DCC. */ + if (info->use_display_dcc_unaligned && (rb_aligned || pipe_aligned)) + return false; + + switch (info->chip_class) { + case GFX9: + /* There are more constraints, but we always set + * INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B, + * which always works. + */ + assert(surf->u.gfx9.dcc.independent_64B_blocks && + surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B); + return true; + case GFX10: + case GFX10_3: + /* DCN requires INDEPENDENT_128B_BLOCKS = 0 only on Navi1x. 
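The 4K rule below has a simple shape: up to 2560x2560, DCN accepts the defaults; above that it insists on independent 64B blocks. A sketch of just the threshold check, with a hypothetical helper name:

static int dcn_requires_independent_64B_blocks(unsigned width, unsigned height)
{
   /* For 4K scanout, DCN requires INDEPENDENT_64B_BLOCKS = 1. */
   return width > 2560 || height > 2560;
}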
*/ + if (info->chip_class == GFX10 && surf->u.gfx9.dcc.independent_128B_blocks) + return false; + + /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1. */ + return ((config->info.width <= 2560 && config->info.height <= 2560) || + (surf->u.gfx9.dcc.independent_64B_blocks && + surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B)); + default: + unreachable("unhandled chip"); + return false; + } } -static int gfx9_compute_miptree(struct ac_addrlib *addrlib, - const struct radeon_info *info, - const struct ac_surf_config *config, - struct radeon_surf *surf, bool compressed, - ADDR2_COMPUTE_SURFACE_INFO_INPUT *in) +static int gfx9_compute_miptree(struct ac_addrlib *addrlib, const struct radeon_info *info, + const struct ac_surf_config *config, struct radeon_surf *surf, + bool compressed, ADDR2_COMPUTE_SURFACE_INFO_INPUT *in) { - ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {}; - ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; - ADDR_E_RETURNCODE ret; - - out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT); - out.pMipInfo = mip_info; - - ret = Addr2ComputeSurfaceInfo(addrlib->handle, in, &out); - if (ret != ADDR_OK) - return ret; - - if (in->flags.stencil) { - surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode; - surf->u.gfx9.stencil.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 : - out.mipChainPitch - 1; - surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign); - surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign); - surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize; - return 0; - } - - surf->u.gfx9.surf.swizzle_mode = in->swizzleMode; - surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 : - out.mipChainPitch - 1; - - /* CMASK fast clear uses these even if FMASK isn't allocated. - * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4. - */ - surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3; - surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch; - - surf->u.gfx9.surf_slice_size = out.sliceSize; - surf->u.gfx9.surf_pitch = out.pitch; - surf->u.gfx9.surf_height = out.height; - surf->surf_size = out.surfSize; - surf->surf_alignment = out.baseAlign; - - if (!compressed && surf->blk_w > 1 && out.pitch == out.pixelPitch && - surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR) { - /* Adjust surf_pitch to be in elements units not in pixels */ - surf->u.gfx9.surf_pitch = - align(surf->u.gfx9.surf_pitch / surf->blk_w, 256 / surf->bpe); - surf->u.gfx9.surf.epitch = MAX2(surf->u.gfx9.surf.epitch, - surf->u.gfx9.surf_pitch * surf->blk_w - 1); - /* The surface is really a surf->bpe bytes per pixel surface even if we - * use it as a surf->bpe bytes per element one. - * Adjust surf_slice_size and surf_size to reflect the change - * made to surf_pitch. 
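The pitch conversion above turns addrlib's pixel pitch into elements for linear surfaces with blk_w > 1, while keeping rows 256-byte aligned. Sketched standalone (assumes bpe is a power of two dividing 256, as for the formats this path handles):

static unsigned linear_pitch_in_elements(unsigned pitch_in_pixels,
                                         unsigned blk_w, unsigned bpe)
{
   unsigned pitch = pitch_in_pixels / blk_w; /* pixels -> elements */
   unsigned alignment = 256 / bpe;           /* keep rows 256B aligned */
   return (pitch + alignment - 1) / alignment * alignment;
}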
- */ - surf->u.gfx9.surf_slice_size = MAX2( - surf->u.gfx9.surf_slice_size, - surf->u.gfx9.surf_pitch * out.height * surf->bpe * surf->blk_w); - surf->surf_size = surf->u.gfx9.surf_slice_size * in->numSlices; - } - - if (in->swizzleMode == ADDR_SW_LINEAR) { - for (unsigned i = 0; i < in->numMipLevels; i++) { - surf->u.gfx9.offset[i] = mip_info[i].offset; - surf->u.gfx9.pitch[i] = mip_info[i].pitch; - } - } - - if (in->flags.depth) { - assert(in->swizzleMode != ADDR_SW_LINEAR); - - if (surf->flags & RADEON_SURF_NO_HTILE) - return 0; - - /* HTILE */ - ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0}; - ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0}; - - hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT); - hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT); - - assert(in->flags.metaPipeUnaligned == 0); - assert(in->flags.metaRbUnaligned == 0); - - hin.hTileFlags.pipeAligned = 1; - hin.hTileFlags.rbAligned = 1; - hin.depthFlags = in->flags; - hin.swizzleMode = in->swizzleMode; - hin.unalignedWidth = in->width; - hin.unalignedHeight = in->height; - hin.numSlices = in->numSlices; - hin.numMipLevels = in->numMipLevels; - hin.firstMipIdInTail = out.firstMipIdInTail; - - ret = Addr2ComputeHtileInfo(addrlib->handle, &hin, &hout); - if (ret != ADDR_OK) - return ret; - - surf->htile_size = hout.htileBytes; - surf->htile_slice_size = hout.sliceSize; - surf->htile_alignment = hout.baseAlign; - return 0; - } - - { - /* Compute tile swizzle for the color surface. - * All *_X and *_T modes can use the swizzle. - */ - if (config->info.surf_index && - in->swizzleMode >= ADDR_SW_64KB_Z_T && - !out.mipChainInTail && - !(surf->flags & RADEON_SURF_SHAREABLE) && - !in->flags.display) { - ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0}; - ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0}; - - xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT); - xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT); - - xin.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1; - xin.flags = in->flags; - xin.swizzleMode = in->swizzleMode; - xin.resourceType = in->resourceType; - xin.format = in->format; - xin.numSamples = in->numSamples; - xin.numFrags = in->numFrags; - - ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout); - if (ret != ADDR_OK) - return ret; - - assert(xout.pipeBankXor <= - u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); - surf->tile_swizzle = xout.pipeBankXor; - } - - /* DCC */ - if (info->has_graphics && - !(surf->flags & RADEON_SURF_DISABLE_DCC) && - !compressed && - is_dcc_supported_by_CB(info, in->swizzleMode) && - (!in->flags.display || - is_dcc_supported_by_DCN(info, config, surf, - !in->flags.metaRbUnaligned, - !in->flags.metaPipeUnaligned))) { - ADDR2_COMPUTE_DCCINFO_INPUT din = {0}; - ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0}; - ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {}; - - din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT); - dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT); - dout.pMipInfo = meta_mip_info; - - din.dccKeyFlags.pipeAligned = !in->flags.metaPipeUnaligned; - din.dccKeyFlags.rbAligned = !in->flags.metaRbUnaligned; - din.resourceType = in->resourceType; - din.swizzleMode = in->swizzleMode; - din.bpp = in->bpp; - din.unalignedWidth = in->width; - din.unalignedHeight = in->height; - din.numSlices = in->numSlices; - din.numFrags = in->numFrags; - din.numMipLevels = in->numMipLevels; - din.dataSurfaceSize = out.surfSize; - din.firstMipIdInTail = out.firstMipIdInTail; - - ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); - if (ret != ADDR_OK) - return ret; - - 
surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned; - surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned; - surf->u.gfx9.dcc_block_width = dout.compressBlkWidth; - surf->u.gfx9.dcc_block_height = dout.compressBlkHeight; - surf->u.gfx9.dcc_block_depth = dout.compressBlkDepth; - surf->dcc_size = dout.dccRamSize; - surf->dcc_alignment = dout.dccRamBaseAlign; - surf->num_dcc_levels = in->numMipLevels; - - /* Disable DCC for levels that are in the mip tail. - * - * There are two issues that this is intended to - * address: - * - * 1. Multiple mip levels may share a cache line. This - * can lead to corruption when switching between - * rendering to different mip levels because the - * RBs don't maintain coherency. - * - * 2. Texturing with metadata after rendering sometimes - * fails with corruption, probably for a similar - * reason. - * - * Working around these issues for all levels in the - * mip tail may be overly conservative, but it's what - * Vulkan does. - * - * Alternative solutions that also work but are worse: - * - Disable DCC entirely. - * - Flush TC L2 after rendering. - */ - for (unsigned i = 0; i < in->numMipLevels; i++) { - if (meta_mip_info[i].inMiptail) { - /* GFX10 can only compress the first level - * in the mip tail. - * - * TODO: Try to do the same thing for gfx9 - * if there are no regressions. - */ - if (info->chip_class >= GFX10) - surf->num_dcc_levels = i + 1; - else - surf->num_dcc_levels = i; - break; - } - } - - if (!surf->num_dcc_levels) - surf->dcc_size = 0; - - surf->u.gfx9.display_dcc_size = surf->dcc_size; - surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment; - surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1; - - /* Compute displayable DCC. */ - if (in->flags.display && - surf->num_dcc_levels && - info->use_display_dcc_with_retile_blit) { - /* Compute displayable DCC info. */ - din.dccKeyFlags.pipeAligned = 0; - din.dccKeyFlags.rbAligned = 0; - - assert(din.numSlices == 1); - assert(din.numMipLevels == 1); - assert(din.numFrags == 1); - assert(surf->tile_swizzle == 0); - assert(surf->u.gfx9.dcc.pipe_aligned || - surf->u.gfx9.dcc.rb_aligned); - - ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); - if (ret != ADDR_OK) - return ret; - - surf->u.gfx9.display_dcc_size = dout.dccRamSize; - surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign; - surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1; - assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size); - - surf->u.gfx9.dcc_retile_use_uint16 = - surf->u.gfx9.display_dcc_size <= UINT16_MAX + 1 && - surf->dcc_size <= UINT16_MAX + 1; - - /* Align the retile map size to get more hash table hits and - * decrease the maximum memory footprint when all retile maps - * are cached in the hash table. - */ - unsigned retile_dim[2] = {in->width, in->height}; - - for (unsigned i = 0; i < 2; i++) { - /* Increase the alignment as the size increases. - * Greater alignment increases retile compute work, - * but decreases maximum memory footprint for the cache. 
- * - * With this alignment, the worst case memory footprint of - * the cache is: - * 1920x1080: 55 MB - * 2560x1440: 99 MB - * 3840x2160: 305 MB - * - * The worst case size in MB can be computed in Haskell as follows: - * (sum (map get_retile_size (map get_dcc_size (deduplicate (map align_pair - * [(i*16,j*16) | i <- [1..maxwidth`div`16], j <- [1..maxheight`div`16]]))))) `div` 1024^2 - * where - * alignment x = if x <= 512 then 16 else if x <= 1024 then 32 else if x <= 2048 then 64 else 128 - * align x = (x + (alignment x) - 1) `div` (alignment x) * (alignment x) - * align_pair e = (align (fst e), align (snd e)) - * deduplicate = map head . groupBy (\ a b -> ((fst a) == (fst b)) && ((snd a) == (snd b))) . sortBy compare - * get_dcc_size e = ((fst e) * (snd e) * bpp) `div` 256 - * get_retile_size dcc_size = dcc_size * 2 * (if dcc_size <= 2^16 then 2 else 4) - * bpp = 4; maxwidth = 3840; maxheight = 2160 - */ - if (retile_dim[i] <= 512) - retile_dim[i] = align(retile_dim[i], 16); - else if (retile_dim[i] <= 1024) - retile_dim[i] = align(retile_dim[i], 32); - else if (retile_dim[i] <= 2048) - retile_dim[i] = align(retile_dim[i], 64); - else - retile_dim[i] = align(retile_dim[i], 128); - - /* Don't align more than the DCC pixel alignment. */ - assert(dout.metaBlkWidth >= 128 && dout.metaBlkHeight >= 128); - } - - surf->u.gfx9.dcc_retile_num_elements = - DIV_ROUND_UP(retile_dim[0], dout.compressBlkWidth) * - DIV_ROUND_UP(retile_dim[1], dout.compressBlkHeight) * 2; - /* Align the size to 4 (for the compute shader). */ - surf->u.gfx9.dcc_retile_num_elements = - align(surf->u.gfx9.dcc_retile_num_elements, 4); - - if (!(surf->flags & RADEON_SURF_IMPORTED)) { - /* Compute address mapping from non-displayable to displayable DCC. */ - ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin; - memset(&addrin, 0, sizeof(addrin)); - addrin.size = sizeof(addrin); - addrin.swizzleMode = din.swizzleMode; - addrin.resourceType = din.resourceType; - addrin.bpp = din.bpp; - addrin.numSlices = 1; - addrin.numMipLevels = 1; - addrin.numFrags = 1; - addrin.pitch = dout.pitch; - addrin.height = dout.height; - addrin.compressBlkWidth = dout.compressBlkWidth; - addrin.compressBlkHeight = dout.compressBlkHeight; - addrin.compressBlkDepth = dout.compressBlkDepth; - addrin.metaBlkWidth = dout.metaBlkWidth; - addrin.metaBlkHeight = dout.metaBlkHeight; - addrin.metaBlkDepth = dout.metaBlkDepth; - addrin.dccRamSliceSize = 0; /* Don't care for non-layered images. 
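The retile-map dimension alignment described above (and repeated on the '+' side further down) picks a coarser step as the surface grows, matching the `alignment` function in the Haskell comment. Standalone:

static unsigned align_retile_dim(unsigned dim)
{
   unsigned step = dim <= 512 ? 16 : dim <= 1024 ? 32 : dim <= 2048 ? 64 : 128;
   return (dim + step - 1) / step * step;
}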
*/ - - surf->u.gfx9.dcc_retile_map = - ac_compute_dcc_retile_map(addrlib, info, - retile_dim[0], retile_dim[1], - surf->u.gfx9.dcc.rb_aligned, - surf->u.gfx9.dcc.pipe_aligned, - surf->u.gfx9.dcc_retile_use_uint16, - surf->u.gfx9.dcc_retile_num_elements, - &addrin); - if (!surf->u.gfx9.dcc_retile_map) - return ADDR_OUTOFMEMORY; - } - } - } - - /* FMASK */ - if (in->numSamples > 1 && info->has_graphics && - !(surf->flags & RADEON_SURF_NO_FMASK)) { - ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0}; - ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; - - fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT); - fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT); - - ret = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, in, - true, &fin.swizzleMode); - if (ret != ADDR_OK) - return ret; - - fin.unalignedWidth = in->width; - fin.unalignedHeight = in->height; - fin.numSlices = in->numSlices; - fin.numSamples = in->numSamples; - fin.numFrags = in->numFrags; - - ret = Addr2ComputeFmaskInfo(addrlib->handle, &fin, &fout); - if (ret != ADDR_OK) - return ret; - - surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode; - surf->u.gfx9.fmask.epitch = fout.pitch - 1; - surf->fmask_size = fout.fmaskBytes; - surf->fmask_alignment = fout.baseAlign; - - /* Compute tile swizzle for the FMASK surface. */ - if (config->info.fmask_surf_index && - fin.swizzleMode >= ADDR_SW_64KB_Z_T && - !(surf->flags & RADEON_SURF_SHAREABLE)) { - ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0}; - ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0}; - - xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT); - xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT); - - /* This counter starts from 1 instead of 0. */ - xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index); - xin.flags = in->flags; - xin.swizzleMode = fin.swizzleMode; - xin.resourceType = in->resourceType; - xin.format = in->format; - xin.numSamples = in->numSamples; - xin.numFrags = in->numFrags; - - ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout); - if (ret != ADDR_OK) - return ret; - - assert(xout.pipeBankXor <= - u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8)); - surf->fmask_tile_swizzle = xout.pipeBankXor; - } - } - - /* CMASK -- on GFX10 only for FMASK */ - if (in->swizzleMode != ADDR_SW_LINEAR && - in->resourceType == ADDR_RSRC_TEX_2D && - ((info->chip_class <= GFX9 && - in->numSamples == 1 && - in->flags.metaPipeUnaligned == 0 && - in->flags.metaRbUnaligned == 0) || - (surf->fmask_size && in->numSamples >= 2))) { - ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0}; - ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0}; - - cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT); - cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT); - - assert(in->flags.metaPipeUnaligned == 0); - assert(in->flags.metaRbUnaligned == 0); - - cin.cMaskFlags.pipeAligned = 1; - cin.cMaskFlags.rbAligned = 1; - cin.resourceType = in->resourceType; - cin.unalignedWidth = in->width; - cin.unalignedHeight = in->height; - cin.numSlices = in->numSlices; - - if (in->numSamples > 1) - cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode; - else - cin.swizzleMode = in->swizzleMode; - - ret = Addr2ComputeCmaskInfo(addrlib->handle, &cin, &cout); - if (ret != ADDR_OK) - return ret; - - surf->cmask_size = cout.cmaskBytes; - surf->cmask_alignment = cout.baseAlign; - } - } - - return 0; + ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {}; + ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0}; + ADDR_E_RETURNCODE ret; + + out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT); + out.pMipInfo = mip_info; + + ret = 
Addr2ComputeSurfaceInfo(addrlib->handle, in, &out); + if (ret != ADDR_OK) + return ret; + + if (in->flags.stencil) { + surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode; + surf->u.gfx9.stencil.epitch = + out.epitchIsHeight ? out.mipChainHeight - 1 : out.mipChainPitch - 1; + surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign); + surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign); + surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize; + return 0; + } + + surf->u.gfx9.surf.swizzle_mode = in->swizzleMode; + surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 : out.mipChainPitch - 1; + + /* CMASK fast clear uses these even if FMASK isn't allocated. + * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4. + */ + surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3; + surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch; + + surf->u.gfx9.surf_slice_size = out.sliceSize; + surf->u.gfx9.surf_pitch = out.pitch; + surf->u.gfx9.surf_height = out.height; + surf->surf_size = out.surfSize; + surf->surf_alignment = out.baseAlign; + + if (!compressed && surf->blk_w > 1 && out.pitch == out.pixelPitch && + surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR) { + /* Adjust surf_pitch to be in elements units not in pixels */ + surf->u.gfx9.surf_pitch = align(surf->u.gfx9.surf_pitch / surf->blk_w, 256 / surf->bpe); + surf->u.gfx9.surf.epitch = + MAX2(surf->u.gfx9.surf.epitch, surf->u.gfx9.surf_pitch * surf->blk_w - 1); + /* The surface is really a surf->bpe bytes per pixel surface even if we + * use it as a surf->bpe bytes per element one. + * Adjust surf_slice_size and surf_size to reflect the change + * made to surf_pitch. + */ + surf->u.gfx9.surf_slice_size = + MAX2(surf->u.gfx9.surf_slice_size, + surf->u.gfx9.surf_pitch * out.height * surf->bpe * surf->blk_w); + surf->surf_size = surf->u.gfx9.surf_slice_size * in->numSlices; + } + + if (in->swizzleMode == ADDR_SW_LINEAR) { + for (unsigned i = 0; i < in->numMipLevels; i++) { + surf->u.gfx9.offset[i] = mip_info[i].offset; + surf->u.gfx9.pitch[i] = mip_info[i].pitch; + } + } + + if (in->flags.depth) { + assert(in->swizzleMode != ADDR_SW_LINEAR); + + if (surf->flags & RADEON_SURF_NO_HTILE) + return 0; + + /* HTILE */ + ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0}; + ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0}; + + hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT); + hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT); + + assert(in->flags.metaPipeUnaligned == 0); + assert(in->flags.metaRbUnaligned == 0); + + hin.hTileFlags.pipeAligned = 1; + hin.hTileFlags.rbAligned = 1; + hin.depthFlags = in->flags; + hin.swizzleMode = in->swizzleMode; + hin.unalignedWidth = in->width; + hin.unalignedHeight = in->height; + hin.numSlices = in->numSlices; + hin.numMipLevels = in->numMipLevels; + hin.firstMipIdInTail = out.firstMipIdInTail; + + ret = Addr2ComputeHtileInfo(addrlib->handle, &hin, &hout); + if (ret != ADDR_OK) + return ret; + + surf->htile_size = hout.htileBytes; + surf->htile_slice_size = hout.sliceSize; + surf->htile_alignment = hout.baseAlign; + return 0; + } + + { + /* Compute tile swizzle for the color surface. + * All *_X and *_T modes can use the swizzle. 
+ */ + if (config->info.surf_index && in->swizzleMode >= ADDR_SW_64KB_Z_T && !out.mipChainInTail && + !(surf->flags & RADEON_SURF_SHAREABLE) && !in->flags.display) { + ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0}; + ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0}; + + xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT); + xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT); + + xin.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1; + xin.flags = in->flags; + xin.swizzleMode = in->swizzleMode; + xin.resourceType = in->resourceType; + xin.format = in->format; + xin.numSamples = in->numSamples; + xin.numFrags = in->numFrags; + + ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout); + if (ret != ADDR_OK) + return ret; + + assert(xout.pipeBankXor <= u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8)); + surf->tile_swizzle = xout.pipeBankXor; + } + + /* DCC */ + if (info->has_graphics && !(surf->flags & RADEON_SURF_DISABLE_DCC) && !compressed && + is_dcc_supported_by_CB(info, in->swizzleMode) && + (!in->flags.display || + is_dcc_supported_by_DCN(info, config, surf, !in->flags.metaRbUnaligned, + !in->flags.metaPipeUnaligned))) { + ADDR2_COMPUTE_DCCINFO_INPUT din = {0}; + ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0}; + ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {}; + + din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT); + dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT); + dout.pMipInfo = meta_mip_info; + + din.dccKeyFlags.pipeAligned = !in->flags.metaPipeUnaligned; + din.dccKeyFlags.rbAligned = !in->flags.metaRbUnaligned; + din.resourceType = in->resourceType; + din.swizzleMode = in->swizzleMode; + din.bpp = in->bpp; + din.unalignedWidth = in->width; + din.unalignedHeight = in->height; + din.numSlices = in->numSlices; + din.numFrags = in->numFrags; + din.numMipLevels = in->numMipLevels; + din.dataSurfaceSize = out.surfSize; + din.firstMipIdInTail = out.firstMipIdInTail; + + ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); + if (ret != ADDR_OK) + return ret; + + surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned; + surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned; + surf->u.gfx9.dcc_block_width = dout.compressBlkWidth; + surf->u.gfx9.dcc_block_height = dout.compressBlkHeight; + surf->u.gfx9.dcc_block_depth = dout.compressBlkDepth; + surf->dcc_size = dout.dccRamSize; + surf->dcc_alignment = dout.dccRamBaseAlign; + surf->num_dcc_levels = in->numMipLevels; + + /* Disable DCC for levels that are in the mip tail. + * + * There are two issues that this is intended to + * address: + * + * 1. Multiple mip levels may share a cache line. This + * can lead to corruption when switching between + * rendering to different mip levels because the + * RBs don't maintain coherency. + * + * 2. Texturing with metadata after rendering sometimes + * fails with corruption, probably for a similar + * reason. + * + * Working around these issues for all levels in the + * mip tail may be overly conservative, but it's what + * Vulkan does. + * + * Alternative solutions that also work but are worse: + * - Disable DCC entirely. + * - Flush TC L2 after rendering. + */ + for (unsigned i = 0; i < in->numMipLevels; i++) { + if (meta_mip_info[i].inMiptail) { + /* GFX10 can only compress the first level + * in the mip tail. + * + * TODO: Try to do the same thing for gfx9 + * if there are no regressions. 
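The mip-tail walk here stops DCC at the first level inside the tail, keeping that level itself only on GFX10 and newer. A sketch with the per-level tail flags passed as a plain array:

static unsigned dcc_levels_before_miptail(const int *in_miptail,
                                          unsigned num_levels, int is_gfx10_plus)
{
   for (unsigned i = 0; i < num_levels; i++) {
      if (in_miptail[i])
         return is_gfx10_plus ? i + 1 : i; /* GFX10 compresses the first tail level */
   }
   return num_levels;
}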
+ */ + if (info->chip_class >= GFX10) + surf->num_dcc_levels = i + 1; + else + surf->num_dcc_levels = i; + break; + } + } + + if (!surf->num_dcc_levels) + surf->dcc_size = 0; + + surf->u.gfx9.display_dcc_size = surf->dcc_size; + surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment; + surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1; + + /* Compute displayable DCC. */ + if (in->flags.display && surf->num_dcc_levels && info->use_display_dcc_with_retile_blit) { + /* Compute displayable DCC info. */ + din.dccKeyFlags.pipeAligned = 0; + din.dccKeyFlags.rbAligned = 0; + + assert(din.numSlices == 1); + assert(din.numMipLevels == 1); + assert(din.numFrags == 1); + assert(surf->tile_swizzle == 0); + assert(surf->u.gfx9.dcc.pipe_aligned || surf->u.gfx9.dcc.rb_aligned); + + ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout); + if (ret != ADDR_OK) + return ret; + + surf->u.gfx9.display_dcc_size = dout.dccRamSize; + surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign; + surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1; + assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size); + + surf->u.gfx9.dcc_retile_use_uint16 = + surf->u.gfx9.display_dcc_size <= UINT16_MAX + 1 && surf->dcc_size <= UINT16_MAX + 1; + + /* Align the retile map size to get more hash table hits and + * decrease the maximum memory footprint when all retile maps + * are cached in the hash table. + */ + unsigned retile_dim[2] = {in->width, in->height}; + + for (unsigned i = 0; i < 2; i++) { + /* Increase the alignment as the size increases. + * Greater alignment increases retile compute work, + * but decreases maximum memory footprint for the cache. + * + * With this alignment, the worst case memory footprint of + * the cache is: + * 1920x1080: 55 MB + * 2560x1440: 99 MB + * 3840x2160: 305 MB + * + * The worst case size in MB can be computed in Haskell as follows: + * (sum (map get_retile_size (map get_dcc_size (deduplicate (map align_pair + * [(i*16,j*16) | i <- [1..maxwidth`div`16], j <- [1..maxheight`div`16]]))))) + * `div` 1024^2 where alignment x = if x <= 512 then 16 else if x <= 1024 then 32 + * else if x <= 2048 then 64 else 128 align x = (x + (alignment x) - 1) `div` + * (alignment x) * (alignment x) align_pair e = (align (fst e), align (snd e)) + * deduplicate = map head . groupBy (\ a b -> ((fst a) == (fst b)) && ((snd a) + * == (snd b))) . sortBy compare get_dcc_size e = ((fst e) * (snd e) * bpp) `div` 256 + * get_retile_size dcc_size = dcc_size * 2 * (if dcc_size <= 2^16 then 2 else + * 4) bpp = 4; maxwidth = 3840; maxheight = 2160 + */ + if (retile_dim[i] <= 512) + retile_dim[i] = align(retile_dim[i], 16); + else if (retile_dim[i] <= 1024) + retile_dim[i] = align(retile_dim[i], 32); + else if (retile_dim[i] <= 2048) + retile_dim[i] = align(retile_dim[i], 64); + else + retile_dim[i] = align(retile_dim[i], 128); + + /* Don't align more than the DCC pixel alignment. */ + assert(dout.metaBlkWidth >= 128 && dout.metaBlkHeight >= 128); + } + + surf->u.gfx9.dcc_retile_num_elements = + DIV_ROUND_UP(retile_dim[0], dout.compressBlkWidth) * + DIV_ROUND_UP(retile_dim[1], dout.compressBlkHeight) * 2; + /* Align the size to 4 (for the compute shader). */ + surf->u.gfx9.dcc_retile_num_elements = align(surf->u.gfx9.dcc_retile_num_elements, 4); + + if (!(surf->flags & RADEON_SURF_IMPORTED)) { + /* Compute address mapping from non-displayable to displayable DCC. 
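The retile map sized above holds two entries per DCC compress block (the non-displayable and displayable addresses), padded to a multiple of 4 for the compute shader. Standalone:

static unsigned dcc_retile_num_elements(unsigned width, unsigned height,
                                        unsigned blk_w, unsigned blk_h)
{
   unsigned elems = ((width + blk_w - 1) / blk_w) *  /* DIV_ROUND_UP */
                    ((height + blk_h - 1) / blk_h) * 2;
   return (elems + 3) & ~3u;                         /* align to 4 */
}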
*/ + ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin; + memset(&addrin, 0, sizeof(addrin)); + addrin.size = sizeof(addrin); + addrin.swizzleMode = din.swizzleMode; + addrin.resourceType = din.resourceType; + addrin.bpp = din.bpp; + addrin.numSlices = 1; + addrin.numMipLevels = 1; + addrin.numFrags = 1; + addrin.pitch = dout.pitch; + addrin.height = dout.height; + addrin.compressBlkWidth = dout.compressBlkWidth; + addrin.compressBlkHeight = dout.compressBlkHeight; + addrin.compressBlkDepth = dout.compressBlkDepth; + addrin.metaBlkWidth = dout.metaBlkWidth; + addrin.metaBlkHeight = dout.metaBlkHeight; + addrin.metaBlkDepth = dout.metaBlkDepth; + addrin.dccRamSliceSize = 0; /* Don't care for non-layered images. */ + + surf->u.gfx9.dcc_retile_map = ac_compute_dcc_retile_map( + addrlib, info, retile_dim[0], retile_dim[1], surf->u.gfx9.dcc.rb_aligned, + surf->u.gfx9.dcc.pipe_aligned, surf->u.gfx9.dcc_retile_use_uint16, + surf->u.gfx9.dcc_retile_num_elements, &addrin); + if (!surf->u.gfx9.dcc_retile_map) + return ADDR_OUTOFMEMORY; + } + } + } + + /* FMASK */ + if (in->numSamples > 1 && info->has_graphics && !(surf->flags & RADEON_SURF_NO_FMASK)) { + ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0}; + ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0}; + + fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT); + fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT); + + ret = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, in, true, &fin.swizzleMode); + if (ret != ADDR_OK) + return ret; + + fin.unalignedWidth = in->width; + fin.unalignedHeight = in->height; + fin.numSlices = in->numSlices; + fin.numSamples = in->numSamples; + fin.numFrags = in->numFrags; + + ret = Addr2ComputeFmaskInfo(addrlib->handle, &fin, &fout); + if (ret != ADDR_OK) + return ret; + + surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode; + surf->u.gfx9.fmask.epitch = fout.pitch - 1; + surf->fmask_size = fout.fmaskBytes; + surf->fmask_alignment = fout.baseAlign; + + /* Compute tile swizzle for the FMASK surface. */ + if (config->info.fmask_surf_index && fin.swizzleMode >= ADDR_SW_64KB_Z_T && + !(surf->flags & RADEON_SURF_SHAREABLE)) { + ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0}; + ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0}; + + xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT); + xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT); + + /* This counter starts from 1 instead of 0. 
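Both swizzle counters are bumped with p_atomic_inc_return, which yields the post-increment value: the color path subtracts 1 to start at 0, while the FMASK path uses it as-is and starts at 1. A sketch with C11 atomics standing in for p_atomic_inc_return:

#include <stdatomic.h>

static unsigned next_surf_index(atomic_uint *counter, int starts_from_one)
{
   unsigned incremented = atomic_fetch_add(counter, 1) + 1; /* p_atomic_inc_return */
   return starts_from_one ? incremented : incremented - 1;
}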
*/ + xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index); + xin.flags = in->flags; + xin.swizzleMode = fin.swizzleMode; + xin.resourceType = in->resourceType; + xin.format = in->format; + xin.numSamples = in->numSamples; + xin.numFrags = in->numFrags; + + ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout); + if (ret != ADDR_OK) + return ret; + + assert(xout.pipeBankXor <= u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8)); + surf->fmask_tile_swizzle = xout.pipeBankXor; + } + } + + /* CMASK -- on GFX10 only for FMASK */ + if (in->swizzleMode != ADDR_SW_LINEAR && in->resourceType == ADDR_RSRC_TEX_2D && + ((info->chip_class <= GFX9 && in->numSamples == 1 && in->flags.metaPipeUnaligned == 0 && + in->flags.metaRbUnaligned == 0) || + (surf->fmask_size && in->numSamples >= 2))) { + ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0}; + ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0}; + + cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT); + cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT); + + assert(in->flags.metaPipeUnaligned == 0); + assert(in->flags.metaRbUnaligned == 0); + + cin.cMaskFlags.pipeAligned = 1; + cin.cMaskFlags.rbAligned = 1; + cin.resourceType = in->resourceType; + cin.unalignedWidth = in->width; + cin.unalignedHeight = in->height; + cin.numSlices = in->numSlices; + + if (in->numSamples > 1) + cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode; + else + cin.swizzleMode = in->swizzleMode; + + ret = Addr2ComputeCmaskInfo(addrlib->handle, &cin, &cout); + if (ret != ADDR_OK) + return ret; + + surf->cmask_size = cout.cmaskBytes; + surf->cmask_alignment = cout.baseAlign; + } + } + + return 0; } -static int gfx9_compute_surface(struct ac_addrlib *addrlib, - const struct radeon_info *info, - const struct ac_surf_config *config, - enum radeon_surf_mode mode, - struct radeon_surf *surf) +static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info, + const struct ac_surf_config *config, enum radeon_surf_mode mode, + struct radeon_surf *surf) { - bool compressed; - ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; - int r; - - AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT); - - compressed = surf->blk_w == 4 && surf->blk_h == 4; - - /* The format must be set correctly for the allocation of compressed - * textures to work. In other cases, setting the bpp is sufficient. 
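The CMASK condition in the hunk above admits two cases: single-sample color on GFX9 and older with pipe/RB-aligned metadata, or any chip that allocated FMASK for MSAA. A sketch of the predicate with a hypothetical helper name and flags as plain ints:

static int wants_cmask(int is_linear, int is_tex_2d, int gfx9_or_older,
                       unsigned samples, int meta_aligned, int has_fmask)
{
   if (is_linear || !is_tex_2d)
      return 0;
   return (gfx9_or_older && samples == 1 && meta_aligned) ||
          (has_fmask && samples >= 2);
}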
*/ - if (compressed) { - switch (surf->bpe) { - case 8: - AddrSurfInfoIn.format = ADDR_FMT_BC1; - break; - case 16: - AddrSurfInfoIn.format = ADDR_FMT_BC3; - break; - default: - assert(0); - } - } else { - switch (surf->bpe) { - case 1: - assert(!(surf->flags & RADEON_SURF_ZBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_8; - break; - case 2: - assert(surf->flags & RADEON_SURF_ZBUFFER || - !(surf->flags & RADEON_SURF_SBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_16; - break; - case 4: - assert(surf->flags & RADEON_SURF_ZBUFFER || - !(surf->flags & RADEON_SURF_SBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_32; - break; - case 8: - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_32_32; - break; - case 12: - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_32_32_32; - break; - case 16: - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32; - break; - default: - assert(0); - } - AddrSurfInfoIn.bpp = surf->bpe * 8; - } - - bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); - AddrSurfInfoIn.flags.color = is_color_surface && - !(surf->flags & RADEON_SURF_NO_RENDER_TARGET); - AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; - AddrSurfInfoIn.flags.display = get_display_flag(config, surf); - /* flags.texture currently refers to TC-compatible HTILE */ - AddrSurfInfoIn.flags.texture = is_color_surface || - surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE; - AddrSurfInfoIn.flags.opt4space = 1; - - AddrSurfInfoIn.numMipLevels = config->info.levels; - AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples); - AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples; - - if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) - AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples); - - /* GFX9 doesn't support 1D depth textures, so allocate all 1D textures - * as 2D to avoid having shader variants for 1D vs 2D, so all shaders - * must sample 1D textures as 2D. */ - if (config->is_3d) - AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D; - else if (info->chip_class != GFX9 && config->is_1d) - AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_1D; - else - AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D; - - AddrSurfInfoIn.width = config->info.width; - AddrSurfInfoIn.height = config->info.height; - - if (config->is_3d) - AddrSurfInfoIn.numSlices = config->info.depth; - else if (config->is_cube) - AddrSurfInfoIn.numSlices = 6; - else - AddrSurfInfoIn.numSlices = config->info.array_size; - - /* This is propagated to DCC. It must be 0 for HTILE and CMASK. */ - AddrSurfInfoIn.flags.metaPipeUnaligned = 0; - AddrSurfInfoIn.flags.metaRbUnaligned = 0; - - /* Optimal values for the L2 cache. */ - if (info->chip_class == GFX9) { - surf->u.gfx9.dcc.independent_64B_blocks = 1; - surf->u.gfx9.dcc.independent_128B_blocks = 0; - surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - } else if (info->chip_class >= GFX10) { - surf->u.gfx9.dcc.independent_64B_blocks = 0; - surf->u.gfx9.dcc.independent_128B_blocks = 1; - surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; - } - - if (AddrSurfInfoIn.flags.display) { - /* The display hardware can only read DCC with RB_ALIGNED=0 and - * PIPE_ALIGNED=0. PIPE_ALIGNED really means L2CACHE_ALIGNED. - * - * The CB block requires RB_ALIGNED=1 except 1 RB chips. - * PIPE_ALIGNED is optional, but PIPE_ALIGNED=0 requires L2 flushes - * after rendering, so PIPE_ALIGNED=1 is recommended. 
- */ - if (info->use_display_dcc_unaligned) { - AddrSurfInfoIn.flags.metaPipeUnaligned = 1; - AddrSurfInfoIn.flags.metaRbUnaligned = 1; - } - - /* Adjust DCC settings to meet DCN requirements. */ - if (info->use_display_dcc_unaligned || - info->use_display_dcc_with_retile_blit) { - /* Only Navi12/14 support independent 64B blocks in L2, - * but without DCC image stores. - */ - if (info->family == CHIP_NAVI12 || - info->family == CHIP_NAVI14) { - surf->u.gfx9.dcc.independent_64B_blocks = 1; - surf->u.gfx9.dcc.independent_128B_blocks = 0; - surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - } - - if (info->chip_class >= GFX10_3) { - surf->u.gfx9.dcc.independent_64B_blocks = 1; - surf->u.gfx9.dcc.independent_128B_blocks = 1; - surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - } - } - } - - switch (mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - assert(config->info.samples <= 1); - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR; - break; - - case RADEON_SURF_MODE_1D: - case RADEON_SURF_MODE_2D: - if (surf->flags & RADEON_SURF_IMPORTED || - (info->chip_class >= GFX10 && - surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE)) { - AddrSurfInfoIn.swizzleMode = surf->u.gfx9.surf.swizzle_mode; - break; - } - - r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, - false, &AddrSurfInfoIn.swizzleMode); - if (r) - return r; - break; - - default: - assert(0); - } - - surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType; - surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER); - - surf->num_dcc_levels = 0; - surf->surf_size = 0; - surf->fmask_size = 0; - surf->dcc_size = 0; - surf->htile_size = 0; - surf->htile_slice_size = 0; - surf->u.gfx9.surf_offset = 0; - surf->u.gfx9.stencil_offset = 0; - surf->cmask_size = 0; - surf->u.gfx9.dcc_retile_use_uint16 = false; - surf->u.gfx9.dcc_retile_num_elements = 0; - surf->u.gfx9.dcc_retile_map = NULL; - - /* Calculate texture layout information. */ - r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, - &AddrSurfInfoIn); - if (r) - return r; - - /* Calculate texture layout information for stencil. */ - if (surf->flags & RADEON_SURF_SBUFFER) { - AddrSurfInfoIn.flags.stencil = 1; - AddrSurfInfoIn.bpp = 8; - AddrSurfInfoIn.format = ADDR_FMT_8; - - if (!AddrSurfInfoIn.flags.depth) { - r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, - false, &AddrSurfInfoIn.swizzleMode); - if (r) - return r; - } else - AddrSurfInfoIn.flags.depth = 0; - - r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, - &AddrSurfInfoIn); - if (r) - return r; - } - - surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR; - - /* Query whether the surface is displayable. */ - /* This is only useful for surfaces that are allocated without SCANOUT. */ - bool displayable = false; - if (!config->is_3d && !config->is_cube) { - r = Addr2IsValidDisplaySwizzleMode(addrlib->handle, surf->u.gfx9.surf.swizzle_mode, - surf->bpe * 8, &displayable); - if (r) - return r; - - /* Display needs unaligned DCC. */ - if (surf->num_dcc_levels && - (!is_dcc_supported_by_DCN(info, config, surf, - surf->u.gfx9.dcc.rb_aligned, - surf->u.gfx9.dcc.pipe_aligned) || - /* Don't set is_displayable if displayable DCC is missing. 
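Gathering the displayability checks above into one predicate; a sketch (hypothetical helper, flags as plain ints):

static int surface_is_displayable(int swizzle_ok, unsigned num_dcc_levels,
                                  int dcc_ok_for_dcn, int uses_retile_blit,
                                  unsigned retile_num_elements)
{
   if (!swizzle_ok)
      return 0;
   if (!num_dcc_levels)
      return 1;                     /* no DCC, the swizzle mode alone decides */
   if (!dcc_ok_for_dcn)
      return 0;                     /* display needs unaligned DCC */
   if (uses_retile_blit && !retile_num_elements)
      return 0;                     /* displayable DCC is missing */
   return 1;
}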
*/ - (info->use_display_dcc_with_retile_blit && - !surf->u.gfx9.dcc_retile_num_elements))) - displayable = false; - } - surf->is_displayable = displayable; - - /* Validate that we allocated a displayable surface if requested. */ - assert(!AddrSurfInfoIn.flags.display || surf->is_displayable); - - /* Validate that DCC is set up correctly. */ - if (surf->num_dcc_levels) { - assert(is_dcc_supported_by_L2(info, surf)); - if (AddrSurfInfoIn.flags.color) - assert(is_dcc_supported_by_CB(info, surf->u.gfx9.surf.swizzle_mode)); - if (AddrSurfInfoIn.flags.display) { - assert(is_dcc_supported_by_DCN(info, config, surf, - surf->u.gfx9.dcc.rb_aligned, - surf->u.gfx9.dcc.pipe_aligned)); - } - } - - if (info->has_graphics && - !compressed && - !config->is_3d && - config->info.levels == 1 && - AddrSurfInfoIn.flags.color && - !surf->is_linear && - surf->surf_alignment >= 64 * 1024 && /* 64KB tiling */ - !(surf->flags & (RADEON_SURF_DISABLE_DCC | - RADEON_SURF_FORCE_SWIZZLE_MODE | - RADEON_SURF_FORCE_MICRO_TILE_MODE))) { - /* Validate that DCC is enabled if DCN can do it. */ - if ((info->use_display_dcc_unaligned || - info->use_display_dcc_with_retile_blit) && - AddrSurfInfoIn.flags.display && - surf->bpe == 4) { - assert(surf->num_dcc_levels); - } - - /* Validate that non-scanout DCC is always enabled. */ - if (!AddrSurfInfoIn.flags.display) - assert(surf->num_dcc_levels); - } - - if (!surf->htile_size) { - /* Unset this if HTILE is not present. */ - surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; - } - - switch (surf->u.gfx9.surf.swizzle_mode) { - /* S = standard. */ - case ADDR_SW_256B_S: - case ADDR_SW_4KB_S: - case ADDR_SW_64KB_S: - case ADDR_SW_64KB_S_T: - case ADDR_SW_4KB_S_X: - case ADDR_SW_64KB_S_X: - surf->micro_tile_mode = RADEON_MICRO_MODE_STANDARD; - break; - - /* D = display. */ - case ADDR_SW_LINEAR: - case ADDR_SW_256B_D: - case ADDR_SW_4KB_D: - case ADDR_SW_64KB_D: - case ADDR_SW_64KB_D_T: - case ADDR_SW_4KB_D_X: - case ADDR_SW_64KB_D_X: - surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY; - break; - - /* R = rotated (gfx9), render target (gfx10). */ - case ADDR_SW_256B_R: - case ADDR_SW_4KB_R: - case ADDR_SW_64KB_R: - case ADDR_SW_64KB_R_T: - case ADDR_SW_4KB_R_X: - case ADDR_SW_64KB_R_X: - case ADDR_SW_VAR_R_X: - /* The rotated micro tile mode doesn't work if both CMASK and RB+ are - * used at the same time. We currently do not use rotated - * in gfx9. - */ - assert(info->chip_class >= GFX10 || - !"rotate micro tile mode is unsupported"); - surf->micro_tile_mode = RADEON_MICRO_MODE_RENDER; - break; - - /* Z = depth. */ - case ADDR_SW_4KB_Z: - case ADDR_SW_64KB_Z: - case ADDR_SW_64KB_Z_T: - case ADDR_SW_4KB_Z_X: - case ADDR_SW_64KB_Z_X: - case ADDR_SW_VAR_Z_X: - surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH; - break; - - default: - assert(0); - } - - return 0; + bool compressed; + ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; + int r; + + AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT); + + compressed = surf->blk_w == 4 && surf->blk_h == 4; + + /* The format must be set correctly for the allocation of compressed + * textures to work. In other cases, setting the bpp is sufficient. 
*/ + if (compressed) { + switch (surf->bpe) { + case 8: + AddrSurfInfoIn.format = ADDR_FMT_BC1; + break; + case 16: + AddrSurfInfoIn.format = ADDR_FMT_BC3; + break; + default: + assert(0); + } + } else { + switch (surf->bpe) { + case 1: + assert(!(surf->flags & RADEON_SURF_ZBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_8; + break; + case 2: + assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_16; + break; + case 4: + assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_32; + break; + case 8: + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_32_32; + break; + case 12: + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_32_32_32; + break; + case 16: + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32; + break; + default: + assert(0); + } + AddrSurfInfoIn.bpp = surf->bpe * 8; + } + + bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); + AddrSurfInfoIn.flags.color = is_color_surface && !(surf->flags & RADEON_SURF_NO_RENDER_TARGET); + AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; + AddrSurfInfoIn.flags.display = get_display_flag(config, surf); + /* flags.texture currently refers to TC-compatible HTILE */ + AddrSurfInfoIn.flags.texture = is_color_surface || surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE; + AddrSurfInfoIn.flags.opt4space = 1; + + AddrSurfInfoIn.numMipLevels = config->info.levels; + AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples); + AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples; + + if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) + AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples); + + /* GFX9 doesn't support 1D depth textures, so allocate all 1D textures + * as 2D to avoid having shader variants for 1D vs 2D, so all shaders + * must sample 1D textures as 2D. */ + if (config->is_3d) + AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D; + else if (info->chip_class != GFX9 && config->is_1d) + AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_1D; + else + AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D; + + AddrSurfInfoIn.width = config->info.width; + AddrSurfInfoIn.height = config->info.height; + + if (config->is_3d) + AddrSurfInfoIn.numSlices = config->info.depth; + else if (config->is_cube) + AddrSurfInfoIn.numSlices = 6; + else + AddrSurfInfoIn.numSlices = config->info.array_size; + + /* This is propagated to DCC. It must be 0 for HTILE and CMASK. */ + AddrSurfInfoIn.flags.metaPipeUnaligned = 0; + AddrSurfInfoIn.flags.metaRbUnaligned = 0; + + /* Optimal values for the L2 cache. */ + if (info->chip_class == GFX9) { + surf->u.gfx9.dcc.independent_64B_blocks = 1; + surf->u.gfx9.dcc.independent_128B_blocks = 0; + surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + } else if (info->chip_class >= GFX10) { + surf->u.gfx9.dcc.independent_64B_blocks = 0; + surf->u.gfx9.dcc.independent_128B_blocks = 1; + surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } + + if (AddrSurfInfoIn.flags.display) { + /* The display hardware can only read DCC with RB_ALIGNED=0 and + * PIPE_ALIGNED=0. PIPE_ALIGNED really means L2CACHE_ALIGNED. + * + * The CB block requires RB_ALIGNED=1 except 1 RB chips. + * PIPE_ALIGNED is optional, but PIPE_ALIGNED=0 requires L2 flushes + * after rendering, so PIPE_ALIGNED=1 is recommended. 
+ */ + if (info->use_display_dcc_unaligned) { + AddrSurfInfoIn.flags.metaPipeUnaligned = 1; + AddrSurfInfoIn.flags.metaRbUnaligned = 1; + } + + /* Adjust DCC settings to meet DCN requirements. */ + if (info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit) { + /* Only Navi12/14 support independent 64B blocks in L2, + * but without DCC image stores. + */ + if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14) { + surf->u.gfx9.dcc.independent_64B_blocks = 1; + surf->u.gfx9.dcc.independent_128B_blocks = 0; + surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + } + + if (info->chip_class >= GFX10_3) { + surf->u.gfx9.dcc.independent_64B_blocks = 1; + surf->u.gfx9.dcc.independent_128B_blocks = 1; + surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + } + } + } + + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + assert(config->info.samples <= 1); + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR; + break; + + case RADEON_SURF_MODE_1D: + case RADEON_SURF_MODE_2D: + if (surf->flags & RADEON_SURF_IMPORTED || + (info->chip_class >= GFX10 && surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE)) { + AddrSurfInfoIn.swizzleMode = surf->u.gfx9.surf.swizzle_mode; + break; + } + + r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, false, + &AddrSurfInfoIn.swizzleMode); + if (r) + return r; + break; + + default: + assert(0); + } + + surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType; + surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER); + + surf->num_dcc_levels = 0; + surf->surf_size = 0; + surf->fmask_size = 0; + surf->dcc_size = 0; + surf->htile_size = 0; + surf->htile_slice_size = 0; + surf->u.gfx9.surf_offset = 0; + surf->u.gfx9.stencil_offset = 0; + surf->cmask_size = 0; + surf->u.gfx9.dcc_retile_use_uint16 = false; + surf->u.gfx9.dcc_retile_num_elements = 0; + surf->u.gfx9.dcc_retile_map = NULL; + + /* Calculate texture layout information. */ + r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, &AddrSurfInfoIn); + if (r) + return r; + + /* Calculate texture layout information for stencil. */ + if (surf->flags & RADEON_SURF_SBUFFER) { + AddrSurfInfoIn.flags.stencil = 1; + AddrSurfInfoIn.bpp = 8; + AddrSurfInfoIn.format = ADDR_FMT_8; + + if (!AddrSurfInfoIn.flags.depth) { + r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, false, + &AddrSurfInfoIn.swizzleMode); + if (r) + return r; + } else + AddrSurfInfoIn.flags.depth = 0; + + r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, &AddrSurfInfoIn); + if (r) + return r; + } + + surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR; + + /* Query whether the surface is displayable. */ + /* This is only useful for surfaces that are allocated without SCANOUT. */ + bool displayable = false; + if (!config->is_3d && !config->is_cube) { + r = Addr2IsValidDisplaySwizzleMode(addrlib->handle, surf->u.gfx9.surf.swizzle_mode, + surf->bpe * 8, &displayable); + if (r) + return r; + + /* Display needs unaligned DCC. */ + if (surf->num_dcc_levels && + (!is_dcc_supported_by_DCN(info, config, surf, surf->u.gfx9.dcc.rb_aligned, + surf->u.gfx9.dcc.pipe_aligned) || + /* Don't set is_displayable if displayable DCC is missing. */ + (info->use_display_dcc_with_retile_blit && !surf->u.gfx9.dcc_retile_num_elements))) + displayable = false; + } + surf->is_displayable = displayable; + + /* Validate that we allocated a displayable surface if requested. 
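
[Editor's note — illustrative sketch, not part of this patch. The DCC
block-size selection above is split across two if-blocks; the hypothetical
helper below restates the same decisions in one place. All types and
register values are the ones already used by gfx9_compute_surface:]

   static void choose_dcc_block_settings(const struct radeon_info *info, bool displayable,
                                         struct gfx9_surf_meta_flags *dcc)
   {
      if (info->chip_class == GFX9) {
         /* L2-optimal defaults on gfx9. */
         dcc->independent_64B_blocks = 1;
         dcc->independent_128B_blocks = 0;
         dcc->max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
      } else {
         /* L2-optimal defaults on gfx10+. */
         dcc->independent_64B_blocks = 0;
         dcc->independent_128B_blocks = 1;
         dcc->max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
      }

      if (displayable &&
          (info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit)) {
         if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14) {
            /* Navi12/14: independent 64B blocks only, without DCC image stores. */
            dcc->independent_64B_blocks = 1;
            dcc->independent_128B_blocks = 0;
            dcc->max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
         }
         if (info->chip_class >= GFX10_3) {
            /* gfx10.3: both block sizes can be independent. */
            dcc->independent_64B_blocks = 1;
            dcc->independent_128B_blocks = 1;
            dcc->max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
         }
      }
   }
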
*/ + assert(!AddrSurfInfoIn.flags.display || surf->is_displayable); + + /* Validate that DCC is set up correctly. */ + if (surf->num_dcc_levels) { + assert(is_dcc_supported_by_L2(info, surf)); + if (AddrSurfInfoIn.flags.color) + assert(is_dcc_supported_by_CB(info, surf->u.gfx9.surf.swizzle_mode)); + if (AddrSurfInfoIn.flags.display) { + assert(is_dcc_supported_by_DCN(info, config, surf, surf->u.gfx9.dcc.rb_aligned, + surf->u.gfx9.dcc.pipe_aligned)); + } + } + + if (info->has_graphics && !compressed && !config->is_3d && config->info.levels == 1 && + AddrSurfInfoIn.flags.color && !surf->is_linear && + surf->surf_alignment >= 64 * 1024 && /* 64KB tiling */ + !(surf->flags & (RADEON_SURF_DISABLE_DCC | RADEON_SURF_FORCE_SWIZZLE_MODE | + RADEON_SURF_FORCE_MICRO_TILE_MODE))) { + /* Validate that DCC is enabled if DCN can do it. */ + if ((info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit) && + AddrSurfInfoIn.flags.display && surf->bpe == 4) { + assert(surf->num_dcc_levels); + } + + /* Validate that non-scanout DCC is always enabled. */ + if (!AddrSurfInfoIn.flags.display) + assert(surf->num_dcc_levels); + } + + if (!surf->htile_size) { + /* Unset this if HTILE is not present. */ + surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE; + } + + switch (surf->u.gfx9.surf.swizzle_mode) { + /* S = standard. */ + case ADDR_SW_256B_S: + case ADDR_SW_4KB_S: + case ADDR_SW_64KB_S: + case ADDR_SW_64KB_S_T: + case ADDR_SW_4KB_S_X: + case ADDR_SW_64KB_S_X: + surf->micro_tile_mode = RADEON_MICRO_MODE_STANDARD; + break; + + /* D = display. */ + case ADDR_SW_LINEAR: + case ADDR_SW_256B_D: + case ADDR_SW_4KB_D: + case ADDR_SW_64KB_D: + case ADDR_SW_64KB_D_T: + case ADDR_SW_4KB_D_X: + case ADDR_SW_64KB_D_X: + surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY; + break; + + /* R = rotated (gfx9), render target (gfx10). */ + case ADDR_SW_256B_R: + case ADDR_SW_4KB_R: + case ADDR_SW_64KB_R: + case ADDR_SW_64KB_R_T: + case ADDR_SW_4KB_R_X: + case ADDR_SW_64KB_R_X: + case ADDR_SW_VAR_R_X: + /* The rotated micro tile mode doesn't work if both CMASK and RB+ are + * used at the same time. We currently do not use rotated + * in gfx9. + */ + assert(info->chip_class >= GFX10 || !"rotate micro tile mode is unsupported"); + surf->micro_tile_mode = RADEON_MICRO_MODE_RENDER; + break; + + /* Z = depth. */ + case ADDR_SW_4KB_Z: + case ADDR_SW_64KB_Z: + case ADDR_SW_64KB_Z_T: + case ADDR_SW_4KB_Z_X: + case ADDR_SW_64KB_Z_X: + case ADDR_SW_VAR_Z_X: + surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH; + break; + + default: + assert(0); + } + + return 0; } int ac_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info, - const struct ac_surf_config *config, - enum radeon_surf_mode mode, - struct radeon_surf *surf) + const struct ac_surf_config *config, enum radeon_surf_mode mode, + struct radeon_surf *surf) { - int r; - - r = surf_config_sanity(config, surf->flags); - if (r) - return r; - - if (info->chip_class >= GFX9) - r = gfx9_compute_surface(addrlib, info, config, mode, surf); - else - r = gfx6_compute_surface(addrlib->handle, info, config, mode, surf); - - if (r) - return r; - - /* Determine the memory layout of multiple allocations in one buffer. 
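
[Editor's note — hypothetical caller of ac_compute_surface, shown for
context; the values are illustrative and error handling is omitted:]

   struct ac_surf_config config = {0};
   config.info.width = 1920;
   config.info.height = 1080;
   config.info.depth = 1;
   config.info.samples = 1;
   config.info.storage_samples = 1;
   config.info.levels = 1;
   config.info.array_size = 1;

   struct radeon_surf surf = {0};
   surf.blk_w = 1; /* not block-compressed */
   surf.blk_h = 1;
   surf.bpe = 4;   /* 4 bytes per element, e.g. RGBA8 */

   if (!ac_compute_surface(addrlib, info, &config, RADEON_SURF_MODE_2D, &surf)) {
      /* surf.total_size and surf.alignment describe the allocation;
       * surf.htile_offset, surf.dcc_offset, etc. are sub-buffer offsets. */
   }
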
*/ - surf->total_size = surf->surf_size; - surf->alignment = surf->surf_alignment; - - if (surf->htile_size) { - surf->htile_offset = align64(surf->total_size, surf->htile_alignment); - surf->total_size = surf->htile_offset + surf->htile_size; - surf->alignment = MAX2(surf->alignment, surf->htile_alignment); - } - - if (surf->fmask_size) { - assert(config->info.samples >= 2); - surf->fmask_offset = align64(surf->total_size, surf->fmask_alignment); - surf->total_size = surf->fmask_offset + surf->fmask_size; - surf->alignment = MAX2(surf->alignment, surf->fmask_alignment); - } - - /* Single-sample CMASK is in a separate buffer. */ - if (surf->cmask_size && config->info.samples >= 2) { - surf->cmask_offset = align64(surf->total_size, surf->cmask_alignment); - surf->total_size = surf->cmask_offset + surf->cmask_size; - surf->alignment = MAX2(surf->alignment, surf->cmask_alignment); - } - - if (surf->is_displayable) - surf->flags |= RADEON_SURF_SCANOUT; - - if (surf->dcc_size && - /* dcc_size is computed on GFX9+ only if it's displayable. */ - (info->chip_class >= GFX9 || !get_display_flag(config, surf))) { - /* It's better when displayable DCC is immediately after - * the image due to hw-specific reasons. - */ - if (info->chip_class >= GFX9 && - surf->u.gfx9.dcc_retile_num_elements) { - /* Add space for the displayable DCC buffer. */ - surf->display_dcc_offset = - align64(surf->total_size, surf->u.gfx9.display_dcc_alignment); - surf->total_size = surf->display_dcc_offset + - surf->u.gfx9.display_dcc_size; - - /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */ - surf->dcc_retile_map_offset = - align64(surf->total_size, info->tcc_cache_line_size); - - if (surf->u.gfx9.dcc_retile_use_uint16) { - surf->total_size = surf->dcc_retile_map_offset + - surf->u.gfx9.dcc_retile_num_elements * 2; - } else { - surf->total_size = surf->dcc_retile_map_offset + - surf->u.gfx9.dcc_retile_num_elements * 4; - } - } - - surf->dcc_offset = align64(surf->total_size, surf->dcc_alignment); - surf->total_size = surf->dcc_offset + surf->dcc_size; - surf->alignment = MAX2(surf->alignment, surf->dcc_alignment); - } - - return 0; + int r; + + r = surf_config_sanity(config, surf->flags); + if (r) + return r; + + if (info->chip_class >= GFX9) + r = gfx9_compute_surface(addrlib, info, config, mode, surf); + else + r = gfx6_compute_surface(addrlib->handle, info, config, mode, surf); + + if (r) + return r; + + /* Determine the memory layout of multiple allocations in one buffer. */ + surf->total_size = surf->surf_size; + surf->alignment = surf->surf_alignment; + + if (surf->htile_size) { + surf->htile_offset = align64(surf->total_size, surf->htile_alignment); + surf->total_size = surf->htile_offset + surf->htile_size; + surf->alignment = MAX2(surf->alignment, surf->htile_alignment); + } + + if (surf->fmask_size) { + assert(config->info.samples >= 2); + surf->fmask_offset = align64(surf->total_size, surf->fmask_alignment); + surf->total_size = surf->fmask_offset + surf->fmask_size; + surf->alignment = MAX2(surf->alignment, surf->fmask_alignment); + } + + /* Single-sample CMASK is in a separate buffer. */ + if (surf->cmask_size && config->info.samples >= 2) { + surf->cmask_offset = align64(surf->total_size, surf->cmask_alignment); + surf->total_size = surf->cmask_offset + surf->cmask_size; + surf->alignment = MAX2(surf->alignment, surf->cmask_alignment); + } + + if (surf->is_displayable) + surf->flags |= RADEON_SURF_SCANOUT; + + if (surf->dcc_size && + /* dcc_size is computed on GFX9+ only if it's displayable. 
*/ + (info->chip_class >= GFX9 || !get_display_flag(config, surf))) { + /* It's better when displayable DCC is immediately after + * the image due to hw-specific reasons. + */ + if (info->chip_class >= GFX9 && surf->u.gfx9.dcc_retile_num_elements) { + /* Add space for the displayable DCC buffer. */ + surf->display_dcc_offset = align64(surf->total_size, surf->u.gfx9.display_dcc_alignment); + surf->total_size = surf->display_dcc_offset + surf->u.gfx9.display_dcc_size; + + /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */ + surf->dcc_retile_map_offset = align64(surf->total_size, info->tcc_cache_line_size); + + if (surf->u.gfx9.dcc_retile_use_uint16) { + surf->total_size = + surf->dcc_retile_map_offset + surf->u.gfx9.dcc_retile_num_elements * 2; + } else { + surf->total_size = + surf->dcc_retile_map_offset + surf->u.gfx9.dcc_retile_num_elements * 4; + } + } + + surf->dcc_offset = align64(surf->total_size, surf->dcc_alignment); + surf->total_size = surf->dcc_offset + surf->dcc_size; + surf->alignment = MAX2(surf->alignment, surf->dcc_alignment); + } + + return 0; } /* This is meant to be used for disabling DCC. */ @@ -2289,14 +2166,28 @@ void ac_surface_zero_dcc_fields(struct radeon_surf *surf) static unsigned eg_tile_split(unsigned tile_split) { switch (tile_split) { - case 0: tile_split = 64; break; - case 1: tile_split = 128; break; - case 2: tile_split = 256; break; - case 3: tile_split = 512; break; + case 0: + tile_split = 64; + break; + case 1: + tile_split = 128; + break; + case 2: + tile_split = 256; + break; + case 3: + tile_split = 512; + break; default: - case 4: tile_split = 1024; break; - case 5: tile_split = 2048; break; - case 6: tile_split = 4096; break; + case 4: + tile_split = 1024; + break; + case 5: + tile_split = 2048; + break; + case 6: + tile_split = 4096; + break; } return tile_split; } @@ -2304,35 +2195,45 @@ static unsigned eg_tile_split(unsigned tile_split) static unsigned eg_tile_split_rev(unsigned eg_tile_split) { switch (eg_tile_split) { - case 64: return 0; - case 128: return 1; - case 256: return 2; - case 512: return 3; + case 64: + return 0; + case 128: + return 1; + case 256: + return 2; + case 512: + return 3; default: - case 1024: return 4; - case 2048: return 5; - case 4096: return 6; + case 1024: + return 4; + case 2048: + return 5; + case 4096: + return 6; } } -#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45 -#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK 0x3 +#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45 +#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK 0x3 /* This should be called before ac_compute_surface. 
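
[Editor's note — the layout code above repeats one pattern: align the
running size to the sub-buffer's alignment, record the offset, then grow
the total. A hypothetical helper expressing it (align64 and MAX2 are
existing Mesa util macros):]

   static uint64_t suballoc_append(uint64_t *total_size, uint32_t *alignment,
                                   uint64_t size, uint32_t align)
   {
      uint64_t offset = align64(*total_size, align);
      *total_size = offset + size;
      *alignment = MAX2(*alignment, align);
      return offset;
   }

   /* e.g.: surf->htile_offset = suballoc_append(&surf->total_size, &surf->alignment,
    *                                            surf->htile_size, surf->htile_alignment); */
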
*/ -void ac_surface_set_bo_metadata(const struct radeon_info *info, - struct radeon_surf *surf, uint64_t tiling_flags, - enum radeon_surf_mode *mode) +void ac_surface_set_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf, + uint64_t tiling_flags, enum radeon_surf_mode *mode) { bool scanout; if (info->chip_class >= GFX9) { surf->u.gfx9.surf.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE); - surf->u.gfx9.dcc.independent_64B_blocks = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B); - surf->u.gfx9.dcc.independent_128B_blocks = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_128B); - surf->u.gfx9.dcc.max_compressed_block_size = AMDGPU_TILING_GET(tiling_flags, DCC_MAX_COMPRESSED_BLOCK_SIZE); + surf->u.gfx9.dcc.independent_64B_blocks = + AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B); + surf->u.gfx9.dcc.independent_128B_blocks = + AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_128B); + surf->u.gfx9.dcc.max_compressed_block_size = + AMDGPU_TILING_GET(tiling_flags, DCC_MAX_COMPRESSED_BLOCK_SIZE); surf->u.gfx9.display_dcc_pitch_max = AMDGPU_TILING_GET(tiling_flags, DCC_PITCH_MAX); scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT); - *mode = surf->u.gfx9.surf.swizzle_mode > 0 ? RADEON_SURF_MODE_2D : RADEON_SURF_MODE_LINEAR_ALIGNED; + *mode = + surf->u.gfx9.surf.swizzle_mode > 0 ? RADEON_SURF_MODE_2D : RADEON_SURF_MODE_LINEAR_ALIGNED; } else { surf->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); surf->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); @@ -2342,7 +2243,7 @@ void ac_surface_set_bo_metadata(const struct radeon_info *info, surf->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ - if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ + if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ *mode = RADEON_SURF_MODE_2D; else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ *mode = RADEON_SURF_MODE_1D; @@ -2356,8 +2257,8 @@ void ac_surface_set_bo_metadata(const struct radeon_info *info, surf->flags &= ~RADEON_SURF_SCANOUT; } -void ac_surface_get_bo_metadata(const struct radeon_info *info, - struct radeon_surf *surf, uint64_t *tiling_flags) +void ac_surface_get_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf, + uint64_t *tiling_flags) { *tiling_flags = 0; @@ -2365,17 +2266,19 @@ void ac_surface_get_bo_metadata(const struct radeon_info *info, uint64_t dcc_offset = 0; if (surf->dcc_offset) { - dcc_offset = surf->display_dcc_offset ? surf->display_dcc_offset - : surf->dcc_offset; + dcc_offset = surf->display_dcc_offset ? 
surf->display_dcc_offset : surf->dcc_offset; assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24)); } *tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, surf->u.gfx9.surf.swizzle_mode); *tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, dcc_offset >> 8); *tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, surf->u.gfx9.display_dcc_pitch_max); - *tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, surf->u.gfx9.dcc.independent_64B_blocks); - *tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, surf->u.gfx9.dcc.independent_128B_blocks); - *tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE, surf->u.gfx9.dcc.max_compressed_block_size); + *tiling_flags |= + AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, surf->u.gfx9.dcc.independent_64B_blocks); + *tiling_flags |= + AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, surf->u.gfx9.dcc.independent_128B_blocks); + *tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE, + surf->u.gfx9.dcc.max_compressed_block_size); *tiling_flags |= AMDGPU_TILING_SET(SCANOUT, (surf->flags & RADEON_SURF_SCANOUT) != 0); } else { if (surf->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D) @@ -2389,9 +2292,10 @@ void ac_surface_get_bo_metadata(const struct radeon_info *info, *tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(surf->u.legacy.bankw)); *tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(surf->u.legacy.bankh)); if (surf->u.legacy.tile_split) - *tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(surf->u.legacy.tile_split)); + *tiling_flags |= + AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(surf->u.legacy.tile_split)); *tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(surf->u.legacy.mtilea)); - *tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(surf->u.legacy.num_banks)-1); + *tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(surf->u.legacy.num_banks) - 1); if (surf->flags & RADEON_SURF_SCANOUT) *tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ @@ -2406,12 +2310,9 @@ static uint32_t ac_get_umd_metadata_word1(const struct radeon_info *info) } /* This should be called after ac_compute_surface. */ -bool ac_surface_set_umd_metadata(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_storage_samples, - unsigned num_mipmap_levels, - unsigned size_metadata, - uint32_t metadata[64]) +bool ac_surface_set_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_storage_samples, unsigned num_mipmap_levels, + unsigned size_metadata, uint32_t metadata[64]) { uint32_t *desc = &metadata[2]; uint64_t offset; @@ -2497,10 +2398,8 @@ bool ac_surface_set_umd_metadata(const struct radeon_info *info, return true; } -void ac_surface_get_umd_metadata(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_mipmap_levels, - uint32_t desc[8], +void ac_surface_get_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_mipmap_levels, uint32_t desc[8], unsigned *size_metadata, uint32_t metadata[64]) { /* Clear the base address and set the relative DCC offset. 
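
[Editor's note — the AMDGPU_TILING_SET/GET macros used above come from the
kernel UAPI header amdgpu_drm.h; quoted from memory, they are roughly:

   #define AMDGPU_TILING_SET(field, value) \
      (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT)
   #define AMDGPU_TILING_GET(value, field) \
      (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK)

A sketch of the round trip for a shared BO, per the "before/after
ac_compute_surface" comments in this file:]

   enum radeon_surf_mode mode;
   ac_surface_set_bo_metadata(info, &surf, tiling_flags, &mode);  /* before */
   ac_compute_surface(addrlib, info, &config, mode, &surf);
   ac_surface_get_bo_metadata(info, &surf, &tiling_flags);        /* after */
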
*/ @@ -2557,18 +2456,15 @@ void ac_surface_get_umd_metadata(const struct radeon_info *info, } } -void ac_surface_override_offset_stride(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_mipmap_levels, - uint64_t offset, unsigned pitch) +void ac_surface_override_offset_stride(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_mipmap_levels, uint64_t offset, unsigned pitch) { if (info->chip_class >= GFX9) { if (pitch) { surf->u.gfx9.surf_pitch = pitch; if (num_mipmap_levels == 1) surf->u.gfx9.surf.epitch = pitch - 1; - surf->u.gfx9.surf_slice_size = - (uint64_t)pitch * surf->u.gfx9.surf_height * surf->bpe; + surf->u.gfx9.surf_slice_size = (uint64_t)pitch * surf->u.gfx9.surf_height * surf->bpe; } surf->u.gfx9.surf_offset = offset; if (surf->u.gfx9.stencil_offset) @@ -2577,7 +2473,7 @@ void ac_surface_override_offset_stride(const struct radeon_info *info, if (pitch) { surf->u.legacy.level[0].nblk_x = pitch; surf->u.legacy.level[0].slice_size_dw = - ((uint64_t)pitch * surf->u.legacy.level[0].nblk_y * surf->bpe) / 4; + ((uint64_t)pitch * surf->u.legacy.level[0].nblk_y * surf->bpe) / 4; } if (offset) { diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h index 5dce253..5605ba7 100644 --- a/src/amd/common/ac_surface.h +++ b/src/amd/common/ac_surface.h @@ -26,11 +26,11 @@ #ifndef AC_SURFACE_H #define AC_SURFACE_H -#include -#include - #include "amd_family.h" +#include +#include + #ifdef __cplusplus extern "C" { #endif @@ -41,280 +41,274 @@ struct ac_addrlib; struct amdgpu_gpu_info; struct radeon_info; -#define RADEON_SURF_MAX_LEVELS 15 +#define RADEON_SURF_MAX_LEVELS 15 -enum radeon_surf_mode { - RADEON_SURF_MODE_LINEAR_ALIGNED = 1, - RADEON_SURF_MODE_1D = 2, - RADEON_SURF_MODE_2D = 3, +enum radeon_surf_mode +{ + RADEON_SURF_MODE_LINEAR_ALIGNED = 1, + RADEON_SURF_MODE_1D = 2, + RADEON_SURF_MODE_2D = 3, }; /* This describes D/S/Z/R swizzle modes. * Defined in the GB_TILE_MODEn.MICRO_TILE_MODE_NEW order. 
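
[Editor's note — usage sketch for ac_surface_override_offset_stride, e.g.
when honoring an explicit stride from a dma-buf import; import_offset and
import_pitch are hypothetical names:]

   ac_surface_override_offset_stride(info, &surf, config.info.levels,
                                     import_offset, import_pitch);
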
*/ -enum radeon_micro_mode { - RADEON_MICRO_MODE_DISPLAY = 0, - RADEON_MICRO_MODE_STANDARD = 1, - RADEON_MICRO_MODE_DEPTH = 2, - RADEON_MICRO_MODE_RENDER = 3, /* gfx9 and older: rotated */ +enum radeon_micro_mode +{ + RADEON_MICRO_MODE_DISPLAY = 0, + RADEON_MICRO_MODE_STANDARD = 1, + RADEON_MICRO_MODE_DEPTH = 2, + RADEON_MICRO_MODE_RENDER = 3, /* gfx9 and older: rotated */ }; /* the first 16 bits are reserved for libdrm_radeon, don't use them */ -#define RADEON_SURF_SCANOUT (1 << 16) -#define RADEON_SURF_ZBUFFER (1 << 17) -#define RADEON_SURF_SBUFFER (1 << 18) -#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER) +#define RADEON_SURF_SCANOUT (1 << 16) +#define RADEON_SURF_ZBUFFER (1 << 17) +#define RADEON_SURF_SBUFFER (1 << 18) +#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER) /* bits 19 and 20 are reserved for libdrm_radeon, don't use them */ -#define RADEON_SURF_FMASK (1 << 21) -#define RADEON_SURF_DISABLE_DCC (1 << 22) -#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23) -#define RADEON_SURF_IMPORTED (1 << 24) -#define RADEON_SURF_CONTIGUOUS_DCC_LAYERS (1 << 25) -#define RADEON_SURF_SHAREABLE (1 << 26) -#define RADEON_SURF_NO_RENDER_TARGET (1 << 27) +#define RADEON_SURF_FMASK (1 << 21) +#define RADEON_SURF_DISABLE_DCC (1 << 22) +#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23) +#define RADEON_SURF_IMPORTED (1 << 24) +#define RADEON_SURF_CONTIGUOUS_DCC_LAYERS (1 << 25) +#define RADEON_SURF_SHAREABLE (1 << 26) +#define RADEON_SURF_NO_RENDER_TARGET (1 << 27) /* Force a swizzle mode (gfx9+) or tile mode (gfx6-8). * If this is not set, optimize for space. */ -#define RADEON_SURF_FORCE_SWIZZLE_MODE (1 << 28) -#define RADEON_SURF_NO_FMASK (1 << 29) -#define RADEON_SURF_NO_HTILE (1 << 30) -#define RADEON_SURF_FORCE_MICRO_TILE_MODE (1u << 31) +#define RADEON_SURF_FORCE_SWIZZLE_MODE (1 << 28) +#define RADEON_SURF_NO_FMASK (1 << 29) +#define RADEON_SURF_NO_HTILE (1 << 30) +#define RADEON_SURF_FORCE_MICRO_TILE_MODE (1u << 31) struct legacy_surf_level { - uint64_t offset; - uint32_t slice_size_dw; /* in dwords; max = 4GB / 4. */ - uint32_t dcc_offset; /* relative offset within DCC mip tree */ - uint32_t dcc_fast_clear_size; - uint32_t dcc_slice_fast_clear_size; - unsigned nblk_x:15; - unsigned nblk_y:15; - enum radeon_surf_mode mode:2; + uint64_t offset; + uint32_t slice_size_dw; /* in dwords; max = 4GB / 4. */ + uint32_t dcc_offset; /* relative offset within DCC mip tree */ + uint32_t dcc_fast_clear_size; + uint32_t dcc_slice_fast_clear_size; + unsigned nblk_x : 15; + unsigned nblk_y : 15; + enum radeon_surf_mode mode : 2; }; struct legacy_surf_fmask { - unsigned slice_tile_max; /* max 4M */ - uint8_t tiling_index; /* max 31 */ - uint8_t bankh; /* max 8 */ - uint16_t pitch_in_pixels; - uint64_t slice_size; + unsigned slice_tile_max; /* max 4M */ + uint8_t tiling_index; /* max 31 */ + uint8_t bankh; /* max 8 */ + uint16_t pitch_in_pixels; + uint64_t slice_size; }; struct legacy_surf_layout { - unsigned bankw:4; /* max 8 */ - unsigned bankh:4; /* max 8 */ - unsigned mtilea:4; /* max 8 */ - unsigned tile_split:13; /* max 4K */ - unsigned stencil_tile_split:13; /* max 4K */ - unsigned pipe_config:5; /* max 17 */ - unsigned num_banks:5; /* max 16 */ - unsigned macro_tile_index:4; /* max 15 */ - - /* Whether the depth miptree or stencil miptree as used by the DB are - * adjusted from their TC compatible form to ensure depth/stencil - * compatibility. If either is true, the corresponding plane cannot be - * sampled from. 
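
[Editor's note — sketch of composing these flag bits for a depth/stencil
texture that wants TC-compatible HTILE; the combination is illustrative:]

   surf.flags = RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER |
                RADEON_SURF_TC_COMPATIBLE_HTILE;
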
- */ - unsigned depth_adjusted:1; - unsigned stencil_adjusted:1; - - struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS]; - struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS]; - uint8_t tiling_index[RADEON_SURF_MAX_LEVELS]; - uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS]; - struct legacy_surf_fmask fmask; - unsigned cmask_slice_tile_max; + unsigned bankw : 4; /* max 8 */ + unsigned bankh : 4; /* max 8 */ + unsigned mtilea : 4; /* max 8 */ + unsigned tile_split : 13; /* max 4K */ + unsigned stencil_tile_split : 13; /* max 4K */ + unsigned pipe_config : 5; /* max 17 */ + unsigned num_banks : 5; /* max 16 */ + unsigned macro_tile_index : 4; /* max 15 */ + + /* Whether the depth miptree or stencil miptree as used by the DB are + * adjusted from their TC compatible form to ensure depth/stencil + * compatibility. If either is true, the corresponding plane cannot be + * sampled from. + */ + unsigned depth_adjusted : 1; + unsigned stencil_adjusted : 1; + + struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS]; + struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS]; + uint8_t tiling_index[RADEON_SURF_MAX_LEVELS]; + uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS]; + struct legacy_surf_fmask fmask; + unsigned cmask_slice_tile_max; }; /* Same as addrlib - AddrResourceType. */ -enum gfx9_resource_type { - RADEON_RESOURCE_1D = 0, - RADEON_RESOURCE_2D, - RADEON_RESOURCE_3D, +enum gfx9_resource_type +{ + RADEON_RESOURCE_1D = 0, + RADEON_RESOURCE_2D, + RADEON_RESOURCE_3D, }; struct gfx9_surf_flags { - uint16_t swizzle_mode; /* tile mode */ - uint16_t epitch; /* (pitch - 1) or (height - 1) */ + uint16_t swizzle_mode; /* tile mode */ + uint16_t epitch; /* (pitch - 1) or (height - 1) */ }; struct gfx9_surf_meta_flags { - unsigned rb_aligned:1; /* optimal for RBs */ - unsigned pipe_aligned:1; /* optimal for TC */ - unsigned independent_64B_blocks:1; - unsigned independent_128B_blocks:1; - unsigned max_compressed_block_size:2; + unsigned rb_aligned : 1; /* optimal for RBs */ + unsigned pipe_aligned : 1; /* optimal for TC */ + unsigned independent_64B_blocks : 1; + unsigned independent_128B_blocks : 1; + unsigned max_compressed_block_size : 2; }; struct gfx9_surf_layout { - struct gfx9_surf_flags surf; /* color or depth surface */ - struct gfx9_surf_flags fmask; /* not added to surf_size */ - struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */ - - struct gfx9_surf_meta_flags dcc; /* metadata of color */ - - enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */ - uint16_t surf_pitch; /* in blocks */ - uint16_t surf_height; - - uint64_t surf_offset; /* 0 unless imported with an offset */ - /* The size of the 2D plane containing all mipmap levels. */ - uint64_t surf_slice_size; - /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */ - uint32_t offset[RADEON_SURF_MAX_LEVELS]; - /* Mipmap level pitch in elements. Only valid for LINEAR. */ - uint16_t pitch[RADEON_SURF_MAX_LEVELS]; - - uint64_t stencil_offset; /* separate stencil */ - - uint8_t dcc_block_width; - uint8_t dcc_block_height; - uint8_t dcc_block_depth; - - /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0. - * The 3D engine doesn't support that layout except for chips with 1 RB. - * All other chips must set rb_aligned=1. - * A compute shader needs to convert from aligned DCC to unaligned. 
- */ - uint32_t display_dcc_size; - uint32_t display_dcc_alignment; - uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */ - bool dcc_retile_use_uint16; /* if all values fit into uint16_t */ - uint32_t dcc_retile_num_elements; - void *dcc_retile_map; + struct gfx9_surf_flags surf; /* color or depth surface */ + struct gfx9_surf_flags fmask; /* not added to surf_size */ + struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */ + + struct gfx9_surf_meta_flags dcc; /* metadata of color */ + + enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */ + uint16_t surf_pitch; /* in blocks */ + uint16_t surf_height; + + uint64_t surf_offset; /* 0 unless imported with an offset */ + /* The size of the 2D plane containing all mipmap levels. */ + uint64_t surf_slice_size; + /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */ + uint32_t offset[RADEON_SURF_MAX_LEVELS]; + /* Mipmap level pitch in elements. Only valid for LINEAR. */ + uint16_t pitch[RADEON_SURF_MAX_LEVELS]; + + uint64_t stencil_offset; /* separate stencil */ + + uint8_t dcc_block_width; + uint8_t dcc_block_height; + uint8_t dcc_block_depth; + + /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0. + * The 3D engine doesn't support that layout except for chips with 1 RB. + * All other chips must set rb_aligned=1. + * A compute shader needs to convert from aligned DCC to unaligned. + */ + uint32_t display_dcc_size; + uint32_t display_dcc_alignment; + uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */ + bool dcc_retile_use_uint16; /* if all values fit into uint16_t */ + uint32_t dcc_retile_num_elements; + void *dcc_retile_map; }; struct radeon_surf { - /* Format properties. */ - unsigned blk_w:4; - unsigned blk_h:4; - unsigned bpe:5; - /* Number of mipmap levels where DCC is enabled starting from level 0. - * Non-zero levels may be disabled due to alignment constraints, but not - * the first level. - */ - unsigned num_dcc_levels:4; - unsigned is_linear:1; - unsigned has_stencil:1; - /* This might be true even if micro_tile_mode isn't displayable or rotated. */ - unsigned is_displayable:1; - /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */ - unsigned micro_tile_mode:3; - uint32_t flags; - - /* These are return values. Some of them can be set by the caller, but - * they will be treated as hints (e.g. bankw, bankh) and might be - * changed by the calculator. - */ - - /* Tile swizzle can be OR'd with low bits of the BASE_256B address. - * The value is the same for all mipmap levels. Supported tile modes: - * - GFX6: Only macro tiling. - * - GFX9: Only *_X and *_T swizzle modes. Level 0 must not be in the mip - * tail. - * - * Only these surfaces are allowed to set it: - * - color (if it doesn't have to be displayable) - * - DCC (same tile swizzle as color) - * - FMASK - * - CMASK if it's TC-compatible or if the gen is GFX9 - * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9 - */ - uint8_t tile_swizzle; - uint8_t fmask_tile_swizzle; - - uint64_t surf_size; - uint64_t fmask_size; - uint32_t surf_alignment; - uint32_t fmask_alignment; - - /* DCC and HTILE are very small. */ - uint32_t dcc_size; - uint32_t dcc_slice_size; - uint32_t dcc_alignment; - - uint32_t htile_size; - uint32_t htile_slice_size; - uint32_t htile_alignment; - - uint32_t cmask_size; - uint32_t cmask_slice_size; - uint32_t cmask_alignment; - - /* All buffers combined. 
*/ - uint64_t htile_offset; - uint64_t fmask_offset; - uint64_t cmask_offset; - uint64_t dcc_offset; - uint64_t display_dcc_offset; - uint64_t dcc_retile_map_offset; - uint64_t total_size; - uint32_t alignment; - - union { - /* Return values for GFX8 and older. - * - * Some of them can be set by the caller if certain parameters are - * desirable. The allocator will try to obey them. - */ - struct legacy_surf_layout legacy; - - /* GFX9+ return values. */ - struct gfx9_surf_layout gfx9; - } u; + /* Format properties. */ + unsigned blk_w : 4; + unsigned blk_h : 4; + unsigned bpe : 5; + /* Number of mipmap levels where DCC is enabled starting from level 0. + * Non-zero levels may be disabled due to alignment constraints, but not + * the first level. + */ + unsigned num_dcc_levels : 4; + unsigned is_linear : 1; + unsigned has_stencil : 1; + /* This might be true even if micro_tile_mode isn't displayable or rotated. */ + unsigned is_displayable : 1; + /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */ + unsigned micro_tile_mode : 3; + uint32_t flags; + + /* These are return values. Some of them can be set by the caller, but + * they will be treated as hints (e.g. bankw, bankh) and might be + * changed by the calculator. + */ + + /* Tile swizzle can be OR'd with low bits of the BASE_256B address. + * The value is the same for all mipmap levels. Supported tile modes: + * - GFX6: Only macro tiling. + * - GFX9: Only *_X and *_T swizzle modes. Level 0 must not be in the mip + * tail. + * + * Only these surfaces are allowed to set it: + * - color (if it doesn't have to be displayable) + * - DCC (same tile swizzle as color) + * - FMASK + * - CMASK if it's TC-compatible or if the gen is GFX9 + * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9 + */ + uint8_t tile_swizzle; + uint8_t fmask_tile_swizzle; + + uint64_t surf_size; + uint64_t fmask_size; + uint32_t surf_alignment; + uint32_t fmask_alignment; + + /* DCC and HTILE are very small. */ + uint32_t dcc_size; + uint32_t dcc_slice_size; + uint32_t dcc_alignment; + + uint32_t htile_size; + uint32_t htile_slice_size; + uint32_t htile_alignment; + + uint32_t cmask_size; + uint32_t cmask_slice_size; + uint32_t cmask_alignment; + + /* All buffers combined. */ + uint64_t htile_offset; + uint64_t fmask_offset; + uint64_t cmask_offset; + uint64_t dcc_offset; + uint64_t display_dcc_offset; + uint64_t dcc_retile_map_offset; + uint64_t total_size; + uint32_t alignment; + + union { + /* Return values for GFX8 and older. + * + * Some of them can be set by the caller if certain parameters are + * desirable. The allocator will try to obey them. + */ + struct legacy_surf_layout legacy; + + /* GFX9+ return values. */ + struct gfx9_surf_layout gfx9; + } u; }; struct ac_surf_info { - uint32_t width; - uint32_t height; - uint32_t depth; - uint8_t samples; /* For Z/S: samples; For color: FMASK coverage samples */ - uint8_t storage_samples; /* For color: allocated samples */ - uint8_t levels; - uint8_t num_channels; /* heuristic for displayability */ - uint16_t array_size; - uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */ - uint32_t *fmask_surf_index; + uint32_t width; + uint32_t height; + uint32_t depth; + uint8_t samples; /* For Z/S: samples; For color: FMASK coverage samples */ + uint8_t storage_samples; /* For color: allocated samples */ + uint8_t levels; + uint8_t num_channels; /* heuristic for displayability */ + uint16_t array_size; + uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. 
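
[Editor's note — sketch of how a driver consumes tile_swizzle per the
comment above; bo_va is hypothetical, and the assumption is that BASE_256B
addresses are (virtual address >> 8):]

   uint64_t base_256b = (bo_va + surf.u.gfx9.surf_offset) >> 8;
   base_256b |= surf.tile_swizzle; /* OR into the low bits of BASE_256B */
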
*/ + uint32_t *fmask_surf_index; }; struct ac_surf_config { - struct ac_surf_info info; - unsigned is_1d : 1; - unsigned is_3d : 1; - unsigned is_cube : 1; + struct ac_surf_info info; + unsigned is_1d : 1; + unsigned is_3d : 1; + unsigned is_cube : 1; }; struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info, - const struct amdgpu_gpu_info *amdinfo, - uint64_t *max_alignment); + const struct amdgpu_gpu_info *amdinfo, + uint64_t *max_alignment); void ac_addrlib_destroy(struct ac_addrlib *addrlib); int ac_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info, - const struct ac_surf_config * config, - enum radeon_surf_mode mode, - struct radeon_surf *surf); + const struct ac_surf_config *config, enum radeon_surf_mode mode, + struct radeon_surf *surf); void ac_surface_zero_dcc_fields(struct radeon_surf *surf); -void ac_surface_set_bo_metadata(const struct radeon_info *info, - struct radeon_surf *surf, uint64_t tiling_flags, - enum radeon_surf_mode *mode); -void ac_surface_get_bo_metadata(const struct radeon_info *info, - struct radeon_surf *surf, uint64_t *tiling_flags); - -bool ac_surface_set_umd_metadata(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_storage_samples, - unsigned num_mipmap_levels, - unsigned size_metadata, - uint32_t metadata[64]); -void ac_surface_get_umd_metadata(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_mipmap_levels, - uint32_t desc[8], +void ac_surface_set_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf, + uint64_t tiling_flags, enum radeon_surf_mode *mode); +void ac_surface_get_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf, + uint64_t *tiling_flags); + +bool ac_surface_set_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_storage_samples, unsigned num_mipmap_levels, + unsigned size_metadata, uint32_t metadata[64]); +void ac_surface_get_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_mipmap_levels, uint32_t desc[8], unsigned *size_metadata, uint32_t metadata[64]); -void ac_surface_override_offset_stride(const struct radeon_info *info, - struct radeon_surf *surf, - unsigned num_mipmap_levels, - uint64_t offset, unsigned pitch); +void ac_surface_override_offset_stride(const struct radeon_info *info, struct radeon_surf *surf, + unsigned num_mipmap_levels, uint64_t offset, unsigned pitch); #ifdef __cplusplus } diff --git a/src/amd/common/amd_family.h b/src/amd/common/amd_family.h index 485ae27..475cb83 100644 --- a/src/amd/common/amd_family.h +++ b/src/amd/common/amd_family.h @@ -24,117 +24,120 @@ #ifndef AMD_FAMILY_H #define AMD_FAMILY_H -enum radeon_family { - CHIP_UNKNOWN = 0, - CHIP_R300, /* R3xx-based cores. (GFX2) */ - CHIP_R350, - CHIP_RV350, - CHIP_RV370, - CHIP_RV380, - CHIP_RS400, - CHIP_RC410, - CHIP_RS480, - CHIP_R420, /* R4xx-based cores. (GFX2) */ - CHIP_R423, - CHIP_R430, - CHIP_R480, - CHIP_R481, - CHIP_RV410, - CHIP_RS600, - CHIP_RS690, - CHIP_RS740, - CHIP_RV515, /* R5xx-based cores. 
(GFX2) */ - CHIP_R520, - CHIP_RV530, - CHIP_R580, - CHIP_RV560, - CHIP_RV570, - CHIP_R600, /* GFX3 (R6xx) */ - CHIP_RV610, - CHIP_RV630, - CHIP_RV670, - CHIP_RV620, - CHIP_RV635, - CHIP_RS780, - CHIP_RS880, - CHIP_RV770, /* GFX3 (R7xx) */ - CHIP_RV730, - CHIP_RV710, - CHIP_RV740, - CHIP_CEDAR, /* GFX4 (Evergreen) */ - CHIP_REDWOOD, - CHIP_JUNIPER, - CHIP_CYPRESS, - CHIP_HEMLOCK, - CHIP_PALM, - CHIP_SUMO, - CHIP_SUMO2, - CHIP_BARTS, - CHIP_TURKS, - CHIP_CAICOS, - CHIP_CAYMAN, /* GFX5 (Northern Islands) */ - CHIP_ARUBA, - CHIP_TAHITI, /* GFX6 (Southern Islands) */ - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, /* GFX7 (Sea Islands) */ - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_TONGA, /* GFX8 (Volcanic Islands & Polaris) */ - CHIP_ICELAND, - CHIP_CARRIZO, - CHIP_FIJI, - CHIP_STONEY, - CHIP_POLARIS10, - CHIP_POLARIS11, - CHIP_POLARIS12, - CHIP_VEGAM, - CHIP_VEGA10, /* GFX9 (Vega) */ - CHIP_VEGA12, - CHIP_VEGA20, - CHIP_RAVEN, - CHIP_RAVEN2, - CHIP_RENOIR, - CHIP_ARCTURUS, - CHIP_NAVI10, - CHIP_NAVI12, - CHIP_NAVI14, - CHIP_SIENNA_CICHLID, - CHIP_NAVY_FLOUNDER, - CHIP_LAST, +enum radeon_family +{ + CHIP_UNKNOWN = 0, + CHIP_R300, /* R3xx-based cores. (GFX2) */ + CHIP_R350, + CHIP_RV350, + CHIP_RV370, + CHIP_RV380, + CHIP_RS400, + CHIP_RC410, + CHIP_RS480, + CHIP_R420, /* R4xx-based cores. (GFX2) */ + CHIP_R423, + CHIP_R430, + CHIP_R480, + CHIP_R481, + CHIP_RV410, + CHIP_RS600, + CHIP_RS690, + CHIP_RS740, + CHIP_RV515, /* R5xx-based cores. (GFX2) */ + CHIP_R520, + CHIP_RV530, + CHIP_R580, + CHIP_RV560, + CHIP_RV570, + CHIP_R600, /* GFX3 (R6xx) */ + CHIP_RV610, + CHIP_RV630, + CHIP_RV670, + CHIP_RV620, + CHIP_RV635, + CHIP_RS780, + CHIP_RS880, + CHIP_RV770, /* GFX3 (R7xx) */ + CHIP_RV730, + CHIP_RV710, + CHIP_RV740, + CHIP_CEDAR, /* GFX4 (Evergreen) */ + CHIP_REDWOOD, + CHIP_JUNIPER, + CHIP_CYPRESS, + CHIP_HEMLOCK, + CHIP_PALM, + CHIP_SUMO, + CHIP_SUMO2, + CHIP_BARTS, + CHIP_TURKS, + CHIP_CAICOS, + CHIP_CAYMAN, /* GFX5 (Northern Islands) */ + CHIP_ARUBA, + CHIP_TAHITI, /* GFX6 (Southern Islands) */ + CHIP_PITCAIRN, + CHIP_VERDE, + CHIP_OLAND, + CHIP_HAINAN, + CHIP_BONAIRE, /* GFX7 (Sea Islands) */ + CHIP_KAVERI, + CHIP_KABINI, + CHIP_HAWAII, + CHIP_TONGA, /* GFX8 (Volcanic Islands & Polaris) */ + CHIP_ICELAND, + CHIP_CARRIZO, + CHIP_FIJI, + CHIP_STONEY, + CHIP_POLARIS10, + CHIP_POLARIS11, + CHIP_POLARIS12, + CHIP_VEGAM, + CHIP_VEGA10, /* GFX9 (Vega) */ + CHIP_VEGA12, + CHIP_VEGA20, + CHIP_RAVEN, + CHIP_RAVEN2, + CHIP_RENOIR, + CHIP_ARCTURUS, + CHIP_NAVI10, + CHIP_NAVI12, + CHIP_NAVI14, + CHIP_SIENNA_CICHLID, + CHIP_NAVY_FLOUNDER, + CHIP_LAST, }; -enum chip_class { - CLASS_UNKNOWN = 0, - R300, - R400, - R500, - R600, - R700, - EVERGREEN, - CAYMAN, - GFX6, - GFX7, - GFX8, - GFX9, - GFX10, - GFX10_3, +enum chip_class +{ + CLASS_UNKNOWN = 0, + R300, + R400, + R500, + R600, + R700, + EVERGREEN, + CAYMAN, + GFX6, + GFX7, + GFX8, + GFX9, + GFX10, + GFX10_3, }; -enum ring_type { - RING_GFX = 0, - RING_COMPUTE, - RING_DMA, - RING_UVD, - RING_VCE, - RING_UVD_ENC, - RING_VCN_DEC, - RING_VCN_ENC, - RING_VCN_JPEG, - NUM_RING_TYPES, +enum ring_type +{ + RING_GFX = 0, + RING_COMPUTE, + RING_DMA, + RING_UVD, + RING_VCE, + RING_UVD_ENC, + RING_VCN_DEC, + RING_VCN_ENC, + RING_VCN_JPEG, + NUM_RING_TYPES, }; #endif diff --git a/src/amd/common/amd_kernel_code_t.h b/src/amd/common/amd_kernel_code_t.h index f8e9508..7a5c2ea 100644 --- a/src/amd/common/amd_kernel_code_t.h +++ b/src/amd/common/amd_kernel_code_t.h @@ -30,13 +30,12 @@ 
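
[Editor's note — chip_class is an ordered enum, which is what makes the
range checks used throughout this patch valid, e.g. from ac_surface.c
above:]

   if (info->chip_class >= GFX9)
      r = gfx9_compute_surface(addrlib, info, config, mode, surf);
   else
      r = gfx6_compute_surface(addrlib->handle, info, config, mode, surf);
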
//---------------------------------------------------------------------------// // Sets val bits for specified mask in specified dst packed instance. -#define AMD_HSA_BITS_SET(dst, mask, val) \ - dst &= (~(1 << mask ## _SHIFT) & ~mask); \ - dst |= (((val) << mask ## _SHIFT) & mask) +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask##_SHIFT) & ~mask); \ + dst |= (((val) << mask##_SHIFT) & mask) // Gets bits for specified mask from specified src packed instance. -#define AMD_HSA_BITS_GET(src, mask) \ - ((src & mask) >> mask ## _SHIFT) +#define AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask##_SHIFT) /* Every amd_*_code_t has the following properties, which are composed of * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), @@ -47,132 +46,164 @@ * implementation defined in the C standard and so cannot be used to * specify an ABI) */ -enum amd_code_property_mask_t { - - /* Enable the setup of the SGPR user data registers - * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t - * for initial register state. - * - * The total number of SGPRuser data registers requested must not - * exceed 16. Any requests beyond 16 will be ignored. - * - * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of - * SGPR user data registers enabled up to 16). - */ - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, - 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - - AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, - AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, - AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, - - /* Control wave ID base counter for GDS ordered-append. Used to set - * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if - * ORDERED_APPEND_MODE also needs to be settable) - */ - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, - - /* The interleave (swizzle) element size in bytes required by the - * code for private memory. This must be 2, 4, 8 or 16. This value - * is provided to the finalizer when it is invoked and is recorded - * here. The hardware will interleave the memory requests of each - * lane of a wavefront by this element size to ensure each - * work-item gets a distinct memory memory location. Therefore, the - * finalizer ensures that all load and store operations done to - * private memory do not exceed this size. For example, if the - * element size is 4 (32-bits or dword) and a 64-bit value must be - * loaded, the finalizer will generate two 32-bit loads. This - * ensures that the interleaving will get the work-item - * specific dword for both halves of the 64-bit value. If it just - * did a 64-bit load then it would get one dword which belonged to - * its own work-item, but the second dword would belong to the - * adjacent lane work-item since the interleaving is in dwords. - * - * The value used must match the value that the runtime configures - * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This - * is generally DWORD. - * - * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. - */ - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, - - /* Are global memory addresses 64 bits. Must match - * amd_kernel_code_t.hsail_machine_model == - * HSA_MACHINE_LARGE. Must also match - * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), - * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - */ - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19, - AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, - AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, - - /* Indicate if the generated ISA is using a dynamically sized call - * stack. 
This can happen if calls are implemented using a call - * stack and recursion, alloca or calls to indirect functions are - * present. In these cases the Finalizer cannot compute the total - * private segment size at compile time. In this case the - * workitem_private_segment_byte_size only specifies the statically - * know private segment size, and additional space must be added - * for the call stack. - */ - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, - - /* Indicate if code generated has support for debugging. */ - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, - - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, - - AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23, - AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, - AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT +enum amd_code_property_mask_t +{ + + /* Enable the setup of the SGPR user data registers + * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + * for initial register state. + * + * The total number of SGPRuser data registers requested must not + * exceed 16. Any requests beyond 16 will be ignored. + * + * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + * SGPR user data registers enabled up to 16). 
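
[Editor's note — sketch of deriving COMPUTE_PGM_RSRC2.USER_SGPR from these
enable bits; the per-feature SGPR widths (4 for the buffer descriptor,
2 per 64-bit pointer) are an assumption from the HSA ABI, not from this
header:]

   unsigned user_sgprs = 0;
   if (code_properties & AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)
      user_sgprs += 4; /* buffer descriptor = 4 SGPRs */
   if (code_properties & AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)
      user_sgprs += 2; /* 64-bit pointer = 2 SGPRs */
   if (code_properties & AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)
      user_sgprs += 2;
   assert(user_sgprs <= 16); /* requests beyond 16 are ignored */
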
+ */ + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = + ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, + AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, + AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) + << AMD_CODE_PROPERTY_RESERVED1_SHIFT, + + /* Control wave ID base counter for GDS ordered-append. Used to set + * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if + * ORDERED_APPEND_MODE also needs to be settable) + */ + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = + ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) + << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /* The interleave (swizzle) element size in bytes required by the + * code for private memory. This must be 2, 4, 8 or 16. This value + * is provided to the finalizer when it is invoked and is recorded + * here. The hardware will interleave the memory requests of each + * lane of a wavefront by this element size to ensure each + * work-item gets a distinct memory memory location. Therefore, the + * finalizer ensures that all load and store operations done to + * private memory do not exceed this size. For example, if the + * element size is 4 (32-bits or dword) and a 64-bit value must be + * loaded, the finalizer will generate two 32-bit loads. This + * ensures that the interleaving will get the work-item + * specific dword for both halves of the 64-bit value. If it just + * did a 64-bit load then it would get one dword which belonged to + * its own work-item, but the second dword would belong to the + * adjacent lane work-item since the interleaving is in dwords. + * + * The value used must match the value that the runtime configures + * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + * is generally DWORD. + * + * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. + */ + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = + ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) + << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /* Are global memory addresses 64 bits. Must match + * amd_kernel_code_t.hsail_machine_model == + * HSA_MACHINE_LARGE. Must also match + * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + */ + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) + << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /* Indicate if the generated ISA is using a dynamically sized call + * stack. This can happen if calls are implemented using a call + * stack and recursion, alloca or calls to indirect functions are + * present. In these cases the Finalizer cannot compute the total + * private segment size at compile time. In this case the + * workitem_private_segment_byte_size only specifies the statically + * know private segment size, and additional space must be added + * for the call stack. + */ + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = + ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) + << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /* Indicate if code generated has support for debugging. 
*/ + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) + << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, + + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) + << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, + + AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23, + AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, + AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) + << AMD_CODE_PROPERTY_RESERVED2_SHIFT }; /* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel @@ -381,154 +412,154 @@ enum amd_code_property_mask_t { */ typedef struct amd_kernel_code_s { - uint32_t amd_kernel_code_version_major; - uint32_t amd_kernel_code_version_minor; - uint16_t amd_machine_kind; - uint16_t amd_machine_version_major; - uint16_t amd_machine_version_minor; - uint16_t amd_machine_version_stepping; - - /* Byte offset (possibly negative) from start of amd_kernel_code_t - * object to kernel's entry point instruction. The actual code for - * the kernel is required to be 256 byte aligned to match hardware - * requirements (SQ cache line is 16). The code must be position - * independent code (PIC) for AMD devices to give runtime the - * option of copying code to discrete GPU memory or APU L2 - * cache. The Finalizer should endeavour to allocate all kernel - * machine code in contiguous memory pages so that a device - * pre-fetcher will tend to only pre-fetch Kernel Code objects, - * improving cache performance. - */ - int64_t kernel_code_entry_byte_offset; - - /* Range of bytes to consider prefetching expressed as an offset - * and size. The offset is from the start (possibly negative) of - * amd_kernel_code_t object. Set both to 0 if no prefetch - * information is available. - */ - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - - /* Number of bytes of scratch backing memory required for full - * occupancy of target chip. This takes into account the number of - * bytes of scratch per work-item, the wavefront size, the maximum - * number of wavefronts per CU, and the number of CUs. This is an - * upper limit on scratch. If the grid being dispatched is small it - * may only need less than this. If the kernel uses no scratch, or - * the Finalizer has not computed this value, it must be 0. - */ - uint64_t max_scratch_backing_memory_byte_size; - - /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and - * COMPUTE_PGM_RSRC2 registers. - */ - uint64_t compute_pgm_resource_registers; - - /* Code properties. See amd_code_property_mask_t for a full list of - * properties. - */ - uint32_t code_properties; - - /* The amount of memory required for the combined private, spill - * and arg segments for a work-item in bytes. If - * is_dynamic_callstack is 1 then additional space must be added to - * this value for the call stack. - */ - uint32_t workitem_private_segment_byte_size; - - /* The amount of group segment memory required by a work-group in - * bytes. This does not include any dynamically allocated group - * segment memory that may be added when the kernel is - * dispatched. - */ - uint32_t workgroup_group_segment_byte_size; - - /* Number of byte of GDS required by kernel dispatch. Must be 0 if - * not using GDS. 
- */ - uint32_t gds_segment_byte_size; - - /* The size in bytes of the kernarg segment that holds the values - * of the arguments to the kernel. This could be used by CP to - * prefetch the kernarg segment pointed to by the dispatch packet. - */ - uint64_t kernarg_segment_byte_size; - - /* Number of fbarrier's used in the kernel and all functions it - * calls. If the implementation uses group memory to allocate the - * fbarriers then that amount must already be included in the - * workgroup_group_segment_byte_size total. - */ - uint32_t workgroup_fbarrier_count; - - /* Number of scalar registers used by a wavefront. This includes - * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size - * and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a - * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. - */ - uint16_t wavefront_sgpr_count; - - /* Number of vector registers used by each work-item. Used to set - * COMPUTE_PGM_RSRC1.VGPRS. - */ - uint16_t workitem_vgpr_count; - - /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the - * first fixed VGPR number reserved. - */ - uint16_t reserved_vgpr_first; - - /* The number of consecutive VGPRs reserved by the client. If - * is_debug_supported then this count includes VGPRs reserved - * for debugger use. - */ - uint16_t reserved_vgpr_count; - - /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the - * first fixed SGPR number reserved. - */ - uint16_t reserved_sgpr_first; - - /* The number of consecutive SGPRs reserved by the client. If - * is_debug_supported then this count includes SGPRs reserved - * for debugger use. - */ - uint16_t reserved_sgpr_count; - - /* If is_debug_supported is 0 then must be 0. Otherwise, this is the - * fixed SGPR number used to hold the wave scratch offset for the - * entire kernel execution, or uint16_t(-1) if the register is not - * used or not known. - */ - uint16_t debug_wavefront_private_segment_offset_sgpr; - - /* If is_debug_supported is 0 then must be 0. Otherwise, this is the - * fixed SGPR number of the first of 4 SGPRs used to hold the - * scratch V# used for the entire kernel execution, or uint16_t(-1) - * if the registers are not used or not known. - */ - uint16_t debug_private_segment_buffer_sgpr; - - /* The maximum byte alignment of variables used by the kernel in - * the specified memory segment. Expressed as a power of two. Must - * be at least HSA_POWERTWO_16. - */ - uint8_t kernarg_segment_alignment; - uint8_t group_segment_alignment; - uint8_t private_segment_alignment; - - /* Wavefront size expressed as a power of two. Must be a power of 2 - * in range 1..64 inclusive. Used to support runtime query that - * obtains wavefront size, which may be used by application to - * allocated dynamic group memory and set the dispatch work-group - * size. - */ - uint8_t wavefront_size; - - int32_t call_convention; - uint8_t reserved3[12]; - uint64_t runtime_loader_kernel_symbol; - uint64_t control_directives[16]; + uint32_t amd_kernel_code_version_major; + uint32_t amd_kernel_code_version_minor; + uint16_t amd_machine_kind; + uint16_t amd_machine_version_major; + uint16_t amd_machine_version_minor; + uint16_t amd_machine_version_stepping; + + /* Byte offset (possibly negative) from start of amd_kernel_code_t + * object to kernel's entry point instruction. The actual code for + * the kernel is required to be 256 byte aligned to match hardware + * requirements (SQ cache line is 16). 
The code must be position + * independent code (PIC) for AMD devices to give runtime the + * option of copying code to discrete GPU memory or APU L2 + * cache. The Finalizer should endeavour to allocate all kernel + * machine code in contiguous memory pages so that a device + * pre-fetcher will tend to only pre-fetch Kernel Code objects, + * improving cache performance. + */ + int64_t kernel_code_entry_byte_offset; + + /* Range of bytes to consider prefetching expressed as an offset + * and size. The offset is from the start (possibly negative) of + * amd_kernel_code_t object. Set both to 0 if no prefetch + * information is available. + */ + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /* Number of bytes of scratch backing memory required for full + * occupancy of target chip. This takes into account the number of + * bytes of scratch per work-item, the wavefront size, the maximum + * number of wavefronts per CU, and the number of CUs. This is an + * upper limit on scratch. If the grid being dispatched is small it + * may only need less than this. If the kernel uses no scratch, or + * the Finalizer has not computed this value, it must be 0. + */ + uint64_t max_scratch_backing_memory_byte_size; + + /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + * COMPUTE_PGM_RSRC2 registers. + */ + uint64_t compute_pgm_resource_registers; + + /* Code properties. See amd_code_property_mask_t for a full list of + * properties. + */ + uint32_t code_properties; + + /* The amount of memory required for the combined private, spill + * and arg segments for a work-item in bytes. If + * is_dynamic_callstack is 1 then additional space must be added to + * this value for the call stack. + */ + uint32_t workitem_private_segment_byte_size; + + /* The amount of group segment memory required by a work-group in + * bytes. This does not include any dynamically allocated group + * segment memory that may be added when the kernel is + * dispatched. + */ + uint32_t workgroup_group_segment_byte_size; + + /* Number of byte of GDS required by kernel dispatch. Must be 0 if + * not using GDS. + */ + uint32_t gds_segment_byte_size; + + /* The size in bytes of the kernarg segment that holds the values + * of the arguments to the kernel. This could be used by CP to + * prefetch the kernarg segment pointed to by the dispatch packet. + */ + uint64_t kernarg_segment_byte_size; + + /* Number of fbarrier's used in the kernel and all functions it + * calls. If the implementation uses group memory to allocate the + * fbarriers then that amount must already be included in the + * workgroup_group_segment_byte_size total. + */ + uint32_t workgroup_fbarrier_count; + + /* Number of scalar registers used by a wavefront. This includes + * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size + * and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a + * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. + */ + uint16_t wavefront_sgpr_count; + + /* Number of vector registers used by each work-item. Used to set + * COMPUTE_PGM_RSRC1.VGPRS. + */ + uint16_t workitem_vgpr_count; + + /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the + * first fixed VGPR number reserved. + */ + uint16_t reserved_vgpr_first; + + /* The number of consecutive VGPRs reserved by the client. If + * is_debug_supported then this count includes VGPRs reserved + * for debugger use. 
+ */ + uint16_t reserved_vgpr_count; + + /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the + * first fixed SGPR number reserved. + */ + uint16_t reserved_sgpr_first; + + /* The number of consecutive SGPRs reserved by the client. If + * is_debug_supported then this count includes SGPRs reserved + * for debugger use. + */ + uint16_t reserved_sgpr_count; + + /* If is_debug_supported is 0 then must be 0. Otherwise, this is the + * fixed SGPR number used to hold the wave scratch offset for the + * entire kernel execution, or uint16_t(-1) if the register is not + * used or not known. + */ + uint16_t debug_wavefront_private_segment_offset_sgpr; + + /* If is_debug_supported is 0 then must be 0. Otherwise, this is the + * fixed SGPR number of the first of 4 SGPRs used to hold the + * scratch V# used for the entire kernel execution, or uint16_t(-1) + * if the registers are not used or not known. + */ + uint16_t debug_private_segment_buffer_sgpr; + + /* The maximum byte alignment of variables used by the kernel in + * the specified memory segment. Expressed as a power of two. Must + * be at least HSA_POWERTWO_16. + */ + uint8_t kernarg_segment_alignment; + uint8_t group_segment_alignment; + uint8_t private_segment_alignment; + + /* Wavefront size expressed as a power of two. Must be a power of 2 + * in range 1..64 inclusive. Used to support runtime query that + * obtains wavefront size, which may be used by application to + * allocated dynamic group memory and set the dispatch work-group + * size. + */ + uint8_t wavefront_size; + + int32_t call_convention; + uint8_t reserved3[12]; + uint64_t runtime_loader_kernel_symbol; + uint64_t control_directives[16]; } amd_kernel_code_t; #endif // AMDKERNELCODET_H diff --git a/src/amd/common/gfx10_format_table.h b/src/amd/common/gfx10_format_table.h index 2692c81..9eff122 100644 --- a/src/amd/common/gfx10_format_table.h +++ b/src/amd/common/gfx10_format_table.h @@ -27,16 +27,17 @@ #ifndef GFX10_FORMAT_TABLE_H #define GFX10_FORMAT_TABLE_H -#include #include "pipe/p_format.h" +#include + struct gfx10_format { - unsigned img_format:9; + unsigned img_format : 9; - /* Various formats are only supported with workarounds for vertex fetch, - * and some 32_32_32 formats are supported natively, but only for buffers - * (possibly with some image support, actually, but no filtering). */ - bool buffers_only:1; + /* Various formats are only supported with workarounds for vertex fetch, + * and some 32_32_32 formats are supported natively, but only for buffers + * (possibly with some image support, actually, but no filtering). 
*/ + bool buffers_only : 1; }; extern const struct gfx10_format gfx10_format_table[PIPE_FORMAT_COUNT]; diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h index c447438..a55598c 100644 --- a/src/amd/common/sid.h +++ b/src/amd/common/sid.h @@ -27,227 +27,227 @@ #include "amdgfxregs.h" /* si values */ -#define SI_CONFIG_REG_OFFSET 0x00008000 -#define SI_CONFIG_REG_END 0x0000B000 -#define SI_SH_REG_OFFSET 0x0000B000 -#define SI_SH_REG_END 0x0000C000 -#define SI_CONTEXT_REG_OFFSET 0x00028000 -#define SI_CONTEXT_REG_END 0x00030000 -#define CIK_UCONFIG_REG_OFFSET 0x00030000 -#define CIK_UCONFIG_REG_END 0x00040000 -#define SI_UCONFIG_PERF_REG_OFFSET 0x00034000 -#define SI_UCONFIG_PERF_REG_END 0x00038000 +#define SI_CONFIG_REG_OFFSET 0x00008000 +#define SI_CONFIG_REG_END 0x0000B000 +#define SI_SH_REG_OFFSET 0x0000B000 +#define SI_SH_REG_END 0x0000C000 +#define SI_CONTEXT_REG_OFFSET 0x00028000 +#define SI_CONTEXT_REG_END 0x00030000 +#define CIK_UCONFIG_REG_OFFSET 0x00030000 +#define CIK_UCONFIG_REG_END 0x00040000 +#define SI_UCONFIG_PERF_REG_OFFSET 0x00034000 +#define SI_UCONFIG_PERF_REG_END 0x00038000 /* For register shadowing: */ -#define SI_SH_REG_SPACE_SIZE (SI_SH_REG_END - SI_SH_REG_OFFSET) -#define SI_CONTEXT_REG_SPACE_SIZE (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET) -#define SI_UCONFIG_REG_SPACE_SIZE (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET) -#define SI_UCONFIG_PERF_REG_SPACE_SIZE (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET) +#define SI_SH_REG_SPACE_SIZE (SI_SH_REG_END - SI_SH_REG_OFFSET) +#define SI_CONTEXT_REG_SPACE_SIZE (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET) +#define SI_UCONFIG_REG_SPACE_SIZE (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET) +#define SI_UCONFIG_PERF_REG_SPACE_SIZE (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET) -#define SI_SHADOWED_SH_REG_OFFSET 0 -#define SI_SHADOWED_CONTEXT_REG_OFFSET SI_SH_REG_SPACE_SIZE -#define SI_SHADOWED_UCONFIG_REG_OFFSET (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE) -#define SI_SHADOWED_REG_BUFFER_SIZE (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + \ - SI_UCONFIG_REG_SPACE_SIZE) +#define SI_SHADOWED_SH_REG_OFFSET 0 +#define SI_SHADOWED_CONTEXT_REG_OFFSET SI_SH_REG_SPACE_SIZE +#define SI_SHADOWED_UCONFIG_REG_OFFSET (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE) +#define SI_SHADOWED_REG_BUFFER_SIZE \ + (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + SI_UCONFIG_REG_SPACE_SIZE) #define EVENT_TYPE_CACHE_FLUSH 0x6 -#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 +#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 -#define EVENT_TYPE_ZPASS_DONE 0x15 -#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 -#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f -#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 -#define EVENT_TYPE(x) ((x) << 0) -#define EVENT_INDEX(x) ((x) << 8) - /* 0 - any non-TS event - * 1 - ZPASS_DONE - * 2 - SAMPLE_PIPELINESTAT - * 3 - SAMPLE_STREAMOUTSTAT* - * 4 - *S_PARTIAL_FLUSH - * 5 - TS events - */ +#define EVENT_TYPE_ZPASS_DONE 0x15 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 +#define EVENT_TYPE(x) ((x) << 0) +#define EVENT_INDEX(x) ((x) << 8) +/* 0 - any non-TS event + * 1 - ZPASS_DONE + * 2 - SAMPLE_PIPELINESTAT + * 3 - SAMPLE_STREAMOUTSTAT* + * 4 - *S_PARTIAL_FLUSH + * 5 - TS events + */ /* EVENT_WRITE_EOP (SI-VI) & RELEASE_MEM (GFX9) */ -#define EVENT_TCL1_VOL_ACTION_ENA (1 << 12) -#define EVENT_TC_VOL_ACTION_ENA (1 << 13) -#define EVENT_TC_WB_ACTION_ENA (1 
<< 15) -#define EVENT_TCL1_ACTION_ENA (1 << 16) -#define EVENT_TC_ACTION_ENA (1 << 17) -#define EVENT_TC_NC_ACTION_ENA (1 << 19) /* GFX9+ */ -#define EVENT_TC_WC_ACTION_ENA (1 << 20) /* GFX9+ */ -#define EVENT_TC_MD_ACTION_ENA (1 << 21) /* GFX9+ */ - +#define EVENT_TCL1_VOL_ACTION_ENA (1 << 12) +#define EVENT_TC_VOL_ACTION_ENA (1 << 13) +#define EVENT_TC_WB_ACTION_ENA (1 << 15) +#define EVENT_TCL1_ACTION_ENA (1 << 16) +#define EVENT_TC_ACTION_ENA (1 << 17) +#define EVENT_TC_NC_ACTION_ENA (1 << 19) /* GFX9+ */ +#define EVENT_TC_WC_ACTION_ENA (1 << 20) /* GFX9+ */ +#define EVENT_TC_MD_ACTION_ENA (1 << 21) /* GFX9+ */ -#define PREDICATION_OP_CLEAR 0x0 -#define PREDICATION_OP_ZPASS 0x1 +#define PREDICATION_OP_CLEAR 0x0 +#define PREDICATION_OP_ZPASS 0x1 #define PREDICATION_OP_PRIMCOUNT 0x2 -#define PREDICATION_OP_BOOL64 0x3 +#define PREDICATION_OP_BOOL64 0x3 #define PRED_OP(x) ((x) << 16) #define PREDICATION_CONTINUE (1 << 31) -#define PREDICATION_HINT_WAIT (0 << 12) +#define PREDICATION_HINT_WAIT (0 << 12) #define PREDICATION_HINT_NOWAIT_DRAW (1 << 12) #define PREDICATION_DRAW_NOT_VISIBLE (0 << 8) -#define PREDICATION_DRAW_VISIBLE (1 << 8) +#define PREDICATION_DRAW_VISIBLE (1 << 8) -#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7 +#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7 /* All registers defined in this packet section don't exist and the only * purpose of these definitions is to define packet encoding that * the IB parser understands, and also to have an accurate documentation. */ -#define PKT3_NOP 0x10 -#define PKT3_SET_BASE 0x11 -#define PKT3_CLEAR_STATE 0x12 -#define PKT3_INDEX_BUFFER_SIZE 0x13 -#define PKT3_DISPATCH_DIRECT 0x15 -#define PKT3_DISPATCH_INDIRECT 0x16 -#define PKT3_OCCLUSION_QUERY 0x1F /* new for CIK */ -#define PKT3_SET_PREDICATION 0x20 -#define PKT3_COND_EXEC 0x22 -#define PKT3_PRED_EXEC 0x23 -#define PKT3_DRAW_INDIRECT 0x24 -#define PKT3_DRAW_INDEX_INDIRECT 0x25 -#define PKT3_INDEX_BASE 0x26 -#define PKT3_DRAW_INDEX_2 0x27 -#define PKT3_CONTEXT_CONTROL 0x28 -#define CC0_LOAD_GLOBAL_CONFIG(x) (((unsigned)(x) & 0x1) << 0) -#define CC0_LOAD_PER_CONTEXT_STATE(x) (((unsigned)(x) & 0x1) << 1) -#define CC0_LOAD_GLOBAL_UCONFIG(x) (((unsigned)(x) & 0x1) << 15) -#define CC0_LOAD_GFX_SH_REGS(x) (((unsigned)(x) & 0x1) << 16) -#define CC0_LOAD_CS_SH_REGS(x) (((unsigned)(x) & 0x1) << 24) -#define CC0_LOAD_CE_RAM(x) (((unsigned)(x) & 0x1) << 28) -#define CC0_UPDATE_LOAD_ENABLES(x) (((unsigned)(x) & 0x1) << 31) -#define CC1_SHADOW_GLOBAL_CONFIG(x) (((unsigned)(x) & 0x1) << 0) -#define CC1_SHADOW_PER_CONTEXT_STATE(x) (((unsigned)(x) & 0x1) << 1) -#define CC1_SHADOW_GLOBAL_UCONFIG(x) (((unsigned)(x) & 0x1) << 15) -#define CC1_SHADOW_GFX_SH_REGS(x) (((unsigned)(x) & 0x1) << 16) -#define CC1_SHADOW_CS_SH_REGS(x) (((unsigned)(x) & 0x1) << 24) -#define CC1_UPDATE_SHADOW_ENABLES(x) (((unsigned)(x) & 0x1) << 31) -#define PKT3_INDEX_TYPE 0x2A /* not on GFX9 */ -#define PKT3_DRAW_INDIRECT_MULTI 0x2C -#define R_2C3_DRAW_INDEX_LOC 0x2C3 -#define S_2C3_COUNT_INDIRECT_ENABLE(x) (((unsigned)(x) & 0x1) << 30) -#define S_2C3_DRAW_INDEX_ENABLE(x) (((unsigned)(x) & 0x1) << 31) -#define PKT3_DRAW_INDEX_AUTO 0x2D -#define PKT3_DRAW_INDEX_IMMD 0x2E /* not on CIK */ -#define PKT3_NUM_INSTANCES 0x2F -#define PKT3_DRAW_INDEX_MULTI_AUTO 0x30 -#define PKT3_INDIRECT_BUFFER_SI 0x32 /* not on CIK */ -#define PKT3_INDIRECT_BUFFER_CONST 0x33 -#define PKT3_STRMOUT_BUFFER_UPDATE 0x34 -#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1 -#define STRMOUT_OFFSET_SOURCE(x) (((unsigned)(x) & 0x3) << 1) -#define STRMOUT_OFFSET_FROM_PACKET 
0 -#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1 -#define STRMOUT_OFFSET_FROM_MEM 2 -#define STRMOUT_OFFSET_NONE 3 -#define STRMOUT_DATA_TYPE(x) (((unsigned)(x) & 0x1) << 7) -#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8) -#define PKT3_DRAW_INDEX_OFFSET_2 0x35 -#define PKT3_WRITE_DATA 0x37 -#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38 -#define PKT3_MEM_SEMAPHORE 0x39 -#define PKT3_MPEG_INDEX 0x3A /* not on CIK */ -#define PKT3_WAIT_REG_MEM 0x3C -#define WAIT_REG_MEM_EQUAL 3 -#define WAIT_REG_MEM_NOT_EQUAL 4 -#define WAIT_REG_MEM_GREATER_OR_EQUAL 5 -#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4) -#define WAIT_REG_MEM_PFP (1 << 8) -#define PKT3_MEM_WRITE 0x3D /* not on CIK */ -#define PKT3_INDIRECT_BUFFER_CIK 0x3F /* new on CIK */ +#define PKT3_NOP 0x10 +#define PKT3_SET_BASE 0x11 +#define PKT3_CLEAR_STATE 0x12 +#define PKT3_INDEX_BUFFER_SIZE 0x13 +#define PKT3_DISPATCH_DIRECT 0x15 +#define PKT3_DISPATCH_INDIRECT 0x16 +#define PKT3_OCCLUSION_QUERY 0x1F /* new for CIK */ +#define PKT3_SET_PREDICATION 0x20 +#define PKT3_COND_EXEC 0x22 +#define PKT3_PRED_EXEC 0x23 +#define PKT3_DRAW_INDIRECT 0x24 +#define PKT3_DRAW_INDEX_INDIRECT 0x25 +#define PKT3_INDEX_BASE 0x26 +#define PKT3_DRAW_INDEX_2 0x27 +#define PKT3_CONTEXT_CONTROL 0x28 +#define CC0_LOAD_GLOBAL_CONFIG(x) (((unsigned)(x)&0x1) << 0) +#define CC0_LOAD_PER_CONTEXT_STATE(x) (((unsigned)(x)&0x1) << 1) +#define CC0_LOAD_GLOBAL_UCONFIG(x) (((unsigned)(x)&0x1) << 15) +#define CC0_LOAD_GFX_SH_REGS(x) (((unsigned)(x)&0x1) << 16) +#define CC0_LOAD_CS_SH_REGS(x) (((unsigned)(x)&0x1) << 24) +#define CC0_LOAD_CE_RAM(x) (((unsigned)(x)&0x1) << 28) +#define CC0_UPDATE_LOAD_ENABLES(x) (((unsigned)(x)&0x1) << 31) +#define CC1_SHADOW_GLOBAL_CONFIG(x) (((unsigned)(x)&0x1) << 0) +#define CC1_SHADOW_PER_CONTEXT_STATE(x) (((unsigned)(x)&0x1) << 1) +#define CC1_SHADOW_GLOBAL_UCONFIG(x) (((unsigned)(x)&0x1) << 15) +#define CC1_SHADOW_GFX_SH_REGS(x) (((unsigned)(x)&0x1) << 16) +#define CC1_SHADOW_CS_SH_REGS(x) (((unsigned)(x)&0x1) << 24) +#define CC1_UPDATE_SHADOW_ENABLES(x) (((unsigned)(x)&0x1) << 31) +#define PKT3_INDEX_TYPE 0x2A /* not on GFX9 */ +#define PKT3_DRAW_INDIRECT_MULTI 0x2C +#define R_2C3_DRAW_INDEX_LOC 0x2C3 +#define S_2C3_COUNT_INDIRECT_ENABLE(x) (((unsigned)(x)&0x1) << 30) +#define S_2C3_DRAW_INDEX_ENABLE(x) (((unsigned)(x)&0x1) << 31) +#define PKT3_DRAW_INDEX_AUTO 0x2D +#define PKT3_DRAW_INDEX_IMMD 0x2E /* not on CIK */ +#define PKT3_NUM_INSTANCES 0x2F +#define PKT3_DRAW_INDEX_MULTI_AUTO 0x30 +#define PKT3_INDIRECT_BUFFER_SI 0x32 /* not on CIK */ +#define PKT3_INDIRECT_BUFFER_CONST 0x33 +#define PKT3_STRMOUT_BUFFER_UPDATE 0x34 +#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1 +#define STRMOUT_OFFSET_SOURCE(x) (((unsigned)(x)&0x3) << 1) +#define STRMOUT_OFFSET_FROM_PACKET 0 +#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1 +#define STRMOUT_OFFSET_FROM_MEM 2 +#define STRMOUT_OFFSET_NONE 3 +#define STRMOUT_DATA_TYPE(x) (((unsigned)(x)&0x1) << 7) +#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8) +#define PKT3_DRAW_INDEX_OFFSET_2 0x35 +#define PKT3_WRITE_DATA 0x37 +#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38 +#define PKT3_MEM_SEMAPHORE 0x39 +#define PKT3_MPEG_INDEX 0x3A /* not on CIK */ +#define PKT3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_EQUAL 3 +#define WAIT_REG_MEM_NOT_EQUAL 4 +#define WAIT_REG_MEM_GREATER_OR_EQUAL 5 +#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x)&0x3) << 4) +#define WAIT_REG_MEM_PFP (1 << 8) +#define PKT3_MEM_WRITE 0x3D /* not on CIK */ +#define PKT3_INDIRECT_BUFFER_CIK 0x3F /* new on CIK */ 
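/* Illustrative sketch, not part of the patch: the WAIT_REG_MEM helpers
 * reformatted just above are typically combined with the PKT3() header
 * macro (defined further down in sid.h) to build a packet that stalls
 * the CP until a memory dword matches a reference value. The payload
 * layout below (6 dwords, count 5) follows the usual PM4 convention;
 * "buf" is a hypothetical command-buffer cursor and the poll interval
 * of 4 is only an example value. Assumes <stdint.h> and sid.h.
 */
static inline unsigned emit_wait_mem_equal(uint32_t *buf, uint64_t va,
                                           uint32_t ref, uint32_t mask)
{
   unsigned n = 0;
   buf[n++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
   buf[n++] = WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1); /* poll memory, not a register */
   buf[n++] = (uint32_t)va;         /* address bits [31:0], dword aligned */
   buf[n++] = (uint32_t)(va >> 32); /* address bits [63:32] */
   buf[n++] = ref;                  /* reference value */
   buf[n++] = mask;                 /* mask applied to the fetched dword */
   buf[n++] = 4;                    /* poll interval */
   return n;                        /* number of dwords written */
}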
-#define PKT3_COPY_DATA 0x40 -#define COPY_DATA_SRC_SEL(x) ((x) & 0xf) -#define COPY_DATA_REG 0 -#define COPY_DATA_SRC_MEM 1 /* only valid as source */ -#define COPY_DATA_TC_L2 2 -#define COPY_DATA_GDS 3 -#define COPY_DATA_PERF 4 -#define COPY_DATA_IMM 5 -#define COPY_DATA_TIMESTAMP 9 -#define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8) -#define COPY_DATA_DST_MEM_GRBM 1 /* sync across GRBM, deprecated */ -#define COPY_DATA_TC_L2 2 -#define COPY_DATA_GDS 3 -#define COPY_DATA_PERF 4 -#define COPY_DATA_DST_MEM 5 -#define COPY_DATA_COUNT_SEL (1 << 16) -#define COPY_DATA_WR_CONFIRM (1 << 20) -#define COPY_DATA_ENGINE_PFP (1 << 30) -#define PKT3_PFP_SYNC_ME 0x42 +#define PKT3_COPY_DATA 0x40 +#define COPY_DATA_SRC_SEL(x) ((x)&0xf) +#define COPY_DATA_REG 0 +#define COPY_DATA_SRC_MEM 1 /* only valid as source */ +#define COPY_DATA_TC_L2 2 +#define COPY_DATA_GDS 3 +#define COPY_DATA_PERF 4 +#define COPY_DATA_IMM 5 +#define COPY_DATA_TIMESTAMP 9 +#define COPY_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8) +#define COPY_DATA_DST_MEM_GRBM 1 /* sync across GRBM, deprecated */ +#define COPY_DATA_TC_L2 2 +#define COPY_DATA_GDS 3 +#define COPY_DATA_PERF 4 +#define COPY_DATA_DST_MEM 5 +#define COPY_DATA_COUNT_SEL (1 << 16) +#define COPY_DATA_WR_CONFIRM (1 << 20) +#define COPY_DATA_ENGINE_PFP (1 << 30) +#define PKT3_PFP_SYNC_ME 0x42 #define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */ #define PKT3_ME_INITIALIZE 0x44 /* not on CIK */ #define PKT3_COND_WRITE 0x45 #define PKT3_EVENT_WRITE 0x46 #define PKT3_EVENT_WRITE_EOP 0x47 /* not on GFX9 */ -#define EOP_DST_SEL(x) ((x) << 16) -#define EOP_DST_SEL_MEM 0 -#define EOP_DST_SEL_TC_L2 1 -#define EOP_INT_SEL(x) ((x) << 24) -#define EOP_INT_SEL_NONE 0 -#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3 -#define EOP_DATA_SEL(x) ((x) << 29) -#define EOP_DATA_SEL_DISCARD 0 -#define EOP_DATA_SEL_VALUE_32BIT 1 -#define EOP_DATA_SEL_VALUE_64BIT 2 -#define EOP_DATA_SEL_TIMESTAMP 3 -#define EOP_DATA_SEL_GDS 5 -#define EOP_DATA_GDS(dw_offset, num_dwords) ((dw_offset) | ((unsigned)(num_dwords) << 16)) +#define EOP_DST_SEL(x) ((x) << 16) +#define EOP_DST_SEL_MEM 0 +#define EOP_DST_SEL_TC_L2 1 +#define EOP_INT_SEL(x) ((x) << 24) +#define EOP_INT_SEL_NONE 0 +#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3 +#define EOP_DATA_SEL(x) ((x) << 29) +#define EOP_DATA_SEL_DISCARD 0 +#define EOP_DATA_SEL_VALUE_32BIT 1 +#define EOP_DATA_SEL_VALUE_64BIT 2 +#define EOP_DATA_SEL_TIMESTAMP 3 +#define EOP_DATA_SEL_GDS 5 +#define EOP_DATA_GDS(dw_offset, num_dwords) ((dw_offset) | ((unsigned)(num_dwords) << 16)) /* CP DMA bug: Any use of CP_DMA.DST_SEL=TC must be avoided when EOS packets * are used. Use DST_SEL=MC instead. For prefetch, use SRC_SEL=TC and * DST_SEL=MC. Only CIK chips are affected. 
*/ /* fix CP DMA before uncommenting: */ /*#define PKT3_EVENT_WRITE_EOS 0x48*/ /* not on GFX9 */ -#define PKT3_RELEASE_MEM 0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */ -#define PKT3_CONTEXT_REG_RMW 0x51 /* older firmware versions on older chips don't have this */ -#define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */ -#define PKT3_ACQUIRE_MEM 0x58 /* new for CIK */ -#define PKT3_REWIND 0x59 /* VI+ [any ring] or CIK [compute ring only] */ -#define PKT3_LOAD_UCONFIG_REG 0x5E /* GFX7+ */ -#define PKT3_LOAD_SH_REG 0x5F -#define PKT3_LOAD_CONTEXT_REG 0x61 -#define PKT3_SET_CONFIG_REG 0x68 -#define PKT3_SET_CONTEXT_REG 0x69 -#define PKT3_SET_SH_REG 0x76 -#define PKT3_SET_SH_REG_OFFSET 0x77 -#define PKT3_SET_UCONFIG_REG 0x79 /* new for CIK */ -#define PKT3_SET_UCONFIG_REG_INDEX 0x7A /* new for GFX9, CP ucode version >= 26 */ -#define PKT3_LOAD_CONST_RAM 0x80 -#define PKT3_WRITE_CONST_RAM 0x81 -#define PKT3_DUMP_CONST_RAM 0x83 -#define PKT3_INCREMENT_CE_COUNTER 0x84 -#define PKT3_INCREMENT_DE_COUNTER 0x85 -#define PKT3_WAIT_ON_CE_COUNTER 0x86 -#define PKT3_SET_SH_REG_INDEX 0x9B -#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* new for VI */ +#define PKT3_RELEASE_MEM 0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */ +#define PKT3_CONTEXT_REG_RMW 0x51 /* older firmware versions on older chips don't have this */ +#define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */ +#define PKT3_ACQUIRE_MEM 0x58 /* new for CIK */ +#define PKT3_REWIND 0x59 /* VI+ [any ring] or CIK [compute ring only] */ +#define PKT3_LOAD_UCONFIG_REG 0x5E /* GFX7+ */ +#define PKT3_LOAD_SH_REG 0x5F +#define PKT3_LOAD_CONTEXT_REG 0x61 +#define PKT3_SET_CONFIG_REG 0x68 +#define PKT3_SET_CONTEXT_REG 0x69 +#define PKT3_SET_SH_REG 0x76 +#define PKT3_SET_SH_REG_OFFSET 0x77 +#define PKT3_SET_UCONFIG_REG 0x79 /* new for CIK */ +#define PKT3_SET_UCONFIG_REG_INDEX 0x7A /* new for GFX9, CP ucode version >= 26 */ +#define PKT3_LOAD_CONST_RAM 0x80 +#define PKT3_WRITE_CONST_RAM 0x81 +#define PKT3_DUMP_CONST_RAM 0x83 +#define PKT3_INCREMENT_CE_COUNTER 0x84 +#define PKT3_INCREMENT_DE_COUNTER 0x85 +#define PKT3_WAIT_ON_CE_COUNTER 0x86 +#define PKT3_SET_SH_REG_INDEX 0x9B +#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* new for VI */ -#define PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30) -#define PKT_TYPE_G(x) (((x) >> 30) & 0x3) -#define PKT_TYPE_C 0x3FFFFFFF -#define PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16) -#define PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) -#define PKT_COUNT_C 0xC000FFFF -#define PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0) -#define PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) -#define PKT0_BASE_INDEX_C 0xFFFF0000 -#define PKT3_IT_OPCODE_S(x) (((unsigned)(x) & 0xFF) << 8) -#define PKT3_IT_OPCODE_G(x) (((x) >> 8) & 0xFF) -#define PKT3_IT_OPCODE_C 0xFFFF00FF -#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1) -#define PKT3_SHADER_TYPE_S(x) (((unsigned)(x) & 0x1) << 1) -#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count)) -#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate)) +#define PKT_TYPE_S(x) (((unsigned)(x)&0x3) << 30) +#define PKT_TYPE_G(x) (((x) >> 30) & 0x3) +#define PKT_TYPE_C 0x3FFFFFFF +#define PKT_COUNT_S(x) (((unsigned)(x)&0x3FFF) << 16) +#define PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) +#define PKT_COUNT_C 0xC000FFFF +#define PKT0_BASE_INDEX_S(x) (((unsigned)(x)&0xFFFF) << 0) +#define PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) +#define PKT0_BASE_INDEX_C 0xFFFF0000 +#define PKT3_IT_OPCODE_S(x) (((unsigned)(x)&0xFF) << 8) 
+#define PKT3_IT_OPCODE_G(x) (((x) >> 8) & 0xFF) +#define PKT3_IT_OPCODE_C 0xFFFF00FF +#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1) +#define PKT3_SHADER_TYPE_S(x) (((unsigned)(x)&0x1) << 1) +#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count)) +#define PKT3(op, count, predicate) \ + (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate)) -#define PKT2_NOP_PAD PKT_TYPE_S(2) -#define PKT3_NOP_PAD PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */ +#define PKT2_NOP_PAD PKT_TYPE_S(2) +#define PKT3_NOP_PAD PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */ -#define PKT3_CP_DMA 0x41 +#define PKT3_CP_DMA 0x41 /* 1. header * 2. SRC_ADDR_LO [31:0] or DATA [31:0] * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0] @@ -256,7 +256,7 @@ * 6. COMMAND [29:22] | BYTE_COUNT [20:0] */ -#define PKT3_DMA_DATA 0x50 /* new for CIK */ +#define PKT3_DMA_DATA 0x50 /* new for CIK */ /* 1. header * 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0] * 2. SRC_ADDR_LO [31:0] or DATA [31:0] @@ -267,69 +267,70 @@ */ /* SI async DMA packets */ -#define SI_DMA_PACKET(cmd, sub_cmd, n) ((((unsigned)(cmd) & 0xF) << 28) | \ - (((unsigned)(sub_cmd) & 0xFF) << 20) |\ - (((unsigned)(n) & 0xFFFFF) << 0)) +#define SI_DMA_PACKET(cmd, sub_cmd, n) \ + ((((unsigned)(cmd)&0xF) << 28) | (((unsigned)(sub_cmd)&0xFF) << 20) | \ + (((unsigned)(n)&0xFFFFF) << 0)) /* SI async DMA Packet types */ -#define SI_DMA_PACKET_WRITE 0x2 -#define SI_DMA_PACKET_COPY 0x3 -#define SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE 0xfffe0 +#define SI_DMA_PACKET_WRITE 0x2 +#define SI_DMA_PACKET_COPY 0x3 +#define SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE 0xfffe0 /* The documentation says 0xffff8 is the maximum size in dwords, which is * 0x3fffe0 in bytes. 
*/ -#define SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE 0x3fffe0 -#define SI_DMA_COPY_DWORD_ALIGNED 0x00 -#define SI_DMA_COPY_BYTE_ALIGNED 0x40 -#define SI_DMA_COPY_TILED 0x8 -#define SI_DMA_PACKET_INDIRECT_BUFFER 0x4 -#define SI_DMA_PACKET_SEMAPHORE 0x5 -#define SI_DMA_PACKET_FENCE 0x6 -#define SI_DMA_PACKET_TRAP 0x7 -#define SI_DMA_PACKET_SRBM_WRITE 0x9 -#define SI_DMA_PACKET_CONSTANT_FILL 0xd -#define SI_DMA_PACKET_NOP 0xf +#define SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE 0x3fffe0 +#define SI_DMA_COPY_DWORD_ALIGNED 0x00 +#define SI_DMA_COPY_BYTE_ALIGNED 0x40 +#define SI_DMA_COPY_TILED 0x8 +#define SI_DMA_PACKET_INDIRECT_BUFFER 0x4 +#define SI_DMA_PACKET_SEMAPHORE 0x5 +#define SI_DMA_PACKET_FENCE 0x6 +#define SI_DMA_PACKET_TRAP 0x7 +#define SI_DMA_PACKET_SRBM_WRITE 0x9 +#define SI_DMA_PACKET_CONSTANT_FILL 0xd +#define SI_DMA_PACKET_NOP 0xf /* CIK async DMA packets */ -#define CIK_SDMA_PACKET(op, sub_op, n) ((((unsigned)(n) & 0xFFFF) << 16) | \ - (((unsigned)(sub_op) & 0xFF) << 8) | \ - (((unsigned)(op) & 0xFF) << 0)) +#define CIK_SDMA_PACKET(op, sub_op, n) \ + ((((unsigned)(n)&0xFFFF) << 16) | (((unsigned)(sub_op)&0xFF) << 8) | \ + (((unsigned)(op)&0xFF) << 0)) /* CIK async DMA packet types */ -#define CIK_SDMA_OPCODE_NOP 0x0 -#define CIK_SDMA_OPCODE_COPY 0x1 -#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0 -#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1 -#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3 -#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4 -#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5 -#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6 -#define CIK_SDMA_OPCODE_WRITE 0x2 -#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0 -#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1 -#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4 -#define CIK_SDMA_PACKET_FENCE 0x5 -#define CIK_SDMA_PACKET_TRAP 0x6 -#define CIK_SDMA_PACKET_SEMAPHORE 0x7 -#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb -#define CIK_SDMA_OPCODE_TIMESTAMP 0xd -#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0 -#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1 -#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2 -#define CIK_SDMA_PACKET_SRBM_WRITE 0xe +#define CIK_SDMA_OPCODE_NOP 0x0 +#define CIK_SDMA_OPCODE_COPY 0x1 +#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR 0x0 +#define CIK_SDMA_COPY_SUB_OPCODE_TILED 0x1 +#define CIK_SDMA_COPY_SUB_OPCODE_SOA 0x3 +#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4 +#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW 0x5 +#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW 0x6 +#define CIK_SDMA_OPCODE_WRITE 0x2 +#define SDMA_WRITE_SUB_OPCODE_LINEAR 0x0 +#define SDMA_WRTIE_SUB_OPCODE_TILED 0x1 +#define CIK_SDMA_OPCODE_INDIRECT_BUFFER 0x4 +#define CIK_SDMA_PACKET_FENCE 0x5 +#define CIK_SDMA_PACKET_TRAP 0x6 +#define CIK_SDMA_PACKET_SEMAPHORE 0x7 +#define CIK_SDMA_PACKET_CONSTANT_FILL 0xb +#define CIK_SDMA_OPCODE_TIMESTAMP 0xd +#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP 0x0 +#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP 0x1 +#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP 0x2 +#define CIK_SDMA_PACKET_SRBM_WRITE 0xe /* There is apparently an undocumented HW limitation that prevents the HW from copying the last 255 bytes of (1 << 22) - 1 */ -#define CIK_SDMA_COPY_MAX_SIZE 0x3fff00 /* almost 4 MB*/ -#define GFX103_SDMA_COPY_MAX_SIZE 0x3fffff00 /* almost 1 GB */ +#define CIK_SDMA_COPY_MAX_SIZE 0x3fff00 /* almost 4 MB*/ +#define GFX103_SDMA_COPY_MAX_SIZE 0x3fffff00 /* almost 1 GB */ -enum amd_cmp_class_flags { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, 
// Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity +enum amd_cmp_class_flags +{ + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity }; #endif /* _SID_H */ -- 2.7.4
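/* Illustrative sketch, not part of the patch: every *_SHIFT / *_WIDTH /
 * mask triplet in amd_kernel_code_t.h (see amd_code_property_mask_t
 * earlier in this patch) follows the same pattern, so fields of
 * code_properties can be read and written uniformly: the mask is built
 * as ((1 << WIDTH) - 1) << SHIFT, which is why only SHIFT and WIDTH need
 * to be spelled out per field. A minimal example for the two-bit
 * PRIVATE_ELEMENT_SIZE field, assuming <stdint.h> and
 * amd_kernel_code_t.h are included:
 */
static inline uint32_t get_private_element_size(uint32_t code_properties)
{
   /* Mask off the field, then shift it down to bit 0. */
   return (code_properties & AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >>
          AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT;
}

static inline uint32_t set_private_element_size(uint32_t code_properties,
                                                uint32_t element_size)
{
   /* Clear the old field, then OR in the new value (clamped by the mask).
    * element_size uses the amd_element_byte_size_t encoding mentioned in
    * the comment above the field definition.
    */
   return (code_properties & ~AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) |
          ((element_size << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT) &
           AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
}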