From f29c81f863c9879a6a87724cbdae1e1818f3f6b4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 21 Sep 2020 18:35:52 +0100 Subject: [PATCH] aco: use VOP2 for v_cvt_pkrtz_f16_f32 if possible This patch also does a slight rework of export_fs_mrt_color() to avoid setting of enabled channels which are not used. Totals from 52404 (38.38% of 136546) affected shaders (NAVI): SGPRs: 3097443 -> 3097435 (-0.00%) CodeSize: 189151600 -> 188546200 (-0.32%) Instrs: 36445061 -> 36445104 (+0.00%); split: -0.00%, +0.00% Cycles: 1739388020 -> 1739388192 (+0.00%); split: -0.00%, +0.00% VMEM: 21071501 -> 21071665 (+0.00%); split: +0.00%, -0.00% SMEM: 3470983 -> 3470982 (-0.00%); split: +0.00%, -0.00% PreSGPRs: 2058965 -> 2058962 (-0.00%) PreVGPRs: 1860294 -> 1860295 (+0.00%) Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 92 ++++++++++++-------------- src/amd/compiler/aco_opcodes.py | 2 +- 2 files changed, 45 insertions(+), 49 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a7745d4..0ebb753 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2143,8 +2143,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); if (ctx->block->fp_mode.round16_64 == fp_round_tz) bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); - else + else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9) bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand(0u)); + else + bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src)); break; } case nir_op_f2f32: { @@ -10321,13 +10323,28 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; bool is_16bit = values[0].regClass() == v2b; + /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ + if (ctx->options->enable_mrt_output_nan_fixup && + !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || + col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || + col_format == V_028714_SPI_SHADER_32_ABGR || + col_format == V_028714_SPI_SHADER_FP16_ABGR)) { + for (int i = 0; i < 4; i++) { + if (!(write_mask & (1 << i))) + continue; + + Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, + bld.hint_vcc(bld.def(bld.lm)), values[i], + bld.copy(bld.def(v1), Operand(3u))); + values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], + bld.copy(bld.def(v1), Operand(0u)), isnan); + } + } + switch (col_format) { - case V_028714_SPI_SHADER_ZERO: - enabled_channels = 0; /* writemask */ - target = V_008DFC_SQ_EXP_NULL; - break; - case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; @@ -10348,26 +10365,30 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) break; case V_028714_SPI_SHADER_FP16_ABGR: - enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pkrtz_f16_f32_e64; - if (is_16bit) { - if (ctx->options->chip_class >= GFX9) { - /* Pack the FP16 values together instead of converting them to - * FP32 and back to FP16. - * TODO: use p_create_vector and let the compiler optimizes. - */ - compr_op = aco_opcode::v_pack_b32_f16; - } else { - for (unsigned i = 0; i < 4; i++) { - if ((write_mask >> i) & 1) - values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]); + for (int i = 0; i < 2; i++) { + bool enabled = (write_mask >> (i*2)) & 0x3; + if (enabled) { + enabled_channels |= 1 << i; + if (is_16bit) { + values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), + values[i*2].isUndefined() ? Operand(v2b) : values[i*2], + values[i*2+1].isUndefined() ? Operand(v2b): values[i*2+1]); + } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9 ) { + values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), + values[i*2].isUndefined() ? Operand(0u) : values[i*2], + values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); + } else { + values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), + values[i*2].isUndefined() ? values[i*2+1] : values[i*2], + values[i*2+1].isUndefined() ? values[i*2] : values[i*2+1]); } + } else { + values[i] = Operand(v1); } } break; case V_028714_SPI_SHADER_UNORM16_ABGR: - enabled_channels = 0x5; if (is_16bit && ctx->options->chip_class >= GFX9) { compr_op = aco_opcode::v_cvt_pknorm_u16_f16; } else { @@ -10376,7 +10397,6 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) break; case V_028714_SPI_SHADER_SNORM16_ABGR: - enabled_channels = 0x5; if (is_16bit && ctx->options->chip_class >= GFX9) { compr_op = aco_opcode::v_cvt_pknorm_i16_f16; } else { @@ -10385,7 +10405,6 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) break; case V_028714_SPI_SHADER_UINT16_ABGR: { - enabled_channels = 0x5; compr_op = aco_opcode::v_cvt_pk_u16_u32; if (is_int8 || is_int10) { /* clamp */ @@ -10411,7 +10430,6 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) } case V_028714_SPI_SHADER_SINT16_ABGR: - enabled_channels = 0x5; compr_op = aco_opcode::v_cvt_pk_i16_i32; if (is_int8 || is_int10) { /* clamp */ @@ -10444,39 +10462,17 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) enabled_channels = 0xF; break; + case V_028714_SPI_SHADER_ZERO: default: - break; - } - - if (target == V_008DFC_SQ_EXP_NULL) return false; - - /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ - if (ctx->options->enable_mrt_output_nan_fixup && - !is_16bit && - (col_format == V_028714_SPI_SHADER_32_R || - col_format == V_028714_SPI_SHADER_32_GR || - col_format == V_028714_SPI_SHADER_32_AR || - col_format == V_028714_SPI_SHADER_32_ABGR || - col_format == V_028714_SPI_SHADER_FP16_ABGR)) { - for (int i = 0; i < 4; i++) { - if (!(write_mask & (1 << i))) - continue; - - Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, - bld.hint_vcc(bld.def(bld.lm)), values[i], - bld.copy(bld.def(v1), Operand(3u))); - values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], - bld.copy(bld.def(v1), Operand(0u)), isnan); - } } if ((bool) compr_op) { for (int i = 0; i < 2; i++) { /* check if at least one of the values to be compressed is enabled */ - unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1; + bool enabled = (write_mask >> (i*2)) & 0x3; if (enabled) { - enabled_channels |= enabled << (i*2); + enabled_channels |= 1 << (i*2); values[i] = bld.vop3(compr_op, bld.def(v1), values[i*2].isUndefined() ? Operand(0u) : values[i*2], values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 97be2ad..42629c4 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -221,7 +221,7 @@ class Opcode(object): self.operand_size = 0 elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']: self.operand_size = 0 - elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16', + elif '_pk' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16', 'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1', 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']: self.operand_size = 32 -- 2.7.4