From d3e0cf3d323347a5bb96e70dbcc19ccae06c5bf8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Fri, 27 Aug 2021 15:45:59 +0200 Subject: [PATCH] aco: Omit p_extract after ds_read with matching bit size. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Fossil DB stats on Sienna Cichlid: Totals from 135 (0.10% of 128647) affected shaders: CodeSize: 525184 -> 523704 (-0.28%) Instrs: 92835 -> 92684 (-0.16%) Latency: 311528 -> 311055 (-0.15%) InvThroughput: 86572 -> 86455 (-0.14%) Copies: 7666 -> 7650 (-0.21%) Fossil DB stats on Sienna Cichlid with NGGC on: Totals from 58374 (45.38% of 128647) affected shaders: CodeSize: 160322912 -> 159622564 (-0.44%) Instrs: 30755822 -> 30639193 (-0.38%) Latency: 136713768 -> 136690360 (-0.02%) InvThroughput: 21739219 -> 21658151 (-0.37%) Copies: 3297969 -> 3297953 (-0.00%) Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 7b22d76..5f52817 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1688,6 +1688,13 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) } break; } + case aco_opcode::ds_read_u8: + case aco_opcode::ds_read_u8_d16: + case aco_opcode::ds_read_u16: + case aco_opcode::ds_read_u16_d16: { + ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); + break; + } default: break; } @@ -2893,6 +2900,57 @@ apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr) return true; } +/* Remove superfluous extract after ds_read like so: + * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN() + */ +bool +apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract) +{ + /* Check if p_extract has a usedef operand and is the only user. 
*/ + if (!ctx.info[extract->operands[0].tempId()].is_usedef() || + ctx.uses[extract->operands[0].tempId()] > 1) + return false; + + /* Check if the usedef is a DS instruction. */ + Instruction* ds = ctx.info[extract->operands[0].tempId()].instr; + if (ds->format != Format::DS) + return false; + + unsigned extract_idx = extract->operands[1].constantValue(); + unsigned bits_extracted = extract->operands[2].constantValue(); + unsigned sign_ext = extract->operands[3].constantValue(); + unsigned dst_bitsize = extract->definitions[0].bytes() * 8u; + + /* TODO: These are doable, but probably don't occur too often. */ + if (extract_idx || sign_ext || dst_bitsize != 32) + return false; + + unsigned bits_loaded = 0; + if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16) + bits_loaded = 8; + else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16) + bits_loaded = 16; + else + return false; + + /* Shrink the DS load if the extracted bit size is smaller. */ + bits_loaded = MIN2(bits_loaded, bits_extracted); + + /* Change the DS opcode so it writes the full register. */ + if (bits_loaded == 8) + ds->opcode = aco_opcode::ds_read_u8; + else if (bits_loaded == 16) + ds->opcode = aco_opcode::ds_read_u16; + else + unreachable("Forgot to add DS opcode above."); + + /* The DS now produces the exact same thing as the extract, remove the extract. 
*/ + std::swap(ds->definitions[0], extract->definitions[0]); + ctx.uses[extract->definitions[0].tempId()] = 0; + ctx.info[ds->definitions[0].tempId()].label = 0; + return true; +} + /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */ bool combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr) @@ -3217,6 +3275,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) if (instr->isSDWA() || instr->isDPP()) return; + if (instr->opcode == aco_opcode::p_extract) + apply_ds_extract(ctx, instr); + /* TODO: There are still some peephole optimizations that could be done: * - abs(a - b) -> s_absdiff_i32 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32 -- 2.7.4