From 22be0d09a005b4b955a46b65a919cfd786d6814f Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Mon, 28 Nov 2022 17:10:22 +0100 Subject: [PATCH] aco: Don't prematurely emit s_andn2. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Split s_not + s_and allows more inverse comparison and s_cbranch_vccz optimizations. Foz-DB Navi21: Totals from 516 (0.38% of 134913) affected shaders: CodeSize: 7273724 -> 7273720 (-0.00%) Instrs: 1364408 -> 1364407 (-0.00%) Latency: 14604862 -> 14604858 (-0.00%) Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 35 +++++++++++++------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index c7f58a3..0e77b85 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -7862,9 +7862,9 @@ emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp sr return src; } if (op == nir_op_iand && cluster_size == 4) { - /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */ - Temp tmp = - bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + /* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */ + Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); } else if (op == nir_op_ior && cluster_size == 4) { @@ -7873,11 +7873,11 @@ emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp sr Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); } else if (op == nir_op_iand && cluster_size == 
ctx->program->wave_size) { - /* subgroupAnd(val) -> (exec & ~val) == 0 */ - Temp tmp = - bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) - .def(1) - .getTemp(); + /* subgroupAnd(val) -> (~val & exec) == 0 */ + Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm)) + .def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { @@ -7952,16 +7952,15 @@ emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src) Builder bld(ctx->program, ctx->block); assert(src.regClass() == bld.lm); - /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0 + /* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0 * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0 * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 */ - Temp tmp; if (op == nir_op_iand) - tmp = - bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); - else - tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + + Temp tmp = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp)); @@ -8708,10 +8707,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) assert(src.regClass() == bld.lm); assert(dst.regClass() == bld.lm); - Temp tmp = - bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) - .def(1) - .getTemp(); + Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm)) + 
.def(1) + .getTemp(); Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); break; -- 2.7.4