From 828aff2a2de31dc934b36453840a9118d85bb2ee Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 7 Mar 2023 14:38:34 +0100 Subject: [PATCH] aco: use array indexing for opsel/opsel_lo/opsel_hi Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_optimizer.cpp | 86 ++++++++++++++-------------- src/amd/compiler/aco_print_ir.cpp | 11 ++-- src/amd/compiler/aco_register_allocation.cpp | 24 +++----- src/amd/compiler/aco_validate.cpp | 11 ++-- 4 files changed, 59 insertions(+), 73 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 7e4a6e3..57dd3de 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -954,8 +954,8 @@ propagate_constants_vop3p(opt_ctx& ctx, aco_ptr& instr, ssa_info& i /* try to fold inline constants */ VALU_instruction* vop3p = &instr->valu(); - bool opsel_lo = (vop3p->opsel_lo >> i) & 1; - bool opsel_hi = (vop3p->opsel_hi >> i) & 1; + bool opsel_lo = vop3p->opsel_lo[i]; + bool opsel_hi = vop3p->opsel_hi[i]; Operand const_op[2]; bool const_opsel[2] = {false, false}; @@ -1026,8 +1026,8 @@ propagate_constants_vop3p(opt_ctx& ctx, aco_ptr& instr, ssa_info& i opsel_hi = false; } - vop3p->opsel_lo = opsel_lo ? (vop3p->opsel_lo | (1 << i)) : (vop3p->opsel_lo & ~(1 << i)); - vop3p->opsel_hi = opsel_hi ? (vop3p->opsel_hi | (1 << i)) : (vop3p->opsel_hi & ~(1 << i)); + vop3p->opsel_lo[i] = opsel_lo; + vop3p->opsel_hi[i] = opsel_hi; } bool @@ -1103,7 +1103,7 @@ can_apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_i return true; } else if (instr->isVOP3() && sel.size() == 2 && can_use_opsel(ctx.program->gfx_level, instr->opcode, idx) && - !(instr->valu().opsel & (1 << idx))) { + !instr->valu().opsel[idx]) { return true; } else if (instr->opcode == aco_opcode::p_extract) { SubdwordSel instrSel = parse_extract(instr.get()); @@ -1162,7 +1162,7 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& mad->operands[0] = instr->operands[0]; mad->operands[1] = instr->operands[1]; mad->operands[2] = Operand::zero(); - mad->valu().opsel = (sel.offset() / 2) << idx; + mad->valu().opsel[idx] = sel.offset(); instr.reset(mad); } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) { @@ -1170,7 +1170,7 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& static_cast(instr.get())->sel[idx] = sel; } else if (instr->isVOP3()) { if (sel.offset()) - instr->valu().opsel |= 1 << idx; + instr->valu().opsel[idx] = true; } else if (instr->opcode == aco_opcode::p_extract) { SubdwordSel instrSel = parse_extract(instr.get()); @@ -2235,12 +2235,12 @@ combine_ordering_test(opt_ctx& ctx, aco_ptr& instr) if (op_instr[i]->isVOP3()) { VALU_instruction& vop3 = op_instr[i]->valu(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || - vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || + vop3.opsel[0] != vop3.opsel[1]) return false; neg[i] = vop3.neg[0]; abs[i] = vop3.abs[0]; - opsel |= (vop3.opsel & 1) << i; + opsel |= vop3.opsel[0] << i; } else if (op_instr[i]->isSDWA()) { return false; } @@ -2515,8 +2515,8 @@ combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr& instr) if (nan_test->isVOP3()) { VALU_instruction& vop3 = nan_test->valu(); - if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 || - vop3.opsel == 2) + if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || + vop3.opsel[0] != vop3.opsel[1]) return false; } @@ -2636,8 +2636,8 @@ match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op return false; if (inbetween_opsel) - *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false; - else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap)) + *inbetween_opsel = op1_vop3 ? op1_vop3->opsel[swap] : false; + else if (op1_vop3 && op1_vop3->opsel[swap]) return false; *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise(); @@ -2650,14 +2650,14 @@ match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op operands[shuffle[0]] = op1_instr->operands[!swap]; neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false; abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false; - if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap))) + if (op1_vop3 && op1_vop3->opsel[!swap]) *opsel |= 1 << shuffle[0]; for (unsigned i = 0; i < 2; i++) { operands[shuffle[i + 1]] = op2_instr->operands[i]; neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false; abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false; - if (op2_vop3 && op2_vop3->opsel & (1 << i)) + if (op2_vop3 && op2_vop3->opsel[i]) *opsel |= 1 << shuffle[i + 1]; } @@ -3733,12 +3733,10 @@ combine_add_lshl(opt_ctx& ctx, aco_ptr& instr, bool is_sub) } void -propagate_swizzles(VALU_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi) +propagate_swizzles(VALU_instruction* instr, bool opsel_lo, bool opsel_hi) { /* propagate swizzles which apply to a result down to the instruction's operands: * result = a.xy + b.xx -> result.yx = a.yx + b.xx */ - assert((opsel_lo & 1) == opsel_lo); - assert((opsel_hi & 1) == opsel_hi); uint8_t tmp_lo = instr->opsel_lo; uint8_t tmp_hi = instr->opsel_hi; uint8_t neg_lo = instr->neg_lo; @@ -3761,13 +3759,13 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) /* apply clamp */ if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) && vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 && - !((vop3p->opsel_lo | vop3p->opsel_hi) & 2)) { + !vop3p->opsel_lo[1] && !vop3p->opsel_hi[1]) { ssa_info& info = ctx.info[instr->operands[0].tempId()]; if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) { VALU_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->valu(); candidate->clamp = true; - propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi); + propagate_swizzles(candidate, vop3p->opsel_lo[0], vop3p->opsel_hi[0]); instr->definitions[0].swapTemp(candidate->definitions[0]); ctx.info[candidate->definitions[0].tempId()].instr = candidate; ctx.uses[instr->definitions[0].tempId()]--; @@ -3788,7 +3786,7 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) VALU_instruction* fneg = &info.instr->valu(); - if ((fneg->opsel_lo | fneg->opsel_hi) & 2) + if (fneg->opsel_lo[1] || fneg->opsel_hi[1]) continue; Operand ops[3]; @@ -3806,14 +3804,14 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) * if 0 - pick selection from fneg->lo * if 1 - pick selection from fneg->hi */ - bool opsel_lo = (vop3p->opsel_lo >> i) & 1; - bool opsel_hi = (vop3p->opsel_hi >> i) & 1; + bool opsel_lo = vop3p->opsel_lo[i]; + bool opsel_hi = vop3p->opsel_hi[i]; bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1]; bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1]; vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo; vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo; - vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : (unsigned)fneg->opsel_lo) & 1) << i; - vop3p->opsel_hi ^= ((opsel_hi ? ~fneg->opsel_hi : (unsigned)fneg->opsel_lo) & 1) << i; + vop3p->opsel_lo[i] ^= opsel_lo ? !fneg->opsel_hi[0] : fneg->opsel_lo[0]; + vop3p->opsel_hi[i] ^= opsel_hi ? !fneg->opsel_hi[0] : fneg->opsel_lo[0]; if (--ctx.uses[fneg->definitions[0].tempId()]) ctx.uses[fneg->operands[0].tempId()]++; @@ -3828,7 +3826,7 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) Instruction* mul_instr = nullptr; unsigned add_op_idx = 0; - uint8_t opsel_lo = 0, opsel_hi = 0; + bool opsel_lo = false, opsel_hi = false; uint32_t uses = UINT32_MAX; /* find the 'best' mul instruction to combine with the add */ @@ -3855,8 +3853,8 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) mul_instr = info.instr; add_op_idx = 1 - i; - opsel_lo = (vop3p->opsel_lo >> i) & 1; - opsel_hi = (vop3p->opsel_hi >> i) & 1; + opsel_lo = vop3p->opsel_lo[i]; + opsel_hi = vop3p->opsel_hi[i]; uses = ctx.uses[instr->operands[i].tempId()]; } @@ -3888,8 +3886,8 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) fma->opsel_lo = mul->opsel_lo; fma->opsel_hi = mul->opsel_hi; propagate_swizzles(fma.get(), opsel_lo, opsel_hi); - fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4; - fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4; + fma->opsel_lo[2] = vop3p->opsel_lo[add_op_idx]; + fma->opsel_hi[2] = vop3p->opsel_hi[add_op_idx]; fma->neg_lo[2] = vop3p->neg_lo[add_op_idx]; fma->neg_hi[2] = vop3p->neg_hi[add_op_idx]; fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx]; @@ -3927,7 +3925,7 @@ can_use_mad_mix(opt_ctx& ctx, aco_ptr& instr) return false; if (instr->isVOP3()) - return !instr->valu().omod && !(instr->valu().opsel & 0x8); + return !instr->valu().omod && !instr->valu().opsel[3]; return instr->format == Format::VOP2; } @@ -4046,9 +4044,9 @@ combine_mad_mix(opt_ctx& ctx, aco_ptr& instr) instr->operands[i].setTemp(conv->operands[0].getTemp()); if (conv->definitions[0].isPrecise()) instr->definitions[0].setPrecise(true); - instr->valu().opsel_hi ^= 1u << i; + instr->valu().opsel_hi[i] ^= true; if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2) - instr->valu().opsel_lo |= 1u << i; + instr->valu().opsel_lo[i] = true; bool neg = conv->valu().neg[0]; bool abs = conv->valu().abs[0]; if (!instr->valu().abs[i]) { @@ -4221,9 +4219,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) (instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16) && !instr->valu().neg_lo[0] && - ((instr->operands[0].constantEquals(0x3f800000) && (instr->valu().opsel_hi & 0x1) == 0) || - (instr->operands[0].constantEquals(0x3C00) && (instr->valu().opsel_hi & 0x1) && - !(instr->valu().opsel_lo & 0x1))); + ((instr->operands[0].constantEquals(0x3f800000) && !instr->valu().opsel_hi[0]) || + (instr->operands[0].constantEquals(0x3C00) && instr->valu().opsel_hi[0] && + !instr->valu().opsel_lo[0])); bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32; bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 || @@ -4332,8 +4330,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) VALU_instruction& valu = instr->valu(); neg[2] = valu.neg[add_op_idx]; abs[2] = valu.abs[add_op_idx]; - opsel_lo |= valu.opsel_lo & (1 << add_op_idx) ? 0x4 : 0x0; - opsel_hi |= valu.opsel_hi & (1 << add_op_idx) ? 0x4 : 0x0; + opsel_lo |= valu.opsel_lo[add_op_idx] ? 0x4 : 0x0; + opsel_hi |= valu.opsel_hi[add_op_idx] ? 0x4 : 0x0; omod = valu.omod; clamp = valu.clamp; /* abs of the multiplication result */ @@ -5068,7 +5066,7 @@ unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr& instr) for (unsigned i = 0; i < instr->operands.size(); i++) { if (!instr->operands[i].isLiteral()) continue; - unsigned new_swizzle = ((vop3p.opsel_lo >> i) & 0x1) | (((vop3p.opsel_hi >> i) & 0x1) << 1); + unsigned new_swizzle = vop3p.opsel_lo[i] | (vop3p.opsel_hi[i] << 1); if (literal_swizzle != ~0u && new_swizzle != literal_swizzle) return; /* Literal swizzles conflict. */ literal_swizzle = new_swizzle; @@ -5084,8 +5082,8 @@ unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr& instr) literal = (literal >> (16 * (literal_swizzle & 0x1)) & 0xffff) | (literal >> (8 * (literal_swizzle & 0x2)) << 16); instr->operands[i] = Operand::literal32(literal); - vop3p.opsel_lo &= ~(1 << i); - vop3p.opsel_hi |= (1 << i); + vop3p.opsel_lo[i] = false; + vop3p.opsel_hi[i] = true; } } @@ -5113,8 +5111,8 @@ apply_literals(opt_ctx& ctx, aco_ptr& instr) u_foreach_bit (i, info->fp16_mask) { float value = uif(ctx.info[instr->operands[i].tempId()].val); literal |= _mesa_float_to_half(value) << (second * 16); - instr->valu().opsel_lo |= second << i; - instr->valu().opsel_hi |= 1 << i; + instr->valu().opsel_lo[i] = second; + instr->valu().opsel_hi[i] = true; second = true; } diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 623538b..6d2dab9 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -746,15 +746,15 @@ aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* ou for (unsigned i = 0; i < MIN2(num_operands, 3); ++i) { abs[i] = valu.abs[i]; neg[i] = valu.neg[i]; - opsel[i] = valu.opsel & (1 << i); + opsel[i] = valu.opsel[i]; } } else if (instr->isVOP3P() && is_mad_mix) { const VALU_instruction& vop3p = instr->valu(); for (unsigned i = 0; i < MIN2(num_operands, 3); ++i) { abs[i] = vop3p.neg_hi[i]; neg[i] = vop3p.neg_lo[i]; - f2f32[i] = vop3p.opsel_hi & (1 << i); - opsel[i] = f2f32[i] && (vop3p.opsel_lo & (1 << i)); + f2f32[i] = vop3p.opsel_hi[i]; + opsel[i] = f2f32[i] && vop3p.opsel_lo[i]; } } for (unsigned i = 0; i < num_operands; ++i) { @@ -779,9 +779,8 @@ aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* ou if (instr->isVOP3P() && !is_mad_mix) { const VALU_instruction& vop3 = instr->valu(); - if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) { - fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x', - vop3.opsel_hi & (1 << i) ? 'y' : 'x'); + if (vop3.opsel_lo[i] || !vop3.opsel_hi[i]) { + fprintf(output, ".%c%c", vop3.opsel_lo[i] ? 'y' : 'x', vop3.opsel_hi[i] ? 'y' : 'x'); } if (vop3.neg_lo[i] && vop3.neg_hi[i]) fprintf(output, "*[-1,-1]"); diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index af055ab..7ffbf93 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -541,20 +541,15 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, uns assert(rc.bytes() <= 2); if (instr->isVALU()) { /* check if we can use opsel */ - if (instr->format == Format::VOP3) { + if (instr->format == Format::VOP3 || instr->isVINTERP_INREG()) { assert(byte == 2); - instr->valu().opsel |= 1 << idx; - return; - } - if (instr->isVINTERP_INREG()) { - assert(byte == 2); - instr->vinterp_inreg().opsel |= 1 << idx; + instr->valu().opsel[idx] = true; return; } if (instr->isVOP3P()) { - assert(byte == 2 && !(instr->valu().opsel_lo & (1 << idx))); - instr->valu().opsel_lo |= 1 << idx; - instr->valu().opsel_hi |= 1 << idx; + assert(byte == 2 && !instr->valu().opsel_lo[idx]); + instr->valu().opsel_lo[idx] = true; + instr->valu().opsel_hi[idx] = true; return; } if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { @@ -692,15 +687,10 @@ add_subdword_definition(Program* program, aco_ptr& instr, PhysReg r return; /* check if we can use opsel */ - if (instr->format == Format::VOP3) { - assert(reg.byte() == 2); - assert(can_use_opsel(gfx_level, instr->opcode, -1)); - instr->valu().opsel |= (1 << 3); /* dst in high half */ - return; - } else if (instr->isVINTERP_INREG()) { + if (instr->format == Format::VOP3 || instr->isVINTERP_INREG()) { assert(reg.byte() == 2); assert(can_use_opsel(gfx_level, instr->opcode, -1)); - instr->vinterp_inreg().opsel |= (1 << 3); /* dst in high half */ + instr->valu().opsel[3] = true; /* dst in high half */ return; } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 996b49c..a597c93 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -232,11 +232,10 @@ validate_ir(Program* program) if (i >= instr->operands.size() || (instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())) - check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get()); + check(!vop3.opsel[i], "Unexpected opsel for operand", instr.get()); } if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed()) - check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", - instr.get()); + check(!vop3.opsel[3], "Unexpected opsel for sub-dword definition", instr.get()); } else if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || instr->opcode == aco_opcode::v_fma_mixhi_f16 || instr->opcode == aco_opcode::v_fma_mix_f32) { @@ -248,7 +247,7 @@ validate_ir(Program* program) for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()) - check((vop3p.opsel_lo & (1 << i)) == 0 && (vop3p.opsel_hi & (1 << i)) == 0, + check(!vop3p.opsel_lo[i] && !vop3p.opsel_hi[i], "Unexpected opsel for subdword operand", instr.get()); } check(instr->definitions[0].regClass() == v1, "VOP3P must have v1 definition", @@ -866,8 +865,8 @@ validate_subdword_operand(amd_gfx_level gfx_level, const aco_ptr& i bool fma_mix = instr->opcode == aco_opcode::v_fma_mixlo_f16 || instr->opcode == aco_opcode::v_fma_mixhi_f16 || instr->opcode == aco_opcode::v_fma_mix_f32; - return ((instr->valu().opsel_lo >> index) & 1) == (byte >> 1) && - ((instr->valu().opsel_hi >> index) & 1) == (fma_mix || (byte >> 1)); + return instr->valu().opsel_lo[index] == (byte >> 1) && + instr->valu().opsel_hi[index] == (fma_mix || (byte >> 1)); } if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, index)) return true; -- 2.7.4