2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include "aco_builder.h"
28 #include "util/half_float.h"
29 #include "util/memstream.h"
/* Emit a performance warning for `instr`: format "msg: <instr>" into an
 * in-memory stream and hand the resulting text to aco_perfwarn().
 * NOTE(review): this excerpt is line-sampled; the function's braces, the
 * use of `cond`, and the declarations of `out`/`outsize` are not visible. */
39 perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
44 struct u_memstream mem;
45 u_memstream_open(&mem, &out, &outsize);
/* u_memstream exposes a FILE* backed by a heap buffer (out/outsize). */
46 FILE* const memf = u_memstream_get(&mem);
48 fprintf(memf, "%s: ", msg);
49 aco_print_instr(program->gfx_level, instr, memf);
50 u_memstream_close(&mem);
52 aco_perfwarn(program, out);
/* presumably aborts or logs harder when DEBUG_PERFWARN is set — TODO confirm */
55 if (debug_flags & DEBUG_PERFWARN)
62 * The optimizer works in 4 phases:
63 * (1) The first pass collects information for each ssa-def,
64 * propagates reg->reg operands of the same type, inlines constants
65 * and propagates neg/abs input modifiers.
66 * (2) The second pass combines instructions like mad, omod, clamp and
67 * propagates sgpr's on VALU instructions.
68 * This pass depends on information collected in the first pass.
69 * (3) The third pass goes backwards, and selects instructions,
70 * i.e. decides if a mad instruction is profitable and eliminates dead code.
71 * (4) The fourth pass cleans up the sequence: literals get applied and dead
72 * instructions are removed from the sequence.
/* Bookkeeping for a mul+add pair that the backward pass may fuse into a
 * mad/fma: the add instruction itself plus masks consulted when applying
 * literals. NOTE(review): excerpt is line-sampled; the struct header and
 * the mul_temp_id/fp16_mask member declarations are not visible here. */
76 aco_ptr<Instruction> add_instr;
/* per-operand bitmask; presumably marks operands eligible to become
 * literals — TODO confirm at use sites (not visible in this excerpt) */
78 uint16_t literal_mask;
81 mad_info(aco_ptr<Instruction> instr, uint32_t id)
82 : add_instr(std::move(instr)), mul_temp_id(id), literal_mask(0), fp16_mask(0)
/* Labels attached to each SSA definition by the first optimizer pass.
 * They group into three mutually exclusive payload families (disjointness
 * is enforced by the static_asserts below):
 *   - instr_labels: the info payload is a pointer to an instruction
 *   - temp_labels:  the payload is a propagated temporary
 *   - val_labels:   the payload is a constant value (or mad_info index)
 * Labels at bit 31 and above need the 1ull suffix so the shift is done in
 * 64-bit arithmetic. NOTE(review): excerpt is line-sampled; several
 * enumerators (label_vec, label_abs, label_neg, ...) are not visible. */
88 label_constant_32bit = 1 << 1,
89 /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
90 * 32-bit operations but this shouldn't cause any issues because we don't
91 * look through any conversions */
96 label_literal = 1 << 6,
100 label_omod5 = 1 << 10,
101 label_clamp = 1 << 12,
102 label_undefined = 1 << 14,
105 label_add_sub = 1 << 17,
106 label_bitwise = 1 << 18,
107 label_minmax = 1 << 19,
108 label_vopc = 1 << 20,
109 label_uniform_bool = 1 << 21,
110 label_constant_64bit = 1 << 22,
111 label_uniform_bitwise = 1 << 23,
112 label_scc_invert = 1 << 24,
113 label_scc_needed = 1 << 26,
115 label_fcanonicalize = 1 << 28,
116 label_constant_16bit = 1 << 29,
117 label_usedef = 1 << 30, /* generic label */
118 label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */
119 label_canonicalized = 1ull << 32,
120 label_extract = 1ull << 33,
121 label_insert = 1ull << 34,
122 label_dpp16 = 1ull << 35,
123 label_dpp8 = 1ull << 36,
124 label_f2f32 = 1ull << 37,
125 label_f2f16 = 1ull << 38,
126 label_split = 1ull << 39,
127 label_subgroup_invocation = 1ull << 40,
/* Labels whose payload is the *defining* instruction. */
130 static constexpr uint64_t instr_usedef_labels =
131 label_vec | label_mul | label_add_sub | label_vop3p | label_bitwise | label_uniform_bitwise |
132 label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | label_dpp8 |
133 label_f2f32 | label_subgroup_invocation;
/* Labels whose payload is a *using* (modifier) instruction — omod/clamp/etc. */
134 static constexpr uint64_t instr_mod_labels =
135 label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
137 static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels | label_split;
138 static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
139 label_uniform_bool | label_scc_invert | label_b2i |
141 static constexpr uint32_t val_labels =
142 label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad;
/* The three payload families must never share a bit, otherwise add_label()
 * could leave a stale payload interpretation behind. */
144 static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
145 static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
146 static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
/* Per-SSA-def info record: a 64-bit label bitset plus a payload that is
 * interpreted according to the label family (instr / temp / val alias).
 * NOTE(review): excerpt is line-sampled; the struct header, payload member
 * declarations, and several braces are not visible. */
156 ssa_info() : label(0) {}
/* Set `new_label`, clearing any already-set labels whose payload family
 * would alias the new label's payload. */
158 void add_label(Label new_label)
160 /* Since all the instr_usedef_labels use instr for the same thing
161 * (indicating the defining instruction), there is usually no need to
162 * clear any other instr labels. */
163 if (new_label & instr_usedef_labels)
164 label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */
166 if (new_label & instr_mod_labels) {
167 label &= ~instr_labels;
168 label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
171 if (new_label & temp_labels) {
172 label &= ~temp_labels;
173 label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
176 uint32_t const_labels =
177 label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
178 if (new_label & const_labels) {
/* clears the non-constant val labels while keeping any constant labels
 * already set: (~val_labels | const_labels) by operator precedence */
179 label &= ~val_labels | const_labels;
180 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
181 } else if (new_label & val_labels) {
182 label &= ~val_labels;
183 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
189 void set_vec(Instruction* vec)
191 add_label(label_vec);
195 bool is_vec() { return label & label_vec; }
/* Classify `constant` by which encodings can represent it inline:
 * always label_literal, plus 16/32/64-bit inline-constant labels when the
 * corresponding Operand encoding does not need a literal. */
197 void set_constant(amd_gfx_level gfx_level, uint64_t constant)
199 Operand op16 = Operand::c16(constant);
200 Operand op32 = Operand::get_const(gfx_level, constant, 4);
201 add_label(label_literal);
204 /* check that no upper bits are lost in case of packed 16bit constants */
205 if (gfx_level >= GFX8 && !op16.isLiteral() &&
206 op16.constantValue16(true) == ((constant >> 16) & 0xffff))
207 add_label(label_constant_16bit);
209 if (!op32.isLiteral())
210 add_label(label_constant_32bit);
212 if (Operand::is_constant_representable(constant, 8))
213 add_label(label_constant_64bit);
/* 64-bit representable constants keep only the 64-bit label; the smaller
 * labels are dropped below. */
215 if (label & label_constant_64bit) {
216 val = Operand::c64(constant).constantValue();
218 label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
/* True if the value can be encoded as an inline constant of `bits` width. */
222 bool is_constant(unsigned bits)
225 case 8: return label & label_literal;
226 case 16: return label & label_constant_16bit;
227 case 32: return label & label_constant_32bit;
228 case 64: return label & label_constant_64bit;
/* True if the value must be emitted as a literal at `bits` width.
 * NOTE(review): `~(label & bit)` is always nonzero (bitwise NOT of a masked
 * value), so the 16/32-bit cases reduce to just `is_lit`; this looks like it
 * was meant to be `!(label & bit)` — confirm against upstream intent. */
233 bool is_literal(unsigned bits)
235 bool is_lit = label & label_literal;
237 case 8: return false;
238 case 16: return is_lit && ~(label & label_constant_16bit);
239 case 32: return is_lit && ~(label & label_constant_32bit);
240 case 64: return false;
245 bool is_constant_or_literal(unsigned bits)
248 return label & label_constant_64bit;
250 return label & label_literal;
/* Label setters/getters. Each setter tags the def via add_label() and (in
 * the full source) records the matching payload; each getter tests the bit.
 * NOTE(review): excerpt is line-sampled — most payload assignments
 * (`temp = ...;` / `instr = ...;` / `val = ...;`) and the statements guarded
 * by the `if (label & temp_labels)` checks (presumably early returns so a
 * modifier label never clobbers a temp payload — TODO confirm) are not
 * visible here. */
253 void set_abs(Temp abs_temp)
255 add_label(label_abs);
259 bool is_abs() { return label & label_abs; }
261 void set_neg(Temp neg_temp)
263 add_label(label_neg);
267 bool is_neg() { return label & label_neg; }
/* abs+neg combined: both bits set at once so add_label() clears aliasing
 * families only a single time. */
269 void set_neg_abs(Temp neg_abs_temp)
271 add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
275 void set_mul(Instruction* mul)
277 add_label(label_mul);
281 bool is_mul() { return label & label_mul; }
283 void set_temp(Temp tmp)
285 add_label(label_temp);
289 bool is_temp() { return label & label_temp; }
/* label_mad stores an index into opt_ctx::mad_infos rather than a pointer. */
291 void set_mad(uint32_t mad_info_idx)
293 add_label(label_mad);
297 bool is_mad() { return label & label_mad; }
299 void set_omod2(Instruction* mul)
301 if (label & temp_labels)
303 add_label(label_omod2);
307 bool is_omod2() { return label & label_omod2; }
309 void set_omod4(Instruction* mul)
311 if (label & temp_labels)
313 add_label(label_omod4);
317 bool is_omod4() { return label & label_omod4; }
319 void set_omod5(Instruction* mul)
321 if (label & temp_labels)
323 add_label(label_omod5);
327 bool is_omod5() { return label & label_omod5; }
329 void set_clamp(Instruction* med3)
331 if (label & temp_labels)
333 add_label(label_clamp);
337 bool is_clamp() { return label & label_clamp; }
339 void set_f2f16(Instruction* conv)
341 if (label & temp_labels)
343 add_label(label_f2f16);
347 bool is_f2f16() { return label & label_f2f16; }
349 void set_undefined() { add_label(label_undefined); }
351 bool is_undefined() { return label & label_undefined; }
353 void set_vcc(Temp vcc_val)
355 add_label(label_vcc);
359 bool is_vcc() { return label & label_vcc; }
361 void set_b2f(Temp b2f_val)
363 add_label(label_b2f);
367 bool is_b2f() { return label & label_b2f; }
369 void set_add_sub(Instruction* add_sub_instr)
371 add_label(label_add_sub);
372 instr = add_sub_instr;
375 bool is_add_sub() { return label & label_add_sub; }
377 void set_bitwise(Instruction* bitwise_instr)
379 add_label(label_bitwise);
380 instr = bitwise_instr;
383 bool is_bitwise() { return label & label_bitwise; }
385 void set_uniform_bitwise() { add_label(label_uniform_bitwise); }
387 bool is_uniform_bitwise() { return label & label_uniform_bitwise; }
389 void set_minmax(Instruction* minmax_instr)
391 add_label(label_minmax);
392 instr = minmax_instr;
395 bool is_minmax() { return label & label_minmax; }
397 void set_vopc(Instruction* vopc_instr)
399 add_label(label_vopc);
403 bool is_vopc() { return label & label_vopc; }
405 void set_scc_needed() { add_label(label_scc_needed); }
407 bool is_scc_needed() { return label & label_scc_needed; }
409 void set_scc_invert(Temp scc_inv)
411 add_label(label_scc_invert);
415 bool is_scc_invert() { return label & label_scc_invert; }
417 void set_uniform_bool(Temp uniform_bool)
419 add_label(label_uniform_bool);
423 bool is_uniform_bool() { return label & label_uniform_bool; }
425 void set_b2i(Temp b2i_val)
427 add_label(label_b2i);
431 bool is_b2i() { return label & label_b2i; }
433 void set_usedef(Instruction* label_instr)
435 add_label(label_usedef);
439 bool is_usedef() { return label & label_usedef; }
441 void set_vop3p(Instruction* vop3p_instr)
443 add_label(label_vop3p);
447 bool is_vop3p() { return label & label_vop3p; }
449 void set_fcanonicalize(Temp tmp)
451 add_label(label_fcanonicalize);
455 bool is_fcanonicalize() { return label & label_fcanonicalize; }
457 void set_canonicalized() { add_label(label_canonicalized); }
459 bool is_canonicalized() { return label & label_canonicalized; }
461 void set_f2f32(Instruction* cvt)
463 add_label(label_f2f32);
467 bool is_f2f32() { return label & label_f2f32; }
469 void set_extract(Instruction* extract)
471 add_label(label_extract);
475 bool is_extract() { return label & label_extract; }
477 void set_insert(Instruction* insert)
479 if (label & temp_labels)
481 add_label(label_insert);
485 bool is_insert() { return label & label_insert; }
487 void set_dpp16(Instruction* mov)
489 add_label(label_dpp16);
493 void set_dpp8(Instruction* mov)
495 add_label(label_dpp8);
499 bool is_dpp() { return label & (label_dpp16 | label_dpp8); }
500 bool is_dpp16() { return label & label_dpp16; }
501 bool is_dpp8() { return label & label_dpp8; }
503 void set_split(Instruction* split)
505 add_label(label_split);
509 bool is_split() { return label & label_split; }
511 void set_subgroup_invocation(Instruction* label_instr)
513 add_label(label_subgroup_invocation);
517 bool is_subgroup_invocation() { return label & label_subgroup_invocation; }
/* Optimizer context fields (struct header not visible in this excerpt):
 * the instruction stream being rewritten, the most recent literal seen
 * (instruction index + temp), collected mad candidates, and a per-temp
 * use count consulted by the backward pass. */
523 std::vector<aco_ptr<Instruction>> instructions;
525 std::pair<uint32_t, Temp> last_literal;
526 std::vector<mad_info> mad_infos;
527 std::vector<uint16_t> uses;
/* Whether `instr` may be (re)encoded as VOP3. Rules visible here:
 * VOP3P is handled separately, literals in operand 0 block the promotion
 * before GFX10, DPP blocks it before GFX11, and the v_*mk/*ak and
 * read/writelane opcodes can never take the VOP3 encoding. */
531 can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
536 if (instr->isVOP3P())
539 if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->gfx_level < GFX10)
545 if (instr->isDPP() && ctx.program->gfx_level < GFX11)
548 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
549 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
550 instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
551 instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
552 instr->opcode != aco_opcode::v_readlane_b32 &&
553 instr->opcode != aco_opcode::v_writelane_b32 &&
554 instr->opcode != aco_opcode::v_readfirstlane_b32;
/* Try to replace operand `index` of a pseudo instruction with `temp`.
 * Returns false (in the full source) when the substitution would be
 * invalid; on success rewrites the operand in place. Only the listed
 * pseudo opcodes are eligible. */
558 pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
560 if (instr->definitions.empty())
/* `vgpr` := destination is VGPR-only (p_as_uniform, or all defs are VGPR). */
564 instr->opcode == aco_opcode::p_as_uniform ||
565 std::all_of(instr->definitions.begin(), instr->definitions.end(),
566 [](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
568 /* don't propagate VGPRs into SGPR instructions */
569 if (temp.type() == RegType::vgpr && !vgpr)
/* Subdword defs can't take SGPR sources before GFX9. */
572 bool can_accept_sgpr =
573 ctx.program->gfx_level >= GFX9 ||
574 std::none_of(instr->definitions.begin(), instr->definitions.end(),
575 [](const Definition& def) { return def.regClass().is_subdword(); });
577 switch (instr->opcode) {
578 case aco_opcode::p_phi:
579 case aco_opcode::p_linear_phi:
580 case aco_opcode::p_parallelcopy:
581 case aco_opcode::p_create_vector:
/* size-changing propagation is only allowed for p_split_vector below */
582 if (temp.bytes() != instr->operands[index].bytes())
585 case aco_opcode::p_extract_vector:
586 case aco_opcode::p_extract:
587 if (temp.type() == RegType::sgpr && !can_accept_sgpr)
590 case aco_opcode::p_split_vector: {
591 if (temp.type() == RegType::sgpr && !can_accept_sgpr)
593 /* don't increase the vector size */
594 if (temp.bytes() > instr->operands[index].bytes())
596 /* We can decrease the vector size as smaller temporaries are only
597 * propagated by p_as_uniform instructions.
598 * If this propagation leads to invalid IR or hits the assertion below,
599 * it means that some undefined bytes within a dword are being accessed
600 * and a bug in instruction_selection is likely. */
/* drop trailing split results until the byte counts match again */
601 int decrease = instr->operands[index].bytes() - temp.bytes();
602 while (decrease > 0) {
603 decrease -= instr->definitions.back().bytes();
604 instr->definitions.pop_back();
606 assert(decrease == 0);
/* p_as_uniform of a same-regclass temp degenerates to a plain copy */
609 case aco_opcode::p_as_uniform:
610 if (temp.regClass() == instr->definitions[0].regClass())
611 instr->opcode = aco_opcode::p_parallelcopy;
613 default: return false;
616 instr->operands[index].setTemp(temp);
620 /* This expects the DPP modifier to be removed. */
622 can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
624 assert(instr->isVALU());
625 if (instr->isSDWA() && ctx.program->gfx_level < GFX9)
627 return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
628 instr->opcode != aco_opcode::v_readlane_b32 &&
629 instr->opcode != aco_opcode::v_readlane_b32_e64 &&
630 instr->opcode != aco_opcode::v_writelane_b32 &&
631 instr->opcode != aco_opcode::v_writelane_b32_e64 &&
632 instr->opcode != aco_opcode::v_permlane16_b32 &&
633 instr->opcode != aco_opcode::v_permlanex16_b32 &&
634 instr->opcode != aco_opcode::v_interp_p1_f32 &&
635 instr->opcode != aco_opcode::v_interp_p2_f32 &&
636 instr->opcode != aco_opcode::v_interp_mov_f32 &&
637 instr->opcode != aco_opcode::v_interp_p1ll_f16 &&
638 instr->opcode != aco_opcode::v_interp_p1lv_f16 &&
639 instr->opcode != aco_opcode::v_interp_p2_legacy_f16 &&
640 instr->opcode != aco_opcode::v_interp_p2_f16 &&
641 instr->opcode != aco_opcode::v_interp_p10_f32_inreg &&
642 instr->opcode != aco_opcode::v_interp_p2_f32_inreg &&
643 instr->opcode != aco_opcode::v_interp_p10_f16_f32_inreg &&
644 instr->opcode != aco_opcode::v_interp_p2_f16_f32_inreg &&
645 instr->opcode != aco_opcode::v_interp_p10_rtz_f16_f32_inreg &&
646 instr->opcode != aco_opcode::v_interp_p2_rtz_f16_f32_inreg;
/* True if `op` is a temporary that lives in a VGPR. */
650 is_operand_vgpr(Operand op)
652 return op.isTemp() && op.getTemp().type() == RegType::vgpr;
655 /* only covers special cases */
657 alu_can_accept_constant(const aco_ptr<Instruction>& instr, unsigned operand)
659 /* Fixed operands can't accept constants because we need them
660 * to be in their fixed register.
662 assert(instr->operands.size() > operand);
663 if (instr->operands[operand].isFixed())
666 /* SOPP instructions can't use constants. */
670 switch (instr->opcode) {
671 case aco_opcode::v_mac_f32:
672 case aco_opcode::v_writelane_b32:
673 case aco_opcode::v_writelane_b32_e64:
674 case aco_opcode::v_cndmask_b32: return operand != 2;
675 case aco_opcode::s_addk_i32:
676 case aco_opcode::s_mulk_i32:
677 case aco_opcode::p_extract_vector:
678 case aco_opcode::p_split_vector:
679 case aco_opcode::v_readlane_b32:
680 case aco_opcode::v_readlane_b32_e64:
681 case aco_opcode::v_readfirstlane_b32:
682 case aco_opcode::p_extract:
683 case aco_opcode::p_insert: return operand != 0;
684 case aco_opcode::p_bpermute_readlane:
685 case aco_opcode::p_bpermute_shared_vgpr:
686 case aco_opcode::p_bpermute_permlane:
687 case aco_opcode::p_interp_gfx11:
688 case aco_opcode::p_dual_src_export_gfx11:
689 case aco_opcode::v_interp_p1_f32:
690 case aco_opcode::v_interp_p2_f32:
691 case aco_opcode::v_interp_mov_f32:
692 case aco_opcode::v_interp_p1ll_f16:
693 case aco_opcode::v_interp_p1lv_f16:
694 case aco_opcode::v_interp_p2_legacy_f16:
695 case aco_opcode::v_interp_p10_f32_inreg:
696 case aco_opcode::v_interp_p2_f32_inreg:
697 case aco_opcode::v_interp_p10_f16_f32_inreg:
698 case aco_opcode::v_interp_p2_f16_f32_inreg:
699 case aco_opcode::v_interp_p10_rtz_f16_f32_inreg:
700 case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return false;
701 default: return true;
/* Whether operand `operand` of this VALU instruction may be a VGPR.
 * Lane read/write opcodes restrict the lane/select operands to scalars;
 * the return statements guarded by these conditions are not visible in
 * this line-sampled excerpt. */
706 valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
708 if (instr->opcode == aco_opcode::v_readlane_b32 ||
709 instr->opcode == aco_opcode::v_readlane_b32_e64 ||
710 instr->opcode == aco_opcode::v_writelane_b32 ||
711 instr->opcode == aco_opcode::v_writelane_b32_e64)
713 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
714 instr->opcode == aco_opcode::v_permlanex16_b32)
719 /* check constant bus and literal limitations */
721 check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
723 int limit = ctx.program->gfx_level >= GFX10 ? 2 : 1;
724 Operand literal32(s1);
725 Operand literal64(s2);
726 unsigned num_sgprs = 0;
727 unsigned sgpr[] = {0, 0};
729 for (unsigned i = 0; i < num_operands; i++) {
730 Operand op = operands[i];
732 if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
733 /* two reads of the same SGPR count as 1 to the limit */
734 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
736 sgpr[num_sgprs++] = op.tempId();
741 } else if (op.isLiteral()) {
742 if (ctx.program->gfx_level < GFX10)
745 if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
747 if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
750 /* Any number of 32-bit literals counts as only 1 to the limit. Same
751 * (but separately) for 64-bit literals. */
752 if (op.size() == 1 && literal32.isUndefined()) {
755 } else if (op.size() == 2 && literal64.isUndefined()) {
/* If operand `op_index` is defined by an add/sub of a temp and a constant,
 * split it into *base and *offset so the constant can be folded into the
 * memory instruction. With `prevent_overflow`, only NUW additions qualify.
 * Recurses once through nested additions. */
769 parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
770 bool prevent_overflow)
772 Operand op = instr->operands[op_index];
776 Temp tmp = op.getTemp();
777 if (!ctx.info[tmp.id()].is_add_sub())
780 Instruction* add_instr = ctx.info[tmp.id()].instr;
/* subtraction variants negate the constant; subrev swaps operand roles
 * (the mask/is_sub setup lines are not visible in this excerpt) */
784 switch (add_instr->opcode) {
785 case aco_opcode::v_add_u32:
786 case aco_opcode::v_add_co_u32:
787 case aco_opcode::v_add_co_u32_e64:
788 case aco_opcode::s_add_i32:
789 case aco_opcode::s_add_u32: break;
790 case aco_opcode::v_sub_u32:
791 case aco_opcode::v_sub_i32:
792 case aco_opcode::v_sub_co_u32:
793 case aco_opcode::v_sub_co_u32_e64:
794 case aco_opcode::s_sub_u32:
795 case aco_opcode::s_sub_i32:
799 case aco_opcode::v_subrev_u32:
800 case aco_opcode::v_subrev_co_u32:
801 case aco_opcode::v_subrev_co_u32_e64:
805 default: return false;
807 if (prevent_overflow && !add_instr->definitions[0].isNUW())
810 if (add_instr->usesModifiers())
/* look for the constant among the permitted operand positions */
813 u_foreach_bit (i, mask) {
814 if (add_instr->operands[i].isConstant()) {
815 *offset = add_instr->operands[i].constantValue() * (uint32_t)(is_sub ? -1 : 1);
816 } else if (add_instr->operands[i].isTemp() &&
817 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
818 *offset = ctx.info[add_instr->operands[i].tempId()].val * (uint32_t)(is_sub ? -1 : 1);
822 if (!add_instr->operands[!i].isTemp())
/* try to fold a second, nested add into the same offset */
825 uint32_t offset2 = 0;
826 if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
829 *base = add_instr->operands[!i].getTemp();
/* If the SMEM offset operand is the result of `s_and_b32 x, -4` (dword
 * alignment), strip the AND and use `x` directly — the hardware's address
 * computation makes the alignment redundant here. */
838 skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
/* soe: instruction carries a separate offset-enable operand at the end */
840 bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
841 if (soe && !smem->operands[1].isConstant())
843 /* We don't need to check the constant offset because the address seems to be calculated with
844 * (offset&-4 + const_offset&-4), not (offset+const_offset)&-4.
847 Operand& op = smem->operands[soe ? smem->operands.size() - 1 : 1];
848 if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise())
851 Instruction* bitwise_instr = ctx.info[op.tempId()].instr;
852 if (bitwise_instr->opcode != aco_opcode::s_and_b32)
/* the -4 mask may sit in either operand position */
855 if (bitwise_instr->operands[0].constantEquals(-4) &&
856 bitwise_instr->operands[1].isOfType(op.regClass().type()))
857 op.setTemp(bitwise_instr->operands[1].getTemp());
858 else if (bitwise_instr->operands[1].constantEquals(-4) &&
859 bitwise_instr->operands[0].isOfType(op.regClass().type()))
860 op.setTemp(bitwise_instr->operands[0].getTemp());
/* SMEM peephole: strip dword-align ANDs around the offset, fold constant
 * offsets directly into the instruction, and split base+offset additions
 * into the SOE (scalar-offset-enable) form on GFX9+. */
864 smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
866 /* skip &-4 before offset additions: load((a + 16) & -4, 0) */
867 if (!instr->operands.empty())
868 skip_smem_offset_align(ctx, &instr->smem());
870 /* propagate constants and combine additions */
871 if (!instr->operands.empty() && instr->operands[1].isTemp()) {
872 SMEM_instruction& smem = instr->smem();
873 ssa_info info = ctx.info[instr->operands[1].tempId()];
/* constant offset ranges differ per generation (GFX6/GFX7/GFX8+) */
877 if (info.is_constant_or_literal(32) &&
878 ((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) ||
879 (ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) ||
880 (ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) {
881 instr->operands[1] = Operand::c32(info.val);
882 } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, true) &&
883 base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 &&
885 bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
/* if an SOE operand already exists and is zero, reuse its slot */
887 if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&
888 ctx.info[smem.operands.back().tempId()].val == 0) {
889 smem.operands[1] = Operand::c32(offset);
890 smem.operands.back() = Operand(base);
/* otherwise rebuild the instruction with one extra operand for the base */
893 SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
894 smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
895 new_instr->operands[0] = smem.operands[0];
896 new_instr->operands[1] = Operand::c32(offset);
897 if (smem.definitions.empty())
898 new_instr->operands[2] = smem.operands[2];
899 new_instr->operands.back() = Operand(base);
900 if (!smem.definitions.empty())
901 new_instr->definitions[0] = smem.definitions[0];
902 new_instr->sync = smem.sync;
903 new_instr->glc = smem.glc;
904 new_instr->dlc = smem.dlc;
905 new_instr->nv = smem.nv;
906 new_instr->disable_wqm = smem.disable_wqm;
907 instr.reset(new_instr);
912 /* skip &-4 after offset additions: load(a & -4, 16) */
913 if (!instr->operands.empty())
914 skip_smem_offset_align(ctx, &instr->smem());
/* Build an Operand carrying info.val at the requested bit width (the
 * 64-bit early-out condition preceding line 921 is not visible here). */
918 get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
921 return Operand::c32_or_c64(info.val, true);
922 return Operand::get_const(ctx.program->gfx_level, info.val, bits / 8u);
/* Fold a known-constant operand `i` into a VOP3P instruction. The easy
 * case inlines the constant directly; otherwise both 16-bit halves are
 * examined and opsel/neg bits are rewritten so a single inline constant
 * can represent the packed value. */
926 propagate_constants_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned i)
928 if (!info.is_constant_or_literal(32))
931 assert(instr->operands[i].isTemp());
932 unsigned bits = get_operand_size(instr, i);
933 if (info.is_constant(bits)) {
934 instr->operands[i] = get_constant_op(ctx, info, bits);
938 /* The accumulation operand of dot product instructions ignores opsel. */
939 bool cannot_use_opsel =
940 (instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 ||
941 instr->opcode == aco_opcode::v_dot4_i32_iu8 || instr->opcode == aco_opcode::v_dot4_u32_u8 ||
942 instr->opcode == aco_opcode::v_dot2_u32_u16) &&
944 if (cannot_use_opsel)
947 /* try to fold inline constants */
948 VALU_instruction* vop3p = &instr->valu();
949 bool opsel_lo = vop3p->opsel_lo[i];
950 bool opsel_hi = vop3p->opsel_hi[i];
/* const_opsel[j]: half j was encoded shifted into the upper 16 bits */
953 bool const_opsel[2] = {false, false};
954 for (unsigned j = 0; j < 2; j++) {
955 if ((unsigned)opsel_lo != j && (unsigned)opsel_hi != j)
956 continue; /* this half is unused */
958 uint16_t val = info.val >> (j ? 16 : 0);
959 Operand op = Operand::get_const(ctx.program->gfx_level, val, bits / 8u);
960 if (bits == 32 && op.isLiteral()) /* try sign extension */
961 op = Operand::get_const(ctx.program->gfx_level, val | 0xffff0000, 4);
962 if (bits == 32 && op.isLiteral()) { /* try shifting left */
963 op = Operand::get_const(ctx.program->gfx_level, val << 16, 4);
964 const_opsel[j] = true;
971 Operand const_lo = const_op[0];
972 Operand const_hi = const_op[1];
973 bool const_lo_opsel = const_opsel[0];
974 bool const_hi_opsel = const_opsel[1];
976 if (opsel_lo == opsel_hi) {
977 /* use the single 16bit value */
978 instr->operands[i] = opsel_lo ? const_hi : const_lo;
980 /* opsel must point the same for both halves */
981 opsel_lo = opsel_lo ? const_hi_opsel : const_lo_opsel;
983 } else if (const_lo == const_hi) {
984 /* both constants are the same */
985 instr->operands[i] = const_lo;
987 /* opsel must point the same for both halves */
988 opsel_lo = const_lo_opsel;
989 opsel_hi = const_lo_opsel;
990 } else if (const_lo.constantValue16(const_lo_opsel) ==
991 const_hi.constantValue16(!const_hi_opsel)) {
992 instr->operands[i] = const_hi;
994 /* redirect opsel selection */
995 opsel_lo = opsel_lo ? const_hi_opsel : !const_hi_opsel;
996 opsel_hi = opsel_hi ? const_hi_opsel : !const_hi_opsel;
997 } else if (const_hi.constantValue16(const_hi_opsel) ==
998 const_lo.constantValue16(!const_lo_opsel)) {
999 instr->operands[i] = const_lo;
1001 /* redirect opsel selection */
1002 opsel_lo = opsel_lo ? !const_lo_opsel : const_lo_opsel;
1003 opsel_hi = opsel_hi ? !const_lo_opsel : const_lo_opsel;
1004 } else if (bits == 16 && const_lo.constantValue() == (const_hi.constantValue() ^ (1 << 15))) {
1005 assert(const_lo_opsel == false && const_hi_opsel == false);
1007 /* const_lo == -const_hi */
1008 if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
/* encode the magnitude once; express the sign difference via neg bits */
1011 instr->operands[i] = Operand::c16(const_lo.constantValue() & 0x7FFF);
1012 bool neg_lo = const_lo.constantValue() & (1 << 15);
1013 vop3p->neg_lo[i] ^= opsel_lo ^ neg_lo;
1014 vop3p->neg_hi[i] ^= opsel_hi ^ neg_lo;
1016 /* opsel must point to lo for both operands */
1021 vop3p->opsel_lo[i] = opsel_lo;
1022 vop3p->opsel_hi[i] = opsel_hi;
/* True if `op` is pinned to the exec mask register. */
1026 fixed_to_exec(Operand op)
1028 return op.isFixed() && op.physReg() == exec;
/* Describe what subdword selection `instr` performs on its source, as a
 * SubdwordSel (size, byte offset, sign-extend); a default-constructed
 * SubdwordSel means "no extract pattern recognized". */
1032 parse_extract(Instruction* instr)
1034 if (instr->opcode == aco_opcode::p_extract) {
/* operands: 1 = element index, 2 = element bit-size, 3 = sign-extend flag */
1035 unsigned size = instr->operands[2].constantValue() / 8;
1036 unsigned offset = instr->operands[1].constantValue() * size;
1037 bool sext = instr->operands[3].constantEquals(1);
1038 return SubdwordSel(size, offset, sext);
/* an insert at offset 0 behaves like a zero-extending extract */
1039 } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
1040 return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1041 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1042 unsigned size = instr->definitions[0].bytes();
1043 unsigned offset = instr->operands[1].constantValue() * size;
1045 return SubdwordSel(size, offset, false);
/* splitting a dword in half: the second def is the upper word */
1046 } else if (instr->opcode == aco_opcode::p_split_vector) {
1047 assert(instr->operands[0].bytes() == 4 && instr->definitions[1].bytes() == 2);
1048 return SubdwordSel(2, 2, false);
1051 return SubdwordSel();
/* Describe what subdword insertion `instr` performs into its destination;
 * a zero-offset non-sign-extending p_extract counts as an insert too.
 * Default SubdwordSel means "not an insert pattern". */
1055 parse_insert(Instruction* instr)
1057 if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
1058 instr->operands[1].constantEquals(0)) {
1059 return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1060 } else if (instr->opcode == aco_opcode::p_insert) {
1061 unsigned size = instr->operands[2].constantValue() / 8;
1062 unsigned offset = instr->operands[1].constantValue() * size;
1063 return SubdwordSel(size, offset, false);
1065 return SubdwordSel();
/* Whether the extract described by `info` (defining operand `idx`) can be
 * folded directly into `instr`. Mirrors the cases handled by
 * apply_extract(); the `return true` statements following each matched
 * case are not visible in this line-sampled excerpt. */
1070 can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
1072 Temp tmp = info.instr->operands[0].getTemp();
1073 SubdwordSel sel = parse_extract(info.instr);
1077 } else if (sel.size() == 4) {
/* int->float converts have dedicated per-byte opcodes (cvt_f32_ubyteN) */
1079 } else if ((instr->opcode == aco_opcode::v_cvt_f32_u32 ||
1080 instr->opcode == aco_opcode::v_cvt_f32_i32) &&
1081 sel.size() == 1 && !sel.sign_extend()) {
/* shifts that discard the upper bits make the extract redundant */
1083 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1084 sel.offset() == 0 &&
1085 ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1086 (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
1088 } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1089 !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1090 (instr->operands[!idx].is16bit() ||
1091 instr->operands[!idx].constantValue() <= UINT16_MAX)) {
/* otherwise fall back to SDWA byte/word selection if available */
1093 } else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1094 (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
1095 if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
/* ...or opsel for 16-bit word selection on VALU */
1098 } else if (instr->isVALU() && sel.size() == 2 && !instr->valu().opsel[idx] &&
1099 can_use_opsel(ctx.program->gfx_level, instr->opcode, idx)) {
/* extract-of-extract: allowed when ranges nest and sign-extension survives */
1101 } else if (instr->opcode == aco_opcode::p_extract) {
1102 SubdwordSel instrSel = parse_extract(instr.get());
1104 /* the outer offset must be within extracted range */
1105 if (instrSel.offset() >= sel.size())
1108 /* don't remove the sign-extension when increasing the size further */
1109 if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
1118 /* Combine an p_extract (or p_insert, in some cases) instruction with instr.
1119 * instr(p_extract(...)) -> instr()
/* Rewrites `instr` in place (opcode change, SDWA conversion, opsel bits,
 * or merged extract parameters) so the separate extract can go away.
 * Caller must have checked can_apply_extract() first. */
1122 apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
1124 Temp tmp = info.instr->operands[0].getTemp();
1125 SubdwordSel sel = parse_extract(info.instr);
/* the operand no longer carries the narrower-width guarantees */
1128 instr->operands[idx].set16bit(false);
1129 instr->operands[idx].set24bit(false);
1131 ctx.info[tmp.id()].label &= ~label_insert;
1133 if (sel.size() == 4) {
1134 /* full dword selection */
1135 } else if ((instr->opcode == aco_opcode::v_cvt_f32_u32 ||
1136 instr->opcode == aco_opcode::v_cvt_f32_i32) &&
1137 sel.size() == 1 && !sel.sign_extend()) {
/* pick the cvt_f32_ubyteN variant matching the extracted byte */
1138 switch (sel.offset()) {
1139 case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
1140 case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
1141 case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
1142 case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
1144 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1145 sel.offset() == 0 &&
1146 ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1147 (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
1148 /* The undesirable upper bits are already shifted out. */
/* mul_u32_u24 of a 16-bit extract becomes mad_u32_u16 with opsel */
1150 } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1151 !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1152 (instr->operands[!idx].is16bit() ||
1153 instr->operands[!idx].constantValue() <= UINT16_MAX)) {
1155 create_instruction<VALU_instruction>(aco_opcode::v_mad_u32_u16, Format::VOP3, 3, 1);
1156 mad->definitions[0] = instr->definitions[0];
1157 mad->operands[0] = instr->operands[0];
1158 mad->operands[1] = instr->operands[1];
1159 mad->operands[2] = Operand::zero();
1160 mad->valu().opsel[idx] = sel.offset();
1161 mad->pass_flags = instr->pass_flags;
1163 } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1164 (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
1165 convert_to_SDWA(ctx.program->gfx_level, instr);
1166 instr->sdwa().sel[idx] = sel;
1167 } else if (instr->isVALU()) {
1169 instr->valu().opsel[idx] = true;
1171 /* VOP12C cannot use opsel with SGPRs. */
1172 if (!instr->isVOP3() && !instr->isVINTERP_INREG() &&
1173 !info.instr->operands[0].isOfType(RegType::vgpr))
1174 instr->format = asVOP3(instr->format);
/* extract-of-extract: merge both selections into one p_extract */
1176 } else if (instr->opcode == aco_opcode::p_extract) {
1177 SubdwordSel instrSel = parse_extract(instr.get());
1179 unsigned size = std::min(sel.size(), instrSel.size());
1180 unsigned offset = sel.offset() + instrSel.offset();
1181 unsigned sign_extend =
1182 instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
1184 instr->operands[1] = Operand::c32(offset / size);
1185 instr->operands[2] = Operand::c32(size * 8u);
1186 instr->operands[3] = Operand::c32(sign_extend);
1190 /* These are the only labels worth keeping at the moment. */
1191 for (Definition& def : instr->definitions) {
1192 ctx.info[def.tempId()].label &=
1193 (label_mul | label_minmax | label_usedef | label_vopc | label_f2f32 | instr_mod_labels);
/* instr may have been rebuilt above, so refresh the payload pointer */
1194 if (ctx.info[def.tempId()].label & instr_usedef_labels)
1195 ctx.info[def.tempId()].instr = instr.get();
/* Re-validate the extract labels on this instruction's operands: if an
 * operand's pending p_extract can no longer be folded into instr, drop the
 * label so later passes don't attempt the (now invalid) fold. Only extracts
 * whose source is a VGPR, or whose result is an SGPR, are checked here. */
1200 check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1202 for (unsigned i = 0; i < instr->operands.size(); i++) {
1203 Operand op = instr->operands[i];
1206 ssa_info& info = ctx.info[op.tempId()];
1207 if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
1208 op.getTemp().type() == RegType::sgpr)) {
1209 if (!can_apply_extract(ctx, instr, i, info))
1210 info.label &= ~label_extract;
/* Returns whether the given FP opcode flushes denormal inputs (under the
 * current denorm mode), i.e. whether a preceding canonicalization of its
 * source would be redundant. */
1216 does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
/* min/max/med3 denorm handling changed after GFX8: only GFX9+ flushes here. */
1219 case aco_opcode::v_min_f32:
1220 case aco_opcode::v_max_f32:
1221 case aco_opcode::v_med3_f32:
1222 case aco_opcode::v_min3_f32:
1223 case aco_opcode::v_max3_f32:
1224 case aco_opcode::v_min_f16:
1225 case aco_opcode::v_max_f16: return ctx.program->gfx_level > GFX8;
/* Pure moves/selects pass the bits through untouched — never flush. */
1226 case aco_opcode::v_cndmask_b32:
1227 case aco_opcode::v_cndmask_b16:
1228 case aco_opcode::v_mov_b32:
1229 case aco_opcode::v_mov_b16: return false;
/* All other FP arithmetic ops are assumed to flush denormals. */
1230 default: return true;
/* Returns whether a canonicalization (e.g. a multiply by 1.0) of tmp can be
 * dropped when tmp is used as operand idx of instr: either the value is
 * already canonical / denormals are preserved anyway, or instr accepts input
 * modifiers at idx and itself flushes denormals. */
1235 can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp, unsigned idx)
1237 float_mode* fp = &ctx.fp_mode;
/* Denorm mode is picked by operand width: 32-bit uses denorm32, otherwise
 * the shared 16/64-bit mode. */
1238 if (ctx.info[tmp.id()].is_canonicalized() ||
1239 (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
1242 aco_opcode op = instr->opcode;
1243 return can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, idx) &&
1244 does_fp_op_flush_denorms(ctx, op);
/* Returns whether an 's_and tmp, exec' is redundant because tmp is already
 * implicitly masked by the same exec. pass_flags identifies the exec value;
 * producer and consumer must carry matching flags. */
1248 can_eliminate_and_exec(opt_ctx& ctx, Temp tmp, unsigned pass_flags)
1250 if (ctx.info[tmp.id()].is_vopc()) {
1251 Instruction* vopc_instr = ctx.info[tmp.id()].instr;
1252 /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
1253 * already produces the same result */
1254 return vopc_instr->pass_flags == pass_flags;
1256 if (ctx.info[tmp.id()].is_bitwise()) {
1257 Instruction* instr = ctx.info[tmp.id()].instr;
1258 if (instr->operands.size() != 2 || instr->pass_flags != pass_flags)
1260 if (!(instr->operands[0].isTemp() && instr->operands[1].isTemp()))
/* For AND it suffices that one side is exec-masked (the mask propagates
 * through the conjunction); other bitwise ops need both sides masked. */
1262 if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
1263 return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) ||
1264 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
1266 return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) &&
1267 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
/* Returns whether the operand's ssa_info can be treated as a plain copy:
 * either a direct temp copy, or an fcanonicalize whose canonicalization is
 * removable in the context of instr's operand idx. */
1274 is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned idx)
1276 return info.is_temp() ||
1277 (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp, idx));
/* Returns whether op is known to be canonical (no denormal flushing would
 * change it): it is labelled canonicalized, denormals are kept by the FP
 * mode, or it is a known constant that is ±0 or has a nonzero exponent
 * field (i.e. is not a denormal). */
1281 is_op_canonicalized(opt_ctx& ctx, Operand op)
1283 float_mode* fp = &ctx.fp_mode;
1284 if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
1285 (op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
1288 if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
1289 uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
/* Mask off the sign bit; zero means ±0.0. A magnitude above the mantissa
 * mask (0x3ff for fp16, 0x7fffff for fp32) means the exponent is nonzero,
 * so the value is not a denormal. */
1290 if (op.bytes() == 2)
1291 return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
1292 else if (op.bytes() == 4)
1293 return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
/* Returns whether the combined immediate offset (offset0 + offset1) is legal
 * for a scratch access on this device. instr may be NULL when there is no
 * VGPR address operand to consider. */
1299 is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1)
/* GFX10 hardware bug: negative, non-dword-aligned scratch offsets are broken
 * when a VGPR offset is used. */
1301 bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10;
1302 int32_t min = ctx.program->dev.scratch_global_offset_min;
1303 int32_t max = ctx.program->dev.scratch_global_offset_max;
1305 int64_t offset = offset0 + offset1;
/* operands[0] is the VGPR address; undefined means no VGPR offset is used.
 * NOTE(review): operand layout assumed from this check — confirm against the
 * scratch instruction definition. */
1307 bool has_vgpr_offset = instr && !instr->operands[0].isUndefined();
1308 if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4)
1311 return offset >= min && offset <= max;
/* Detects a clamp-to-[0,1] expressed as v_med3_f16/f32(0, 1.0, x).
 * On success, *clamped_idx receives the index of the non-constant operand.
 * Modifiers that would change the result (omod/opsel, or neg on the
 * constants) disqualify the match. */
1315 detect_clamp(Instruction* instr, unsigned* clamped_idx)
1317 VALU_instruction& valu = instr->valu();
1318 if (valu.omod != 0 || valu.opsel != 0)
1322 bool found_zero = false, found_one = false;
1323 bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
/* Scan the three med3 operands for an unmodified 0.0 and an unmodified 1.0. */
1324 for (unsigned i = 0; i < 3; i++) {
1325 if (!valu.neg[i] && instr->operands[i].constantEquals(0))
1327 else if (!valu.neg[i] &&
1328 instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
/* Both constants present and the remaining operand is a temp -> clamp. */
1333 if (found_zero && found_one && instr->operands[idx].isTemp()) {
1342 label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1344 if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
1345 ASSERTED bool all_const = false;
1346 for (Operand& op : instr->operands)
1348 all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
1349 perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
1351 ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
1352 instr->opcode == aco_opcode::s_mov_b64 ||
1353 instr->opcode == aco_opcode::v_mov_b32;
1354 perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
1358 if (instr->isSMEM())
1359 smem_combine(ctx, instr);
1361 for (unsigned i = 0; i < instr->operands.size(); i++) {
1362 if (!instr->operands[i].isTemp())
1365 ssa_info info = ctx.info[instr->operands[i].tempId()];
1366 /* propagate undef */
1367 if (info.is_undefined() && is_phi(instr))
1368 instr->operands[i] = Operand(instr->operands[i].regClass());
1369 /* propagate reg->reg of same type */
1370 while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
1371 instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
1372 info = ctx.info[info.temp.id()];
1375 /* PSEUDO: propagate temporaries */
1376 if (instr->isPseudo()) {
1377 while (info.is_temp()) {
1378 pseudo_propagate_temp(ctx, instr, info.temp, i);
1379 info = ctx.info[info.temp.id()];
1383 /* SALU / PSEUDO: propagate inline constants */
1384 if (instr->isSALU() || instr->isPseudo()) {
1385 unsigned bits = get_operand_size(instr, i);
1386 if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
1387 alu_can_accept_constant(instr, i)) {
1388 instr->operands[i] = get_constant_op(ctx, info, bits);
1393 /* VALU: propagate neg, abs & inline constants */
1394 else if (instr->isVALU()) {
1395 if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::vgpr &&
1396 valu_can_accept_vgpr(instr, i)) {
1397 instr->operands[i].setTemp(info.temp);
1398 info = ctx.info[info.temp.id()];
1400 /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
1401 if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
1402 instr->operands.size() == 1) {
1403 instr->format = withoutDPP(instr->format);
1404 instr->operands[i].setTemp(info.temp);
1405 info = ctx.info[info.temp.id()];
1408 /* for instructions other than v_cndmask_b32, the size of the instruction should match the
1410 unsigned can_use_mod =
1411 instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
1413 can_use_mod && can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i);
1415 if (instr->isSDWA())
1416 can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4;
1418 can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr));
1420 unsigned bits = get_operand_size(instr, i);
1421 bool mod_bitsize_compat = instr->operands[i].bytes() * 8 == bits;
1423 if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32 && mod_bitsize_compat) {
1424 instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
1425 instr->operands[i].setTemp(info.temp);
1426 } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16 && mod_bitsize_compat) {
1427 instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
1428 instr->operands[i].setTemp(info.temp);
1429 } else if (info.is_neg() && can_use_mod && mod_bitsize_compat &&
1430 can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1431 if (!instr->isDPP() && !instr->isSDWA())
1432 instr->format = asVOP3(instr->format);
1433 instr->operands[i].setTemp(info.temp);
1434 if (!instr->valu().abs[i])
1435 instr->valu().neg[i] = true;
1437 if (info.is_abs() && can_use_mod && mod_bitsize_compat &&
1438 can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1439 if (!instr->isDPP() && !instr->isSDWA())
1440 instr->format = asVOP3(instr->format);
1441 instr->operands[i] = Operand(info.temp);
1442 instr->valu().abs[i] = true;
1446 if (instr->isVOP3P()) {
1447 propagate_constants_vop3p(ctx, instr, info, i);
1451 if (info.is_constant(bits) && alu_can_accept_constant(instr, i) &&
1452 (!instr->isSDWA() || ctx.program->gfx_level >= GFX9) && (!instr->isDPP() || i != 1)) {
1453 Operand op = get_constant_op(ctx, info, bits);
1454 perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
1455 "v_cndmask_b32 with a constant selector", instr.get());
1456 if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 ||
1457 instr->opcode == aco_opcode::v_writelane_b32) {
1458 instr->format = withoutDPP(instr->format);
1459 instr->operands[i] = op;
1461 } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
1462 instr->operands[i] = op;
1463 instr->valu().swapOperands(0, i);
1465 } else if (can_use_VOP3(ctx, instr)) {
1466 instr->format = asVOP3(instr->format);
1467 instr->operands[i] = op;
1473 /* MUBUF: propagate constants and combine additions */
1474 else if (instr->isMUBUF()) {
1475 MUBUF_instruction& mubuf = instr->mubuf();
1478 while (info.is_temp())
1479 info = ctx.info[info.temp.id()];
1481 /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
1482 * overflow for scratch accesses works only on GFX9+ and saddr overflow
1483 * never works. Since swizzling is the only thing that separates
1484 * scratch accesses and other accesses and swizzling changing how
1485 * addressing works significantly, this probably applies to swizzled
1486 * MUBUF accesses. */
1487 bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
1489 if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
1490 info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1491 info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1492 mubuf.offset + info.instr->operands[1].constantValue() < 4096) {
1493 instr->operands[1] = info.instr->operands[0];
1494 mubuf.offset += info.instr->operands[1].constantValue();
1495 mubuf.offen = false;
1497 } else if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
1498 mubuf.offset + info.val < 4096) {
1499 assert(!mubuf.idxen);
1500 instr->operands[1] = Operand(v1);
1501 mubuf.offset += info.val;
1502 mubuf.offen = false;
1504 } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
1505 instr->operands[2] = Operand::c32(0);
1506 mubuf.offset += info.val;
1508 } else if (mubuf.offen && i == 1 &&
1509 parse_base_offset(ctx, instr.get(), i, &base, &offset,
1510 vaddr_prevent_overflow) &&
1511 base.regClass() == v1 && mubuf.offset + offset < 4096) {
1512 assert(!mubuf.idxen);
1513 instr->operands[1].setTemp(base);
1514 mubuf.offset += offset;
1516 } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1517 base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
1518 instr->operands[i].setTemp(base);
1519 mubuf.offset += offset;
1524 else if (instr->isMTBUF()) {
1525 MTBUF_instruction& mtbuf = instr->mtbuf();
1526 while (info.is_temp())
1527 info = ctx.info[info.temp.id()];
1529 if (mtbuf.offen && mtbuf.idxen && i == 1 && info.is_vec() &&
1530 info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1531 info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1532 mtbuf.offset + info.instr->operands[1].constantValue() < 4096) {
1533 instr->operands[1] = info.instr->operands[0];
1534 mtbuf.offset += info.instr->operands[1].constantValue();
1535 mtbuf.offen = false;
1540 /* SCRATCH: propagate constants and combine additions */
1541 else if (instr->isScratch()) {
1542 FLAT_instruction& scratch = instr->scratch();
1545 while (info.is_temp())
1546 info = ctx.info[info.temp.id()];
1548 /* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means
1549 * we can't combine the addition if the unsigned addition overflows and offset is
1550 * positive. In theory, there are also issues if
1551 * 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just
1552 * replaces an already out-of-bounds access with a larger one since 'saddr + offset'
1553 * would be larger than INT32_MAX.
1555 if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1556 base.regClass() == instr->operands[i].regClass() &&
1557 is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1558 instr->operands[i].setTemp(base);
1559 scratch.offset += (int32_t)offset;
1561 } else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1562 base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 &&
1563 is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1564 instr->operands[i].setTemp(base);
1565 scratch.offset += (int32_t)offset;
1567 } else if (i <= 1 && info.is_constant_or_literal(32) &&
1568 ctx.program->gfx_level >= GFX10_3 &&
1569 is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) {
1570 /* GFX10.3+ can disable both SADDR and ADDR. */
1571 instr->operands[i] = Operand(instr->operands[i].regClass());
1572 scratch.offset += (int32_t)info.val;
1577 /* DS: combine additions */
1578 else if (instr->isDS()) {
1580 DS_instruction& ds = instr->ds();
1583 bool has_usable_ds_offset = ctx.program->gfx_level >= GFX7;
1584 if (has_usable_ds_offset && i == 0 &&
1585 parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1586 base.regClass() == instr->operands[i].regClass() &&
1587 instr->opcode != aco_opcode::ds_swizzle_b32) {
1588 if (instr->opcode == aco_opcode::ds_write2_b32 ||
1589 instr->opcode == aco_opcode::ds_read2_b32 ||
1590 instr->opcode == aco_opcode::ds_write2_b64 ||
1591 instr->opcode == aco_opcode::ds_read2_b64 ||
1592 instr->opcode == aco_opcode::ds_write2st64_b32 ||
1593 instr->opcode == aco_opcode::ds_read2st64_b32 ||
1594 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1595 instr->opcode == aco_opcode::ds_read2st64_b64) {
1596 bool is64bit = instr->opcode == aco_opcode::ds_write2_b64 ||
1597 instr->opcode == aco_opcode::ds_read2_b64 ||
1598 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1599 instr->opcode == aco_opcode::ds_read2st64_b64;
1600 bool st64 = instr->opcode == aco_opcode::ds_write2st64_b32 ||
1601 instr->opcode == aco_opcode::ds_read2st64_b32 ||
1602 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1603 instr->opcode == aco_opcode::ds_read2st64_b64;
1604 unsigned shifts = (is64bit ? 3 : 2) + (st64 ? 6 : 0);
1605 unsigned mask = BITFIELD_MASK(shifts);
1607 if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
1608 ds.offset1 + (offset >> shifts) <= 255) {
1609 instr->operands[i].setTemp(base);
1610 ds.offset0 += offset >> shifts;
1611 ds.offset1 += offset >> shifts;
1614 if (ds.offset0 + offset <= 65535) {
1615 instr->operands[i].setTemp(base);
1616 ds.offset0 += offset;
1622 else if (instr->isBranch()) {
1623 if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1624 /* Flip the branch instruction to get rid of the scc_invert instruction */
1625 instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
1626 : aco_opcode::p_cbranch_z;
1627 instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
1632 /* if this instruction doesn't define anything, return */
1633 if (instr->definitions.empty()) {
1634 check_sdwa_extract(ctx, instr);
1638 if (instr->isVALU() || instr->isVINTRP()) {
1639 if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
1640 instr->opcode == aco_opcode::v_cndmask_b32) {
1641 bool canonicalized = true;
1642 if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
1643 unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
1644 for (unsigned i = 0; canonicalized && (i < ops); i++)
1645 canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
1648 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1651 if (instr->isVOPC()) {
1652 ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
1653 check_sdwa_extract(ctx, instr);
1656 if (instr->isVOP3P()) {
1657 ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
1662 switch (instr->opcode) {
1663 case aco_opcode::p_create_vector: {
1664 bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
1665 instr->operands[0].regClass() == instr->definitions[0].regClass();
1667 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1671 /* expand vector operands */
1672 std::vector<Operand> ops;
1673 unsigned offset = 0;
1674 for (const Operand& op : instr->operands) {
1675 /* ensure that any expanded operands are properly aligned */
1676 bool aligned = offset % 4 == 0 || op.bytes() < 4;
1677 offset += op.bytes();
1678 if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
1679 Instruction* vec = ctx.info[op.tempId()].instr;
1680 for (const Operand& vec_op : vec->operands)
1681 ops.emplace_back(vec_op);
1683 ops.emplace_back(op);
1687 /* combine expanded operands to new vector */
1688 if (ops.size() != instr->operands.size()) {
1689 assert(ops.size() > instr->operands.size());
1690 Definition def = instr->definitions[0];
1691 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
1692 Format::PSEUDO, ops.size(), 1));
1693 for (unsigned i = 0; i < ops.size(); i++) {
1694 if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
1695 ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
1696 ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
1697 instr->operands[i] = ops[i];
1699 instr->definitions[0] = def;
1701 for (unsigned i = 0; i < ops.size(); i++) {
1702 assert(instr->operands[i] == ops[i]);
1705 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1707 if (instr->operands.size() == 2) {
1708 /* check if this is created from split_vector */
1709 if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_split()) {
1710 Instruction* split = ctx.info[instr->operands[1].tempId()].instr;
1711 if (instr->operands[0].isTemp() &&
1712 instr->operands[0].getTemp() == split->definitions[0].getTemp())
1713 ctx.info[instr->definitions[0].tempId()].set_temp(split->operands[0].getTemp());
1718 case aco_opcode::p_split_vector: {
1719 ssa_info& info = ctx.info[instr->operands[0].tempId()];
1721 if (info.is_constant_or_literal(32)) {
1722 uint64_t val = info.val;
1723 for (Definition def : instr->definitions) {
1724 uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
1725 ctx.info[def.tempId()].set_constant(ctx.program->gfx_level, val & mask);
1726 val >>= def.bytes() * 8u;
1729 } else if (!info.is_vec()) {
1730 if (instr->definitions.size() == 2 && instr->operands[0].isTemp() &&
1731 instr->definitions[0].bytes() == instr->definitions[1].bytes()) {
1732 ctx.info[instr->definitions[1].tempId()].set_split(instr.get());
1733 if (instr->operands[0].bytes() == 4) {
1734 /* D16 subdword split */
1735 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1736 ctx.info[instr->definitions[1].tempId()].set_extract(instr.get());
1742 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1743 unsigned split_offset = 0;
1744 unsigned vec_offset = 0;
1745 unsigned vec_index = 0;
1746 for (unsigned i = 0; i < instr->definitions.size();
1747 split_offset += instr->definitions[i++].bytes()) {
1748 while (vec_offset < split_offset && vec_index < vec->operands.size())
1749 vec_offset += vec->operands[vec_index++].bytes();
1751 if (vec_offset != split_offset ||
1752 vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
1755 Operand vec_op = vec->operands[vec_index];
1756 if (vec_op.isConstant()) {
1757 ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->gfx_level,
1758 vec_op.constantValue64());
1759 } else if (vec_op.isUndefined()) {
1760 ctx.info[instr->definitions[i].tempId()].set_undefined();
1762 assert(vec_op.isTemp());
1763 ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1768 case aco_opcode::p_extract_vector: { /* mov */
1769 ssa_info& info = ctx.info[instr->operands[0].tempId()];
1770 const unsigned index = instr->operands[1].constantValue();
1771 const unsigned dst_offset = index * instr->definitions[0].bytes();
1773 if (info.is_vec()) {
1774 /* check if we index directly into a vector element */
1775 Instruction* vec = info.instr;
1776 unsigned offset = 0;
1778 for (const Operand& op : vec->operands) {
1779 if (offset < dst_offset) {
1780 offset += op.bytes();
1782 } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1785 instr->operands[0] = op;
1788 } else if (info.is_constant_or_literal(32)) {
1789 /* propagate constants */
1790 uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
1791 uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
1792 instr->operands[0] =
1793 Operand::get_const(ctx.program->gfx_level, val, instr->definitions[0].bytes());
1797 if (instr->operands[0].bytes() != instr->definitions[0].bytes()) {
1798 if (instr->operands[0].size() != 1)
1802 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1804 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
1808 /* convert this extract into a copy instruction */
1809 instr->opcode = aco_opcode::p_parallelcopy;
1810 instr->operands.pop_back();
1813 case aco_opcode::p_parallelcopy: /* propagate */
1814 if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
1815 instr->operands[0].regClass() != instr->definitions[0].regClass()) {
1816 /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
1817 * duplicate the vector instead.
1819 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1820 aco_ptr<Instruction> old_copy = std::move(instr);
1822 instr.reset(create_instruction<Pseudo_instruction>(
1823 aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
1824 instr->definitions[0] = old_copy->definitions[0];
1825 std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
1826 for (unsigned i = 0; i < vec->operands.size(); i++) {
1827 Operand& op = instr->operands[i];
1828 if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
1829 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
1830 op.setTemp(ctx.info[op.tempId()].temp);
1832 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1836 case aco_opcode::p_as_uniform:
1837 if (instr->definitions[0].isFixed()) {
1838 /* don't copy-propagate copies into fixed registers */
1839 } else if (instr->operands[0].isConstant()) {
1840 ctx.info[instr->definitions[0].tempId()].set_constant(
1841 ctx.program->gfx_level, instr->operands[0].constantValue64());
1842 } else if (instr->operands[0].isTemp()) {
1843 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1844 if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
1845 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1847 assert(instr->operands[0].isFixed());
1850 case aco_opcode::v_mov_b32:
1851 if (instr->isDPP16()) {
1852 /* anything else doesn't make sense in SSA */
1853 assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf);
1854 ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get());
1855 } else if (instr->isDPP8()) {
1856 ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get());
1859 case aco_opcode::p_is_helper:
1860 if (!ctx.program->needs_wqm)
1861 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1863 case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break;
1864 case aco_opcode::v_mul_f16:
1865 case aco_opcode::v_mul_f32:
1866 case aco_opcode::v_mul_legacy_f32: { /* omod */
1867 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
1869 /* TODO: try to move the negate/abs modifier to the consumer instead */
1870 bool uses_mods = instr->usesModifiers();
1871 bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1873 for (unsigned i = 0; i < 2; i++) {
1874 if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1875 if (!instr->isDPP() && !instr->isSDWA() && !instr->valu().opsel &&
1876 (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */
1877 instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
1878 bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
1880 VALU_instruction* vop3 = instr->isVOP3() ? &instr->valu() : NULL;
1881 if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
1884 bool abs = vop3 && vop3->abs[i];
1885 bool neg = neg1 ^ (vop3 && vop3->neg[i]);
1887 Temp other = instr->operands[i].getTemp();
1888 if (abs && neg && other.type() == RegType::vgpr)
1889 ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
1890 else if (abs && !neg && other.type() == RegType::vgpr)
1891 ctx.info[instr->definitions[0].tempId()].set_abs(other);
1892 else if (!abs && neg && other.type() == RegType::vgpr)
1893 ctx.info[instr->definitions[0].tempId()].set_neg(other);
1894 else if (!abs && !neg)
1895 ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
1896 } else if (uses_mods || ((fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
1897 : ctx.fp_mode.preserve_signed_zero_inf_nan32) &&
1898 instr->opcode != aco_opcode::v_mul_legacy_f32)) {
1899 continue; /* omod uses a legacy multiplication. */
1900 } else if (instr->operands[!i].constantValue() == 0u) { /* 0.0 */
1901 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1902 } else if ((fp16 ? ctx.fp_mode.denorm16_64 : ctx.fp_mode.denorm32) != fp_denorm_flush) {
1903 /* omod has no effect if denormals are enabled. */
1905 } else if (instr->operands[!i].constantValue() ==
1906 (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1907 ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
1908 } else if (instr->operands[!i].constantValue() ==
1909 (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1910 ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
1911 } else if (instr->operands[!i].constantValue() ==
1912 (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
1913 ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
1922 case aco_opcode::v_mul_lo_u16:
1923 case aco_opcode::v_mul_lo_u16_e64:
1924 case aco_opcode::v_mul_u32_u24:
1925 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
1927 case aco_opcode::v_med3_f16:
1928 case aco_opcode::v_med3_f32: { /* clamp */
1930 if (detect_clamp(instr.get(), &idx) && !instr->valu().abs && !instr->valu().neg)
1931 ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
1934 case aco_opcode::v_cndmask_b32:
1935 if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
1936 ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1937 else if (instr->operands[0].constantEquals(0) &&
1938 instr->operands[1].constantEquals(0x3f800000u))
1939 ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1940 else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
1941 ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1944 case aco_opcode::v_cmp_lg_u32:
1945 if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1946 instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
1947 ctx.info[instr->operands[1].tempId()].is_vcc())
1948 ctx.info[instr->definitions[0].tempId()].set_temp(
1949 ctx.info[instr->operands[1].tempId()].temp);
1951 case aco_opcode::p_linear_phi: {
1952 /* lower_bool_phis() can create phis like this */
1953 bool all_same_temp = instr->operands[0].isTemp();
1954 /* this check is needed when moving uniform loop counters out of a divergent loop */
1956 all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1957 for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1958 if (!instr->operands[i].isTemp() ||
1959 instr->operands[i].tempId() != instr->operands[0].tempId())
1960 all_same_temp = false;
1962 if (all_same_temp) {
1963 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1965 bool all_undef = instr->operands[0].isUndefined();
1966 for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1967 if (!instr->operands[i].isUndefined())
1971 ctx.info[instr->definitions[0].tempId()].set_undefined();
1975 case aco_opcode::v_add_u32:
1976 case aco_opcode::v_add_co_u32:
1977 case aco_opcode::v_add_co_u32_e64:
1978 case aco_opcode::s_add_i32:
1979 case aco_opcode::s_add_u32:
1980 case aco_opcode::v_subbrev_co_u32:
1981 case aco_opcode::v_sub_u32:
1982 case aco_opcode::v_sub_i32:
1983 case aco_opcode::v_sub_co_u32:
1984 case aco_opcode::v_sub_co_u32_e64:
1985 case aco_opcode::s_sub_u32:
1986 case aco_opcode::s_sub_i32:
1987 case aco_opcode::v_subrev_u32:
1988 case aco_opcode::v_subrev_co_u32:
1989 case aco_opcode::v_subrev_co_u32_e64:
1990 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1992 case aco_opcode::s_not_b32:
1993 case aco_opcode::s_not_b64:
1994 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1995 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1996 ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1997 ctx.info[instr->operands[0].tempId()].temp);
1998 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1999 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
2000 ctx.info[instr->definitions[1].tempId()].set_scc_invert(
2001 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2003 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2005 case aco_opcode::s_and_b32:
2006 case aco_opcode::s_and_b64:
2007 if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
2008 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
2009 /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
2010 * uniform bool into divergent */
2011 ctx.info[instr->definitions[1].tempId()].set_temp(
2012 ctx.info[instr->operands[0].tempId()].temp);
2013 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
2014 ctx.info[instr->operands[0].tempId()].temp);
2016 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
2017 /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
2018 * already produces the same SCC */
2019 ctx.info[instr->definitions[1].tempId()].set_temp(
2020 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2021 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
2022 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2024 } else if ((ctx.program->stage.num_sw_stages() > 1 ||
2025 ctx.program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) &&
2026 instr->pass_flags == 1) {
2027 /* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so
2028 * s_and is unnecessary. */
2029 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
2034 case aco_opcode::s_or_b32:
2035 case aco_opcode::s_or_b64:
2036 case aco_opcode::s_xor_b32:
2037 case aco_opcode::s_xor_b64:
2038 if (std::all_of(instr->operands.begin(), instr->operands.end(),
2039 [&ctx](const Operand& op)
2041 return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
2042 ctx.info[op.tempId()].is_uniform_bitwise());
2044 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
2046 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2048 case aco_opcode::s_lshl_b32:
2049 case aco_opcode::v_or_b32:
2050 case aco_opcode::v_lshlrev_b32:
2051 case aco_opcode::v_bcnt_u32_b32:
2052 case aco_opcode::v_and_b32:
2053 case aco_opcode::v_xor_b32:
2054 case aco_opcode::v_not_b32:
2055 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2057 case aco_opcode::v_min_f32:
2058 case aco_opcode::v_min_f16:
2059 case aco_opcode::v_min_u32:
2060 case aco_opcode::v_min_i32:
2061 case aco_opcode::v_min_u16:
2062 case aco_opcode::v_min_i16:
2063 case aco_opcode::v_min_u16_e64:
2064 case aco_opcode::v_min_i16_e64:
2065 case aco_opcode::v_max_f32:
2066 case aco_opcode::v_max_f16:
2067 case aco_opcode::v_max_u32:
2068 case aco_opcode::v_max_i32:
2069 case aco_opcode::v_max_u16:
2070 case aco_opcode::v_max_i16:
2071 case aco_opcode::v_max_u16_e64:
2072 case aco_opcode::v_max_i16_e64:
2073 ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
2075 case aco_opcode::s_cselect_b64:
2076 case aco_opcode::s_cselect_b32:
2077 if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
2078 /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
2079 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
2081 if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
2082 /* Flip the operands to get rid of the scc_invert instruction */
2083 std::swap(instr->operands[0], instr->operands[1]);
2084 instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
2087 case aco_opcode::s_mul_i32:
2088 /* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
2089 * This pattern is created from a uniform nir_op_b2f. */
2090 if (instr->operands[0].constantEquals(0x3f800000u))
2091 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
2093 case aco_opcode::p_extract: {
2094 if (instr->definitions[0].bytes() == 4) {
2095 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2096 if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()))
2097 ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2101 case aco_opcode::p_insert: {
2102 if (instr->operands[0].bytes() == 4) {
2103 if (instr->operands[0].regClass() == v1)
2104 ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2105 if (parse_extract(instr.get()))
2106 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2107 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2111 case aco_opcode::ds_read_u8:
2112 case aco_opcode::ds_read_u8_d16:
2113 case aco_opcode::ds_read_u16:
2114 case aco_opcode::ds_read_u16_d16: {
2115 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2118 case aco_opcode::v_mbcnt_lo_u32_b32: {
2119 if (instr->operands[0].constantEquals(-1) && instr->operands[1].constantEquals(0)) {
2120 if (ctx.program->wave_size == 32)
2121 ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2123 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2127 case aco_opcode::v_mbcnt_hi_u32_b32:
2128 case aco_opcode::v_mbcnt_hi_u32_b32_e64: {
2129 if (instr->operands[0].constantEquals(-1) && instr->operands[1].isTemp() &&
2130 ctx.info[instr->operands[1].tempId()].is_usedef()) {
2131 Instruction* usedef_instr = ctx.info[instr->operands[1].tempId()].instr;
2132 if (usedef_instr->opcode == aco_opcode::v_mbcnt_lo_u32_b32 &&
2133 usedef_instr->operands[0].constantEquals(-1) &&
2134 usedef_instr->operands[1].constantEquals(0))
2135 ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2139 case aco_opcode::v_cvt_f16_f32: {
2140 if (instr->operands[0].isTemp())
2141 ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get());
2144 case aco_opcode::v_cvt_f32_f16: {
2145 if (instr->operands[0].isTemp())
2146 ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get());
2152 /* Don't remove label_extract if we can't apply the extract to
2153 * neg/abs instructions because we'll likely combine it into another valu. */
2154 if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
2155 check_sdwa_extract(ctx, instr);
/* Returns the id of the temp that `tmp` is a plain copy of (following the
 * label_temp propagation recorded in ctx.info). NOTE(review): the fallback
 * return for non-copy temps is elided from this view — presumably it returns
 * tmp.id() itself; confirm against the full source. */
2159 original_temp_id(opt_ctx& ctx, Temp tmp)
2161 if (ctx.info[tmp.id()].is_temp())
2162 return ctx.info[tmp.id()].temp.id();
/* If `instr` is dead (its results are unused per ctx.uses), decrement the
 * use count of each of its temp operands so that their producers can in turn
 * become dead. No-op for live instructions. */
2168 decrease_op_uses_if_dead(opt_ctx& ctx, Instruction* instr)
2170 if (is_dead(ctx.uses, instr)) {
2171 for (const Operand& op : instr->operands) {
/* NOTE(review): a guard for op.isTemp() is elided from this view — only temp
 * operands have an entry in ctx.uses. */
2173 ctx.uses[op.tempId()]--;
/* Removes one use of `instr`'s first definition, then, if that made the
 * instruction dead, releases its operands' uses as well
 * (via decrease_op_uses_if_dead). */
2179 decrease_uses(opt_ctx& ctx, Instruction* instr)
2181 ctx.uses[instr->definitions[0].tempId()]--;
2182 decrease_op_uses_if_dead(ctx, instr);
/* Returns a copy of `op` while bumping its use count, so the caller can
 * attach the operand to a newly created instruction without under-counting
 * uses. NOTE(review): the isTemp() guard and the return statement are elided
 * from this view. */
2186 copy_operand(opt_ctx& ctx, Operand op)
2189 ctx.uses[op.tempId()]++;
/* Follows `op` back to its producing instruction, if the optimizer recorded
 * one (any of the instr_usedef_labels). Returns NULL (elided from this view)
 * when the operand is not such a temp, when it has other users (unless
 * ignore_uses), when a second definition (e.g. SCC/carry) is still live, or
 * when an operand is fixed to exec — in those cases the producer cannot be
 * safely rewritten or removed. */
2194 follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
2196 if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
2198 if (!ignore_uses && ctx.uses[op.tempId()] > 1)
2201 Instruction* instr = ctx.info[op.tempId()].instr;
2203 if (instr->definitions.size() == 2) {
2204 assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
/* the second definition (SCC/carry) still being used blocks the combine */
2205 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2209 for (Operand& operand : instr->operands) {
2210 if (fixed_to_exec(operand))
2217 /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
2218 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
2220 combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* only applies to lane-mask-sized boolean results, and only when the SCC
 * definition of the s_or/s_and is unused */
2222 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2224 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2227 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2229 bitarray8 opsel = 0;
2230 Instruction* op_instr[2];
2233 unsigned bitsize = 0;
/* verify both operands are self-comparisons (cmp(a, a)) of the expected
 * kind, with matching bit sizes and matching per-source modifiers */
2234 for (unsigned i = 0; i < 2; i++) {
2235 op_instr[i] = follow_operand(ctx, instr->operands[i], true);
2239 aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2240 unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
2242 if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
2244 if (bitsize && op_bitsize != bitsize)
2246 if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
2249 if (op_instr[i]->isSDWA() || op_instr[i]->isDPP())
/* both sources of the self-compare must carry identical neg/abs/opsel, or
 * cmp(a, a) would not be a pure NaN test */
2252 VALU_instruction& valu = op_instr[i]->valu();
2253 if (valu.neg[0] != valu.neg[1] || valu.abs[0] != valu.abs[1] ||
2254 valu.opsel[0] != valu.opsel[1])
2256 opsel[i] = valu.opsel[0];
2258 Temp op0 = op_instr[i]->operands[0].getTemp();
2259 Temp op1 = op_instr[i]->operands[1].getTemp();
2260 if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
2264 bitsize = op_bitsize;
/* VOPC prefers the SGPR (if any) in src0; also respect the per-gfx-level
 * limit on the number of SGPR operands */
2267 if (op[1].type() == RegType::sgpr) {
2268 std::swap(op[0], op[1]);
2269 opsel[0].swap(opsel[1]);
2271 unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
2272 if (num_sgprs > (ctx.program->gfx_level >= GFX10 ? 2 : 1))
/* build the fused unordered/ordered compare matching the operand bit size */
2275 aco_opcode new_op = aco_opcode::num_opcodes;
2277 case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
2278 case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
2279 case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
2281 bool needs_vop3 = num_sgprs > 1 || (opsel[0] && op[0].type() != RegType::vgpr);
2282 VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2283 new_op, needs_vop3 ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2285 new_instr->opsel = opsel;
2286 new_instr->operands[0] = copy_operand(ctx, Operand(op[0]));
2287 new_instr->operands[1] = copy_operand(ctx, Operand(op[1]));
2288 new_instr->definitions[0] = instr->definitions[0];
2289 new_instr->pass_flags = instr->pass_flags;
/* release the now-unused source compares and re-label the result */
2291 decrease_uses(ctx, op_instr[0]);
2292 decrease_uses(ctx, op_instr[1]);
2294 ctx.info[instr->definitions[0].tempId()].label = 0;
2295 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2297 instr.reset(new_instr);
2302 /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
2303 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
2305 combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* requires a lane-mask result and an unused SCC definition */
2307 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2309 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2312 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2313 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
2315 Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2316 Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2317 if (!nan_test || !cmp)
2319 if (nan_test->isSDWA() || cmp->isSDWA())
/* the NaN test may be in either operand slot; normalize so nan_test holds it */
2322 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2323 std::swap(nan_test, cmp);
2324 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2327 if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
2330 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2332 if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
/* both instructions must compare the same original values (in either order),
 * with matching opsel on the matched sources */
2335 unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
2336 unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
2337 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2338 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2339 VALU_instruction& cmp_valu = cmp->valu();
2340 VALU_instruction& nan_valu = nan_test->valu();
2341 if ((prop_cmp0 != prop_nan0 || cmp_valu.opsel[0] != nan_valu.opsel[0]) &&
2342 (prop_cmp0 != prop_nan1 || cmp_valu.opsel[0] != nan_valu.opsel[1]))
2344 if ((prop_cmp1 != prop_nan0 || cmp_valu.opsel[1] != nan_valu.opsel[0]) &&
2345 (prop_cmp1 != prop_nan1 || cmp_valu.opsel[1] != nan_valu.opsel[1]))
2347 if (prop_cmp0 == prop_cmp1 && cmp_valu.opsel[0] == cmp_valu.opsel[1])
/* replace the pair with the (un)ordered variant of cmp, keeping all of cmp's
 * VALU modifiers */
2350 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2351 VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2352 new_op, cmp->isVOP3() ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2353 new_instr->neg = cmp_valu.neg;
2354 new_instr->abs = cmp_valu.abs;
2355 new_instr->clamp = cmp_valu.clamp;
2356 new_instr->omod = cmp_valu.omod;
2357 new_instr->opsel = cmp_valu.opsel;
2358 new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2359 new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2360 new_instr->definitions[0] = instr->definitions[0];
2361 new_instr->pass_flags = instr->pass_flags;
2363 decrease_uses(ctx, nan_test);
2364 decrease_uses(ctx, cmp);
2366 ctx.info[instr->definitions[0].tempId()].label = 0;
2367 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2369 instr.reset(new_instr);
2374 /* Optimize v_cmp of constant with subgroup invocation to a constant mask.
2375 * Ideally, we can trade v_cmp for a constant (or literal).
2376 * In a less ideal case, we trade v_cmp for a SALU instruction, which is still a win.
2379 optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2381 /* This optimization only applies to VOPC with 2 operands. */
2382 if (instr->operands.size() != 2)
2385 /* Find the constant operand or return early if there isn't one. */
2386 const int const_op_idx = instr->operands[0].isConstant() ? 0
2387 : instr->operands[1].isConstant() ? 1
2389 if (const_op_idx == -1)
2392 /* Find the operand that has the subgroup invocation. */
2393 const int mbcnt_op_idx = 1 - const_op_idx;
2394 const Operand mbcnt_op = instr->operands[mbcnt_op_idx];
2395 if (!mbcnt_op.isTemp() || !ctx.info[mbcnt_op.tempId()].is_subgroup_invocation())
2398 /* Adjust opcode so we don't have to care about const_op_idx below. */
2399 const aco_opcode op = const_op_idx == 0 ? get_swapped(instr->opcode) : instr->opcode;
2400 const unsigned wave_size = ctx.program->wave_size;
2401 const unsigned val = instr->operands[const_op_idx].constantValue();
2403 /* Find suitable constant bitmask corresponding to the value. */
2404 unsigned first_bit = 0, num_bits = 0;
2406 case aco_opcode::v_cmp_eq_u32:
2407 case aco_opcode::v_cmp_eq_i32:
/* invocation == val: exactly one lane (or none when val is out of range) */
2409 num_bits = val >= wave_size ? 0 : 1;
2411 case aco_opcode::v_cmp_le_u32:
2412 case aco_opcode::v_cmp_le_i32:
2414 num_bits = val >= wave_size ? wave_size : (val + 1);
2416 case aco_opcode::v_cmp_lt_u32:
2417 case aco_opcode::v_cmp_lt_i32:
2419 num_bits = val >= wave_size ? wave_size : val;
2421 case aco_opcode::v_cmp_ge_u32:
2422 case aco_opcode::v_cmp_ge_i32:
2424 num_bits = val >= wave_size ? 0 : (wave_size - val);
2426 case aco_opcode::v_cmp_gt_u32:
2427 case aco_opcode::v_cmp_gt_i32:
2428 first_bit = val + 1;
2429 num_bits = val >= wave_size ? 0 : (wave_size - val - 1);
2431 default: return false;
2434 Instruction* cpy = NULL;
2435 const uint64_t mask = BITFIELD64_RANGE(first_bit, num_bits);
2436 if (wave_size == 64 && mask > 0x7fffffff && mask != -1ull) {
2437 /* Mask can't be represented as a 64-bit constant or literal, use s_bfm_b64. */
2438 cpy = create_instruction<SOP2_instruction>(aco_opcode::s_bfm_b64, Format::SOP2, 2, 1);
2439 cpy->operands[0] = Operand::c32(num_bits);
2440 cpy->operands[1] = Operand::c32(first_bit);
2442 /* Copy mask as a literal constant. */
2444 create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
2445 cpy->operands[0] = wave_size == 32 ? Operand::c32((uint32_t)mask) : Operand::c64(mask);
/* redirect the compare's definition to the mask producer and drop the mbcnt */
2448 cpy->definitions[0] = instr->definitions[0];
2449 ctx.info[instr->definitions[0].tempId()].label = 0;
2450 decrease_uses(ctx, ctx.info[mbcnt_op.tempId()].instr);
/* Returns whether `op` is (or provably resolves to) a constant of the given
 * bit size; on success stores the constant's 64-bit value in *value. Looks
 * through temp copies via original_temp_id and the per-temp constant label. */
2457 is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
2459 if (op.isConstant()) {
2460 *value = op.constantValue64();
2462 } else if (op.isTemp()) {
2463 unsigned id = original_temp_id(ctx, op.getTemp());
2464 if (!ctx.info[id].is_constant_or_literal(bit_size))
2466 *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
/* Returns whether the raw bit pattern `value` encodes a NaN for the given
 * IEEE-754 float width (16/32/64): exponent all-ones and mantissa non-zero. */
2473 is_constant_nan(uint64_t value, unsigned bit_size)
2476 return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff);
2477 else if (bit_size == 32)
2478 return ((value >> 23) & 0xff) == 0xff && (value & 0x7fffff);
2480 return ((value >> 52) & 0x7ff) == 0x7ff && (value & 0xfffffffffffff);
2483 /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
2484 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
2486 combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* requires a lane-mask result and an unused SCC definition */
2488 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2490 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2493 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2495 Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2496 Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2498 if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA() || nan_test->isDPP() ||
/* the self-compare NaN test may be in either slot; normalize */
2502 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2503 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2504 std::swap(nan_test, cmp);
2505 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2508 unsigned bit_size = get_cmp_bitsize(cmp->opcode);
2509 if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
2512 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2514 if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
/* the NaN test must compare a value against itself, with identical
 * per-source modifiers */
2517 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2518 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2519 if (prop_nan0 != prop_nan1)
2522 VALU_instruction& vop3 = nan_test->valu();
2523 if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel[0] != vop3.opsel[1])
/* find which cmp operand matches the NaN-tested value; the other must be the
 * constant */
2526 int constant_operand = -1;
2527 for (unsigned i = 0; i < 2; i++) {
2528 if (cmp->operands[i].isTemp() &&
2529 original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0 &&
2530 cmp->valu().opsel[i] == nan_test->valu().opsel[0]) {
2531 constant_operand = !i;
2535 if (constant_operand == -1)
/* the fold is only valid when the constant itself is not NaN (opsel selects
 * the high half for 16-bit values, hence the shift) */
2538 uint64_t constant_value;
2539 if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
2541 if (is_constant_nan(constant_value >> (cmp->valu().opsel[constant_operand] * 16), bit_size))
2544 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2545 Instruction* new_instr = create_instruction<VALU_instruction>(new_op, cmp->format, 2, 1);
2546 new_instr->valu().neg = cmp->valu().neg;
2547 new_instr->valu().abs = cmp->valu().abs;
2548 new_instr->valu().clamp = cmp->valu().clamp;
2549 new_instr->valu().omod = cmp->valu().omod;
2550 new_instr->valu().opsel = cmp->valu().opsel;
2551 new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2552 new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2553 new_instr->definitions[0] = instr->definitions[0];
2554 new_instr->pass_flags = instr->pass_flags;
2556 decrease_uses(ctx, nan_test);
2557 decrease_uses(ctx, cmp);
2559 ctx.info[instr->definitions[0].tempId()].label = 0;
2560 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2562 instr.reset(new_instr);
2567 /* s_not(cmp(a, b)) -> get_inverse(cmp)(a, b) */
2569 combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* SCC result of the s_not must be unused, and the compare must have exactly
 * this one consumer */
2571 if (ctx.uses[instr->definitions[1].tempId()])
2573 if (!instr->operands[0].isTemp() || ctx.uses[instr->operands[0].tempId()] != 1)
2576 Instruction* cmp = follow_operand(ctx, instr->operands[0]);
2580 aco_opcode new_opcode = get_inverse(cmp->opcode);
2581 if (new_opcode == aco_opcode::num_opcodes)
2584 /* Invert compare instruction and assign this instruction's definition */
2585 cmp->opcode = new_opcode;
/* the compare's label info now describes the s_not's definition too */
2586 ctx.info[instr->definitions[0].tempId()] = ctx.info[cmp->definitions[0].tempId()];
2587 std::swap(instr->definitions[0], cmp->definitions[0]);
2589 ctx.uses[instr->operands[0].tempId()]--;
2593 /* op1(op2(1, 2), 0) if swap = false
2594 * op1(0, op2(1, 2)) if swap = true
 *
 * Matches a two-instruction chain op1(op2(...)) so it can be fused into a
 * single 3-source VOP3. On success fills `operands`/neg/abs/opsel in the
 * order given by `shuffle_str`, reports op1's clamp/omod, and reports (or
 * rejects, when the out-pointer is NULL) any modifier sitting between the
 * two instructions. `precise` is set if either result is marked precise. */
2596 match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
2597 const char* shuffle_str, Operand operands[3], bitarray8& neg, bitarray8& abs,
2598 bitarray8& opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
2599 bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
2602 if (op1_instr->opcode != op1)
2605 Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
2606 if (!op2_instr || op2_instr->opcode != op2)
2609 VALU_instruction* op1_valu = op1_instr->isVALU() ? &op1_instr->valu() : NULL;
2610 VALU_instruction* op2_valu = op2_instr->isVALU() ? &op2_instr->valu() : NULL;
2612 if (op1_instr->isSDWA() || op2_instr->isSDWA())
2614 if (op1_instr->isDPP() || op2_instr->isDPP())
2617 /* don't support inbetween clamp/omod */
2618 if (op2_valu && (op2_valu->clamp || op2_valu->omod))
2621 /* get operands and modifiers and check inbetween modifiers */
2622 *op1_clamp = op1_valu ? (bool)op1_valu->clamp : false;
2623 *op1_omod = op1_valu ? (unsigned)op1_valu->omod : 0u;
/* an in-between modifier is only acceptable if the caller asked to receive
 * it; otherwise it blocks the match */
2626 *inbetween_neg = op1_valu ? op1_valu->neg[swap] : false;
2627 else if (op1_valu && op1_valu->neg[swap])
2631 *inbetween_abs = op1_valu ? op1_valu->abs[swap] : false;
2632 else if (op1_valu && op1_valu->abs[swap])
2635 if (inbetween_opsel)
2636 *inbetween_opsel = op1_valu ? op1_valu->opsel[swap] : false;
2637 else if (op1_valu && op1_valu->opsel[swap])
2640 *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
/* shuffle_str maps logical source index -> output slot, e.g. "120" */
2643 shuffle[shuffle_str[0] - '0'] = 0;
2644 shuffle[shuffle_str[1] - '0'] = 1;
2645 shuffle[shuffle_str[2] - '0'] = 2;
2647 operands[shuffle[0]] = op1_instr->operands[!swap];
2648 neg[shuffle[0]] = op1_valu ? op1_valu->neg[!swap] : false;
2649 abs[shuffle[0]] = op1_valu ? op1_valu->abs[!swap] : false;
2650 opsel[shuffle[0]] = op1_valu ? op1_valu->opsel[!swap] : false;
2652 for (unsigned i = 0; i < 2; i++) {
2653 operands[shuffle[i + 1]] = op2_instr->operands[i];
2654 neg[shuffle[i + 1]] = op2_valu ? op2_valu->neg[i] : false;
2655 abs[shuffle[i + 1]] = op2_valu ? op2_valu->abs[i] : false;
2656 opsel[shuffle[i + 1]] = op2_valu ? op2_valu->opsel[i] : false;
2659 /* check operands */
2660 if (!check_vop3_operands(ctx, 3, operands))
/* Replaces `instr` with a freshly built 3-source VOP3 of the given opcode,
 * carrying the supplied operands and neg/abs/opsel/clamp/omod modifiers.
 * The original first definition and pass_flags are preserved; the result's
 * optimizer label is cleared. */
2667 create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
2668 Operand operands[3], uint8_t neg, uint8_t abs, uint8_t opsel, bool clamp,
2671 VALU_instruction* new_instr = create_instruction<VALU_instruction>(opcode, Format::VOP3, 3, 1);
2672 new_instr->neg = neg;
2673 new_instr->abs = abs;
2674 new_instr->clamp = clamp;
2675 new_instr->omod = omod;
2676 new_instr->opsel = opsel;
2677 new_instr->operands[0] = operands[0];
2678 new_instr->operands[1] = operands[1];
2679 new_instr->operands[2] = operands[2];
2680 new_instr->definitions[0] = instr->definitions[0];
2681 new_instr->pass_flags = instr->pass_flags;
2682 ctx.info[instr->definitions[0].tempId()].label = 0;
2684 instr.reset(new_instr);
/* Tries to fuse instr(op2(...)) into the 3-source opcode `new_op`, checking
 * both operand positions of `instr` as allowed by the `ops` bitmask
 * (bit 0 = swap=0, bit 1 = swap=1). `shuffle` gives the source ordering for
 * match_op3_for_vop3. Returns via the elided tail whether a fuse happened. */
2688 combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
2689 const char* shuffle, uint8_t ops)
2691 for (unsigned swap = 0; swap < 2; swap++) {
2692 if (!((1 << swap) & ops))
2695 Operand operands[3];
2696 bool clamp, precise;
2697 bitarray8 neg = 0, abs = 0, opsel = 0;
/* in-between neg/abs/opsel are disallowed here (NULL out-pointers) */
2699 if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
2700 abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
2701 ctx.uses[instr->operands[swap].tempId()]--;
2702 create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
2709 /* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
2711 combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2713 bool is_or = instr->opcode == aco_opcode::v_or_b32;
2714 aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
/* first try the generic three-op fusions (and+or, lshl+or/add) */
2716 if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
2719 if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
2722 if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
2724 if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
2727 if (instr->isSDWA() || instr->isDPP())
2730 /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2731 * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2732 * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
2733 * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
2735 for (unsigned i = 0; i < 2; i++) {
2736 Instruction* extins = follow_operand(ctx, instr->operands[i]);
2741 Operand operands[3];
/* an insert into the top bits ((index+1)*size == 32) is just a shift left by
 * index*size */
2743 if (extins->opcode == aco_opcode::p_insert &&
2744 (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
2747 Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
/* a zero-extending extract/insert of the low 8/16 bits is just a mask */
2749 (extins->opcode == aco_opcode::p_insert ||
2750 (extins->opcode == aco_opcode::p_extract &&
2751 extins->operands[3].constantEquals(0))) &&
2752 extins->operands[1].constantEquals(0)) {
2753 op = aco_opcode::v_and_or_b32;
2754 operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
2759 operands[0] = extins->operands[0];
2760 operands[2] = instr->operands[!i];
2762 if (!check_vop3_operands(ctx, 3, operands))
2765 uint8_t neg = 0, abs = 0, opsel = 0, omod = 0;
2767 if (instr->isVOP3())
2768 clamp = instr->valu().clamp;
2770 ctx.uses[instr->operands[i].tempId()]--;
2771 create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
2778 /* v_xor(a, s_not(b)) -> v_xnor(a, b)
2779 * v_xor(a, v_not(b)) -> v_xnor(a, b)
2782 combine_xor_not(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2784 if (instr->usesModifiers())
2787 for (unsigned i = 0; i < 2; i++) {
2788 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true)
2790 (op_instr->opcode != aco_opcode::v_not_b32 &&
2791 op_instr->opcode != aco_opcode::s_not_b32) ||
2792 op_instr->usesModifiers() || op_instr->operands[0].isLiteral())
2795 instr->opcode = aco_opcode::v_xnor_b32;
2796 instr->operands[i] = copy_operand(ctx, op_instr->operands[0]);
2797 decrease_uses(ctx, op_instr);
/* VOP2 requires a VGPR in src1; fall back to VOP3 if neither source is one */
2798 if (instr->operands[0].isOfType(RegType::vgpr))
2799 std::swap(instr->operands[0], instr->operands[1]);
2800 if (!instr->operands[1].isOfType(RegType::vgpr))
2801 instr->format = asVOP3(instr->format);
2809 /* v_not(v_xor(a, b)) -> v_xnor(a, b) */
2811 combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2813 if (instr->usesModifiers())
2816 Instruction* op_instr = follow_operand(ctx, instr->operands[0]);
2817 if (!op_instr || op_instr->opcode != aco_opcode::v_xor_b32 || op_instr->isSDWA())
/* fold by retargeting the xor: take over the not's definition and flip the
 * xor's opcode to xnor */
2820 ctx.uses[instr->operands[0].tempId()]--;
2821 std::swap(instr->definitions[0], op_instr->definitions[0]);
2822 op_instr->opcode = aco_opcode::v_xnor_b32;
/* Fuses chained min/max pairs into 3-source min3/max3 (or, on GFX11+, the
 * mixed minmax/maxmin opcodes), handling an optional in-between negate.
 * `opposite` is the opposing min/max opcode, `op3src` the 3-source form,
 * `minmax` the GFX11 mixed form (num_opcodes if unavailable). */
2828 combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
2831 /* TODO: this can handle SDWA min/max instructions by using opsel */
2833 /* min(min(a, b), c) -> min3(a, b, c)
2834 * max(max(a, b), c) -> max3(a, b, c)
2835 * gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
2836 * gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
2838 for (unsigned swap = 0; swap < 2; swap++) {
2839 Operand operands[3];
2840 bool clamp, precise;
2841 bitarray8 opsel = 0, neg = 0, abs = 0;
2844 if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
2845 neg, abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
2848 (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
2849 ctx.uses[instr->operands[swap].tempId()]--;
/* an in-between negate flips which 3-source opcode is correct */
2850 if (inbetween_neg) {
2853 create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
2855 create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2861 /* min(-max(a, b), c) -> min3(-a, -b, c)
2862 * max(-min(a, b), c) -> max3(-a, -b, c)
2863 * gfx11: min(max(a, b), c) -> maxmin(a, b, c)
2864 * gfx11: max(min(a, b), c) -> minmax(a, b, c)
2866 for (unsigned swap = 0; swap < 2; swap++) {
2867 Operand operands[3];
2868 bool clamp, precise;
2869 bitarray8 opsel = 0, neg = 0, abs = 0;
2872 if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
2873 abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
2875 (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
2876 ctx.uses[instr->operands[swap].tempId()]--;
2877 if (inbetween_neg) {
2880 create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2882 create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
2890 /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
2891 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
2892 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
2893 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
2894 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
2895 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
2897 combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* the s_not's SCC definition must be unused for the fold to be legal */
2900 if (!instr->operands[0].isTemp())
2902 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2905 Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
2908 switch (op2_instr->opcode) {
2909 case aco_opcode::s_and_b32:
2910 case aco_opcode::s_or_b32:
2911 case aco_opcode::s_xor_b32:
2912 case aco_opcode::s_and_b64:
2913 case aco_opcode::s_or_b64:
2914 case aco_opcode::s_xor_b64: break;
2915 default: return false;
2918 /* create instruction */
/* move both definitions (result + SCC) onto the bitwise op, then flip its
 * opcode to the negated form */
2919 std::swap(instr->definitions[0], op2_instr->definitions[0]);
2920 std::swap(instr->definitions[1], op2_instr->definitions[1]);
2921 ctx.uses[instr->operands[0].tempId()]--;
2922 ctx.info[op2_instr->definitions[0].tempId()].label = 0;
2924 switch (op2_instr->opcode) {
2925 case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
2926 case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
2927 case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
2928 case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
2929 case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
2930 case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
2937 /* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
2938 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
2939 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
2940 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
2942 combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* don't touch results labeled uniform_bool — other folds rely on that form */
2944 if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
2947 for (unsigned i = 0; i < 2; i++) {
2948 Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
2949 if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
2950 op2_instr->opcode != aco_opcode::s_not_b64))
/* the s_not's SCC must be unused */
2952 if (ctx.uses[op2_instr->definitions[1].tempId()])
/* at most one distinct literal is allowed per instruction */
2955 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2956 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2959 ctx.uses[instr->operands[i].tempId()]--;
2960 instr->operands[0] = instr->operands[!i];
2961 instr->operands[1] = op2_instr->operands[0];
2962 ctx.info[instr->definitions[0].tempId()].label = 0;
2964 switch (instr->opcode) {
2965 case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
2966 case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
2967 case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
2968 case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
2977 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
2979 combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* s_add_i32's SCC has different semantics from s_lshlN_add_u32's, so bail
 * when it is used */
2981 if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
2984 for (unsigned i = 0; i < 2; i++) {
2985 Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
2986 if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
2987 ctx.uses[op2_instr->definitions[1].tempId()])
2989 if (!op2_instr->operands[1].isConstant())
/* only shifts of 1..4 have a fused s_lshlN_add_u32 encoding */
2992 uint32_t shift = op2_instr->operands[1].constantValue();
2993 if (shift < 1 || shift > 4)
/* at most one distinct literal is allowed per instruction */
2996 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2997 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
3000 instr->operands[1] = instr->operands[!i];
3001 instr->operands[0] = copy_operand(ctx, op2_instr->operands[0]);
3002 decrease_uses(ctx, op2_instr);
3003 ctx.info[instr->definitions[0].tempId()].label = 0;
3005 instr->opcode = std::array<aco_opcode, 4>{
3006 aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
3007 aco_opcode::s_lshl4_add_u32}[shift - 1];
3014 /* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b)
3015 * s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b)
3018 combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* Operand must be produced by a known add/sub instruction. */
3020 if (!instr->operands[0].isTemp() || !ctx.info[instr->operands[0].tempId()].is_add_sub())
3023 Instruction* op_instr = follow_operand(ctx, instr->operands[0], false);
/* For an add with a constant operand, canonicalize it into a subtract of the
 * negated constant so the absdiff form applies. */
3027 if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) {
3028 for (unsigned i = 0; i < 2; i++) {
3030 if (op_instr->operands[!i].isLiteral() ||
3031 !is_operand_constant(ctx, op_instr->operands[i], 32, &constant))
3034 if (op_instr->operands[i].isTemp())
3035 ctx.uses[op_instr->operands[i].tempId()]--;
3036 op_instr->operands[0] = op_instr->operands[!i];
3037 op_instr->operands[1] = Operand::c32(-int32_t(constant));
/* Turn the add/sub into s_absdiff_i32 and take over its definitions;
 * the original s_abs becomes dead. */
3044 op_instr->opcode = aco_opcode::s_absdiff_i32;
3045 std::swap(instr->definitions[0], op_instr->definitions[0]);
3046 std::swap(instr->definitions[1], op_instr->definitions[1]);
3047 ctx.uses[instr->operands[0].tempId()]--;
/* Fold a b2i (bool-to-int) operand of an add/sub into the carry-in of the
 * carry-variant opcode `new_op`. `ops` is a bitmask of operand indices that
 * may legally be the b2i source. */
3053 combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
3055 if (instr->usesModifiers())
3058 for (unsigned i = 0; i < 2; i++) {
3059 if (!((1 << i) & ops))
/* The b2i result must have exactly this one use so it can be absorbed. */
3061 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
3062 ctx.uses[instr->operands[i].tempId()] == 1) {
3064 aco_ptr<Instruction> new_instr;
/* VOP2 encoding is fine when the other operand is a VGPR; otherwise VOP3
 * is needed (GFX10+ or a non-literal inline constant). */
3065 if (instr->operands[!i].isTemp() &&
3066 instr->operands[!i].getTemp().type() == RegType::vgpr) {
3067 new_instr.reset(create_instruction<VALU_instruction>(new_op, Format::VOP2, 3, 2));
3068 } else if (ctx.program->gfx_level >= GFX10 ||
3069 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3071 create_instruction<VALU_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
3075 ctx.uses[instr->operands[i].tempId()]--;
3076 new_instr->definitions[0] = instr->definitions[0];
3077 if (instr->definitions.size() == 2) {
3078 new_instr->definitions[1] = instr->definitions[1];
/* The carry-variant needs a carry-out def; allocate a fresh lane-mask temp. */
3080 new_instr->definitions[1] =
3081 Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
3082 /* Make sure the uses vector is large enough and the number of
3083 * uses properly initialized to 0.
3085 ctx.uses.push_back(0);
/* operands: 0 (constant zero), the other addend, and the bool as carry-in. */
3087 new_instr->operands[0] = Operand::zero();
3088 new_instr->operands[1] = instr->operands[!i];
3089 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
3090 new_instr->pass_flags = instr->pass_flags;
3091 instr = std::move(new_instr);
3092 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
/* Fold an add of a v_bcnt_u32_b32 result into the bcnt itself:
 * v_add(a, v_bcnt(b, 0)) -> v_bcnt(b, a), using bcnt's accumulator operand. */
3101 combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3103 if (instr->usesModifiers())
3106 for (unsigned i = 0; i < 2; i++) {
3107 Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
/* Only match a plain bcnt with a zero accumulator and a VGPR source. */
3108 if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
3109 !op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
3110 op_instr->operands[0].getTemp().type() == RegType::vgpr &&
3111 op_instr->operands[1].constantEquals(0)) {
3112 aco_ptr<Instruction> new_instr{
3113 create_instruction<VALU_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
3114 ctx.uses[instr->operands[i].tempId()]--;
/* New bcnt: same popcount source, the add's other operand as accumulator. */
3115 new_instr->operands[0] = op_instr->operands[0];
3116 new_instr->operands[1] = instr->operands[!i];
3117 new_instr->definitions[0] = instr->definitions[0];
3118 new_instr->pass_flags = instr->pass_flags;
3119 instr = std::move(new_instr);
3120 ctx.info[instr->definitions[0].tempId()].label = 0;
/* For a v_min/v_max opcode, look up the whole related opcode family
 * (min, max, min3, max3, med3, fused minmax) and whether some of the
 * 3-operand forms are GFX9-only. Returns false for non-min/max opcodes. */
3130 get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
3131 aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only)
/* Table is generated by macros: one case-pair per type suffix. */
3134 #define MINMAX(type, gfx9) \
3135 case aco_opcode::v_min_##type: \
3136 case aco_opcode::v_max_##type: \
3137 *min = aco_opcode::v_min_##type; \
3138 *max = aco_opcode::v_max_##type; \
3139 *med3 = aco_opcode::v_med3_##type; \
3140 *min3 = aco_opcode::v_min3_##type; \
3141 *max3 = aco_opcode::v_max3_##type; \
3142 *minmax = op == *min ? aco_opcode::v_maxmin_##type : aco_opcode::v_minmax_##type; \
3143 *some_gfx9_only = gfx9; \
/* 16-bit integer variants have no fused minmax opcode (num_opcodes sentinel). */
3145 #define MINMAX_INT16(type, gfx9) \
3146 case aco_opcode::v_min_##type: \
3147 case aco_opcode::v_max_##type: \
3148 *min = aco_opcode::v_min_##type: \
3149 *max = aco_opcode::v_max_##type; \
3150 *med3 = aco_opcode::v_med3_##type; \
3151 *min3 = aco_opcode::v_min3_##type; \
3152 *max3 = aco_opcode::v_max3_##type; \
3153 *minmax = aco_opcode::num_opcodes; \
3154 *some_gfx9_only = gfx9; \
/* Same, for the _e64-suffixed 16-bit opcodes used on newer gfx levels. */
3156 #define MINMAX_INT16_E64(type, gfx9) \
3157 case aco_opcode::v_min_##type##_e64: \
3158 case aco_opcode::v_max_##type##_e64: \
3159 *min = aco_opcode::v_min_##type##_e64; \
3160 *max = aco_opcode::v_max_##type##_e64; \
3161 *med3 = aco_opcode::v_med3_##type; \
3162 *min3 = aco_opcode::v_min3_##type; \
3163 *max3 = aco_opcode::v_max3_##type; \
3164 *minmax = aco_opcode::num_opcodes; \
3165 *some_gfx9_only = gfx9; \
3171 MINMAX_INT16(u16, true)
3172 MINMAX_INT16(i16, true)
3173 MINMAX_INT16_E64(u16, true)
3174 MINMAX_INT16_E64(i16, true)
3175 #undef MINMAX_INT16_E64
3178 default: return false;
3183 * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
3184 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
3187 combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
3190 /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
3191 * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
3192 * minVal > maxVal, which means we can always select it to a v_med3_f32 */
/* `other_op` is the opposite opcode we expect to find feeding this one. */
3193 aco_opcode other_op;
3194 if (instr->opcode == min)
3196 else if (instr->opcode == max)
3201 for (unsigned swap = 0; swap < 2; swap++) {
3202 Operand operands[3];
3203 bool clamp, precise;
3204 bitarray8 opsel = 0, neg = 0, abs = 0;
3206 if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
3207 abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
3208 /* max(min(src, upper), lower) returns upper if src is NaN, but
3209 * med3(src, lower, upper) returns lower.
3211 if (precise && instr->opcode != min &&
3212 (min == aco_opcode::v_min_f16 || min == aco_opcode::v_min_f32))
/* Find the two constant operands (direct constants or known-constant temps),
 * honoring opsel (hi-16 selection) when reading their values. */
3215 int const0_idx = -1, const1_idx = -1;
3216 uint32_t const0 = 0, const1 = 0;
3217 for (int i = 0; i < 3; i++) {
3219 bool hi16 = opsel & (1 << i);
3220 if (operands[i].isConstant()) {
3221 val = hi16 ? operands[i].constantValue16(true) : operands[i].constantValue();
3222 } else if (operands[i].isTemp() &&
3223 ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
3224 val = ctx.info[operands[i].tempId()].val >> (hi16 ? 16 : 0);
3228 if (const0_idx >= 0) {
/* Need exactly two constants (lower and upper bound) to form a med3. */
3236 if (const0_idx < 0 || const1_idx < 0)
/* Decide which constant is the lower bound, per the type's ordering. */
3239 int lower_idx = const0_idx;
3241 case aco_opcode::v_min_f32:
3242 case aco_opcode::v_min_f16: {
3243 float const0_f, const1_f;
3244 if (min == aco_opcode::v_min_f32) {
3245 memcpy(&const0_f, &const0, 4);
3246 memcpy(&const1_f, &const1, 4);
3248 const0_f = _mesa_half_to_float(const0);
3249 const1_f = _mesa_half_to_float(const1);
/* Apply abs/neg input modifiers before comparing the float bounds. */
3251 if (abs[const0_idx])
3252 const0_f = fabsf(const0_f);
3253 if (abs[const1_idx])
3254 const1_f = fabsf(const1_f);
3255 if (neg[const0_idx])
3256 const0_f = -const0_f;
3257 if (neg[const1_idx])
3258 const1_f = -const1_f;
3259 lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
3262 case aco_opcode::v_min_u32: {
3263 lower_idx = const0 < const1 ? const0_idx : const1_idx;
3266 case aco_opcode::v_min_u16:
3267 case aco_opcode::v_min_u16_e64: {
3268 lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
3271 case aco_opcode::v_min_i32: {
/* Manual sign-extension avoids implementation-defined narrowing casts. */
3273 const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
3275 const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
3276 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3279 case aco_opcode::v_min_i16:
3280 case aco_opcode::v_min_i16_e64: {
3281 int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
3282 int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
3283 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3288 int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
/* The matched operand order must place the bounds so the med3 result is a
 * true clamp: outer min needs the upper bound outside, outer max the lower. */
3290 if (instr->opcode == min) {
3291 if (upper_idx != 0 || lower_idx == 0)
3294 if (upper_idx == 0 || lower_idx != 0)
3298 ctx.uses[instr->operands[swap].tempId()]--;
3299 create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
/* Propagate SGPR copies into VALU operands, respecting the per-gfx-level
 * limit on how many distinct SGPRs one VALU instruction may read. */
3309 apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3311 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
3312 instr->opcode == aco_opcode::v_lshrrev_b64 ||
3313 instr->opcode == aco_opcode::v_ashrrev_i64;
3315 /* find candidates and create the set of sgprs already read */
3316 unsigned sgpr_ids[2] = {0, 0};
3317 uint32_t operand_mask = 0;
3318 bool has_literal = false;
3319 for (unsigned i = 0; i < instr->operands.size(); i++) {
3320 if (instr->operands[i].isLiteral())
3322 if (!instr->operands[i].isTemp())
3324 if (instr->operands[i].getTemp().type() == RegType::sgpr) {
3325 if (instr->operands[i].tempId() != sgpr_ids[0])
3326 sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId()
3328 ssa_info& info = ctx.info[instr->operands[i].tempId()];
/* Candidates: copies of an SGPR temp, or extracts whose source is an SGPR. */
3329 if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::sgpr)
3330 operand_mask |= 1u << i;
3331 if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
3332 operand_mask |= 1u << i;
/* GFX10+ VALU can read two SGPRs, except the 64-bit shifts. */
3334 unsigned max_sgprs = 1;
3335 if (ctx.program->gfx_level >= GFX10 && !is_shift64)
3340 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
3342 /* keep on applying sgprs until there is nothing left to be done */
3343 while (operand_mask) {
3344 uint32_t sgpr_idx = 0;
3345 uint32_t sgpr_info_id = 0;
3346 uint32_t mask = operand_mask;
/* Pick the candidate temp with the fewest remaining uses first, so the
 * copy is most likely to die after propagation. */
3349 unsigned i = u_bit_scan(&mask);
3350 uint16_t uses = ctx.uses[instr->operands[i].tempId()];
3351 if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
3353 sgpr_info_id = instr->operands[i].tempId();
3356 operand_mask &= ~(1u << sgpr_idx);
3358 ssa_info& info = ctx.info[sgpr_info_id];
3360 /* Applying two sgprs require making it VOP3, so don't do it unless it's
3361 * definitively beneficial.
3362 * TODO: this is too conservative because later the use count could be reduced to 1 */
3363 if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
3364 !instr->isSDWA() && instr->format != Format::VOP3P)
3367 Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
3368 bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
3369 if (new_sgpr && num_sgprs >= max_sgprs)
3373 instr->format = withoutDPP(instr->format);
/* DPP can only take an SGPR in src0. */
3375 if (sgpr_idx == 1 && instr->isDPP())
3378 if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
3379 info.is_extract()) {
3380 /* can_apply_extract() checks SGPR encoding restrictions */
3381 if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
3382 apply_extract(ctx, instr, sgpr_idx, info);
3383 else if (info.is_extract())
3385 instr->operands[sgpr_idx] = Operand(sgpr);
/* VOP2 can only take an SGPR in src0: swap operands if that is legal. */
3386 } else if (can_swap_operands(instr, &instr->opcode) && !instr->valu().opsel[sgpr_idx]) {
3387 instr->operands[sgpr_idx] = instr->operands[0];
3388 instr->operands[0] = Operand(sgpr);
3389 instr->valu().opsel[0].swap(instr->valu().opsel[sgpr_idx]);
3390 /* swap bits using a 4-entry LUT */
3391 uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
3392 operand_mask = (operand_mask & ~0x3) | swapped;
3393 } else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
3394 instr->format = asVOP3(instr->format);
3395 instr->operands[sgpr_idx] = Operand(sgpr);
/* Book-keeping: record the newly-read SGPR and fix use counts. */
3401 sgpr_ids[num_sgprs++] = sgpr.id();
3402 ctx.uses[sgpr_info_id]--;
3403 ctx.uses[sgpr.id()]++;
3405 /* TODO: handle when it's a VGPR */
/* The applied SGPR may itself be a propagatable copy/extract — retry it. */
3406 if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
3407 ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
3408 operand_mask |= 1u << sgpr_idx;
3412 /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
3414 apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3416 if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
3417 !instr_info.can_use_output_modifiers[(int)instr->opcode])
3420 bool can_vop3 = can_use_VOP3(ctx, instr);
3422 instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
3423 if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
3426 /* SDWA omod is GFX9+. */
3427 bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
3429 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
/* The single user must be labeled as a clamp, or an omod we're allowed to use. */
3431 uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
3432 if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
3434 /* if the omod/clamp instruction is dead, then the single user of this
3435 * instruction is a different instruction */
3436 if (!ctx.uses[def_info.instr->definitions[0].tempId()])
/* Sizes must match: the modifier applies to the full result. */
3439 if (def_info.instr->definitions[0].bytes() != instr->definitions[0].bytes())
3442 /* MADs/FMAs are created later, so we don't have to update the original add */
3443 assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
3445 if (!instr->isSDWA() && !instr->isVOP3P())
3446 instr->format = asVOP3(instr->format);
/* Don't stack modifiers: bail if clamp/omod is already set (except pure clamp). */
3448 if (!def_info.is_clamp() && (instr->valu().clamp || instr->valu().omod))
3451 if (def_info.is_omod2())
3452 instr->valu().omod = 1;
3453 else if (def_info.is_omod4())
3454 instr->valu().omod = 2;
3455 else if (def_info.is_omod5())
3456 instr->valu().omod = 3;
3457 else if (def_info.is_clamp())
3458 instr->valu().clamp = true;
/* Take over the user's result temp; the user instruction becomes dead. */
3460 instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3461 ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
3462 ctx.uses[def_info.instr->definitions[0].tempId()]--;
3467 /* Combine an p_insert (or p_extract, in some cases) instruction with instr.
3468 * p_insert(instr(...)) -> instr_insert().
3471 apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3473 if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
3476 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3477 if (!def_info.is_insert())
3479 /* if the insert instruction is dead, then the single user of this
3480 * instruction is a different instruction */
3481 if (!ctx.uses[def_info.instr->definitions[0].tempId()])
3484 /* MADs/FMAs are created later, so we don't have to update the original add */
3485 assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
/* Express the insert as an SDWA destination selection on this instruction. */
3487 SubdwordSel sel = parse_insert(def_info.instr);
3490 if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
3493 convert_to_SDWA(ctx.program->gfx_level, instr);
/* Only fold if dst_sel is still the full dword (nothing already narrowed it). */
3494 if (instr->sdwa().dst_sel.size() != 4)
3496 instr->sdwa().dst_sel = sel;
/* Take over the insert's result temp; the p_insert becomes dead. */
3498 instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3499 ctx.info[instr->definitions[0].tempId()].label = 0;
3500 ctx.uses[def_info.instr->definitions[0].tempId()]--;
3505 /* Remove superfluous extract after ds_read like so:
3506 * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
3509 apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
3511 /* Check if p_extract has a usedef operand and is the only user. */
3512 if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
3513 ctx.uses[extract->operands[0].tempId()] > 1)
3516 /* Check if the usedef is a DS instruction. */
3517 Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
3518 if (ds->format != Format::DS)
/* p_extract operands: (src, index, bit-size, sign-extend). */
3521 unsigned extract_idx = extract->operands[1].constantValue();
3522 unsigned bits_extracted = extract->operands[2].constantValue();
3523 unsigned sign_ext = extract->operands[3].constantValue();
3524 unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
3526 /* TODO: These are doable, but probably don't occur too often. */
3527 if (extract_idx || sign_ext || dst_bitsize != 32)
/* How many bits the DS load zero-extends from. */
3530 unsigned bits_loaded = 0;
3531 if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
3533 else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
3538 /* Shrink the DS load if the extracted bit size is smaller. */
3539 bits_loaded = MIN2(bits_loaded, bits_extracted);
3541 /* Change the DS opcode so it writes the full register. */
3542 if (bits_loaded == 8)
3543 ds->opcode = aco_opcode::ds_read_u8;
3544 else if (bits_loaded == 16)
3545 ds->opcode = aco_opcode::ds_read_u16;
3547 unreachable("Forgot to add DS opcode above.");
3549 /* The DS now produces the exact same thing as the extract, remove the extract. */
3550 std::swap(ds->definitions[0], extract->definitions[0]);
3551 ctx.uses[extract->definitions[0].tempId()] = 0;
3552 ctx.info[ds->definitions[0].tempId()].label = 0;
3556 /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
3558 combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3560 if (instr->usesModifiers())
3563 for (unsigned i = 0; i < 2; i++) {
3564 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
/* Match v_subbrev_co_u32 with both value operands zero (result is 0 or -1
 * depending on the borrow input), i.e. a mask from a condition. */
3565 if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
3566 op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
3567 !op_instr->usesModifiers()) {
3569 aco_ptr<Instruction> new_instr;
/* VOP2 needs a VGPR src1; otherwise require VOP3 encoding (GFX10+ or a
 * non-literal inline constant). */
3570 if (instr->operands[!i].isTemp() &&
3571 instr->operands[!i].getTemp().type() == RegType::vgpr) {
3573 create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
3574 } else if (ctx.program->gfx_level >= GFX10 ||
3575 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3576 new_instr.reset(create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32,
3577 asVOP3(Format::VOP2), 3, 1));
/* cndmask(0, a, cond): selects a when cond is set, else 0 — same as the and. */
3582 new_instr->operands[0] = Operand::zero();
3583 new_instr->operands[1] = instr->operands[!i];
3584 new_instr->operands[2] = copy_operand(ctx, op_instr->operands[2]);
3585 new_instr->definitions[0] = instr->definitions[0];
3586 new_instr->pass_flags = instr->pass_flags;
3587 instr = std::move(new_instr);
3588 decrease_uses(ctx, op_instr);
3589 ctx.info[instr->definitions[0].tempId()].label = 0;
3597 /* v_and(a, not(b)) -> v_bfi_b32(b, 0, a)
3598 * v_or(a, not(b)) -> v_bfi_b32(b, a, -1)
3601 combine_v_andor_not(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3603 if (instr->usesModifiers())
3606 for (unsigned i = 0; i < 2; i++) {
3607 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
/* The inverted source may come from either the VALU or SALU not. */
3608 if (op_instr && !op_instr->usesModifiers() &&
3609 (op_instr->opcode == aco_opcode::v_not_b32 ||
3610 op_instr->opcode == aco_opcode::s_not_b32)) {
/* bfi(mask, x, y) = (mask & x) | (~mask & y); defaults encode the v_and form. */
3613 op_instr->operands[0],
3615 instr->operands[!i],
3617 if (instr->opcode == aco_opcode::v_or_b32) {
3618 ops[1] = instr->operands[!i];
3619 ops[2] = Operand::c32(-1);
3621 if (!check_vop3_operands(ctx, 3, ops))
3624 Instruction* new_instr =
3625 create_instruction<VALU_instruction>(aco_opcode::v_bfi_b32, Format::VOP3, 3, 1);
/* The not's input gains a direct use in the new bfi. */
3627 if (op_instr->operands[0].isTemp())
3628 ctx.uses[op_instr->operands[0].tempId()]++;
3629 for (unsigned j = 0; j < 3; j++)
3630 new_instr->operands[j] = ops[j];
3631 new_instr->definitions[0] = instr->definitions[0];
3632 new_instr->pass_flags = instr->pass_flags;
3633 instr.reset(new_instr);
3634 decrease_uses(ctx, op_instr);
3635 ctx.info[instr->definitions[0].tempId()].label = 0;
3643 /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
3644 * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
3645 * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
3646 * v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
3649 combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
3651 if (instr->usesModifiers())
3654 /* Substractions: start at operand 1 to avoid mixup such as
3655 * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
3657 unsigned start_op_idx = is_sub ? 1 : 0;
3659 /* Don't allow 24-bit operands on subtraction because
3660 * v_mad_i32_i24 applies a sign extension.
3662 bool allow_24bit = !is_sub;
3664 for (unsigned i = start_op_idx; i < 2; i++) {
3665 Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
3669 if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
3670 op_instr->opcode != aco_opcode::v_lshlrev_b32)
/* s_lshl takes the shift amount in operand 1, v_lshlrev in operand 0. */
3673 int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
/* The shifted value must be narrow enough that the mad's 24-bit multiply
 * cannot overflow/misinterpret it. */
3675 if (op_instr->operands[shift_op_idx].isConstant() &&
3676 ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
3677 op_instr->operands[!shift_op_idx].is16bit())) {
3678 uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
3680 multiplier = -multiplier;
/* The multiplier itself must fit in the mad's 24-bit (signed for sub) operand. */
3681 if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
3685 op_instr->operands[!shift_op_idx],
3686 Operand::c32(multiplier),
3687 instr->operands[!i],
3689 if (!check_vop3_operands(ctx, 3, ops))
3692 ctx.uses[instr->operands[i].tempId()]--;
3694 aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
3695 aco_ptr<VALU_instruction> new_instr{
3696 create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1)};
3697 for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
3698 new_instr->operands[op_idx] = ops[op_idx];
3699 new_instr->definitions[0] = instr->definitions[0];
3700 new_instr->pass_flags = instr->pass_flags;
3701 instr = std::move(new_instr);
3702 ctx.info[instr->definitions[0].tempId()].label = 0;
/* Push a consumer's half-selection (opsel_lo/opsel_hi of the use) down into a
 * VOP3P instruction by re-routing its own opsel/neg bits. */
3711 propagate_swizzles(VALU_instruction* instr, bool opsel_lo, bool opsel_hi)
3713 /* propagate swizzles which apply to a result down to the instruction's operands:
3714 * result = a.xy + b.xx -> result.yx = a.yx + b.xx */
3715 uint8_t tmp_lo = instr->opsel_lo;
3716 uint8_t tmp_hi = instr->opsel_hi;
3717 uint8_t neg_lo = instr->neg_lo;
3718 uint8_t neg_hi = instr->neg_hi;
/* Consumer reads the hi half as its lo half: lo lanes take the hi selection. */
3719 if (opsel_lo == 1) {
3720 instr->opsel_lo = tmp_hi;
3721 instr->neg_lo = neg_hi;
/* Consumer reads the lo half as its hi half: hi lanes take the lo selection. */
3723 if (opsel_hi == 0) {
3724 instr->opsel_hi = tmp_lo;
3725 instr->neg_hi = neg_lo;
/* VOP3P combines: fold clamp/fneg expressed as v_pk_mul_f16 into their
 * producer/consumer, and fuse v_pk_mul + v_pk_add into v_pk_fma/v_pk_mad. */
3730 combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3732 VALU_instruction* vop3p = &instr->valu();
/* A clamped multiply by 1.0 (0x3C00 = fp16 1.0) is a pure clamp: push the
 * clamp flag onto the single producer instead. */
3735 if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
3736 vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 &&
3737 !vop3p->opsel_lo[1] && !vop3p->opsel_hi[1]) {
3739 ssa_info& info = ctx.info[instr->operands[0].tempId()];
3740 if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
3741 VALU_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->valu();
3742 candidate->clamp = true;
3743 propagate_swizzles(candidate, vop3p->opsel_lo[0], vop3p->opsel_hi[0]);
3744 instr->definitions[0].swapTemp(candidate->definitions[0]);
3745 ctx.info[candidate->definitions[0].tempId()].instr = candidate;
3746 ctx.uses[instr->definitions[0].tempId()]--;
3751 /* check for fneg modifiers */
3752 for (unsigned i = 0; i < instr->operands.size(); i++) {
3753 if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
3755 Operand& op = instr->operands[i];
3759 ssa_info& info = ctx.info[op.tempId()];
/* A v_pk_mul_f16 by 1.0 acts as a packed fneg/swizzle; absorb it into this
 * operand's neg/opsel bits. */
3760 if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
3761 info.instr->operands[1].constantEquals(0x3C00)) {
3763 VALU_instruction* fneg = &info.instr->valu();
/* NOTE(review): hi-half selection on the constant operand is not handled. */
3765 if (fneg->opsel_lo[1] || fneg->opsel_hi[1])
3769 for (unsigned j = 0; j < instr->operands.size(); j++)
3770 ops[j] = instr->operands[j];
3771 ops[i] = info.instr->operands[0];
3772 if (!check_vop3_operands(ctx, instr->operands.size(), ops))
3777 instr->operands[i] = fneg->operands[0];
3779 /* opsel_lo/hi is either 0 or 1:
3780 * if 0 - pick selection from fneg->lo
3781 * if 1 - pick selection from fneg->hi
3783 bool opsel_lo = vop3p->opsel_lo[i];
3784 bool opsel_hi = vop3p->opsel_hi[i];
3785 bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1];
3786 bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1];
3787 vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
3788 vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
3789 vop3p->opsel_lo[i] ^= opsel_lo ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
3790 vop3p->opsel_hi[i] ^= opsel_hi ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
/* If the fneg still has other users, its source gains one more use here. */
3792 if (--ctx.uses[fneg->definitions[0].tempId()])
3793 ctx.uses[fneg->operands[0].tempId()]++;
/* Fuse mul+add into packed fma/mad. */
3797 if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
3798 bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
/* A precise add must not be contracted into an fma. */
3799 if (fadd && instr->definitions[0].isPrecise())
3802 Instruction* mul_instr = nullptr;
3803 unsigned add_op_idx = 0;
3804 bitarray8 mul_neg_lo = 0, mul_neg_hi = 0, mul_opsel_lo = 0, mul_opsel_hi = 0;
3805 uint32_t uses = UINT32_MAX;
3807 /* find the 'best' mul instruction to combine with the add */
3808 for (unsigned i = 0; i < 2; i++) {
3809 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
/* Case 1: the operand is itself a VOP3P mul. */
3813 if (ctx.info[instr->operands[i].tempId()].is_vop3p()) {
3815 if (op_instr->opcode != aco_opcode::v_pk_mul_f16 ||
3816 op_instr->definitions[0].isPrecise())
3819 if (op_instr->opcode != aco_opcode::v_pk_mul_lo_u16)
3823 Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
/* Prefer the mul with fewer uses; operands must be VOP3-encodable. */
3824 if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3827 /* no clamp allowed between mul and add */
3828 if (op_instr->valu().clamp)
3831 mul_instr = op_instr;
3833 uses = ctx.uses[instr->operands[i].tempId()];
3834 mul_neg_lo = mul_instr->valu().neg_lo;
3835 mul_neg_hi = mul_instr->valu().neg_hi;
3836 mul_opsel_lo = mul_instr->valu().opsel_lo;
3837 mul_opsel_hi = mul_instr->valu().opsel_hi;
/* Case 2: a 16-bit scalar mul (VOP2/VOP3/SDWA) feeding one packed half. */
3838 } else if (instr->operands[i].bytes() == 2) {
3839 if ((fadd && (op_instr->opcode != aco_opcode::v_mul_f16 ||
3840 op_instr->definitions[0].isPrecise())) ||
3841 (!fadd && op_instr->opcode != aco_opcode::v_mul_lo_u16 &&
3842 op_instr->opcode != aco_opcode::v_mul_lo_u16_e64))
3845 if (op_instr->valu().clamp || op_instr->valu().omod || op_instr->valu().abs)
/* SDWA sub-word selections narrower than 16 bits can't be expressed in VOP3P. */
3848 if (op_instr->isDPP() || (op_instr->isSDWA() && (op_instr->sdwa().sel[0].size() < 2 ||
3849 op_instr->sdwa().sel[1].size() < 2)))
3852 Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
3853 if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3856 mul_instr = op_instr;
3858 uses = ctx.uses[instr->operands[i].tempId()];
/* Scalar neg applies to both halves; derive opsel from SDWA offsets or VOP3 opsel. */
3859 mul_neg_lo = mul_instr->valu().neg;
3860 mul_neg_hi = mul_instr->valu().neg;
3861 if (mul_instr->isSDWA()) {
3862 for (unsigned j = 0; j < 2; j++)
3863 mul_opsel_lo[j] = mul_instr->sdwa().sel[j].offset();
3865 mul_opsel_lo = mul_instr->valu().opsel;
3867 mul_opsel_hi = mul_opsel_lo;
3874 /* turn mul + packed add into v_pk_fma_f16 */
3875 aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
3876 aco_ptr<VALU_instruction> fma{create_instruction<VALU_instruction>(mad, Format::VOP3P, 3, 1)};
3877 fma->operands[0] = copy_operand(ctx, mul_instr->operands[0]);
3878 fma->operands[1] = copy_operand(ctx, mul_instr->operands[1]);
3879 fma->operands[2] = instr->operands[add_op_idx];
3880 fma->clamp = vop3p->clamp;
3881 fma->neg_lo = mul_neg_lo;
3882 fma->neg_hi = mul_neg_hi;
3883 fma->opsel_lo = mul_opsel_lo;
3884 fma->opsel_hi = mul_opsel_hi;
/* Apply the add's swizzle of the mul result, then carry over the addend's
 * own swizzle/neg bits into operand 2. */
3885 propagate_swizzles(fma.get(), vop3p->opsel_lo[1 - add_op_idx],
3886 vop3p->opsel_hi[1 - add_op_idx]);
3887 fma->opsel_lo[2] = vop3p->opsel_lo[add_op_idx];
3888 fma->opsel_hi[2] = vop3p->opsel_hi[add_op_idx];
3889 fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
3890 fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
3891 fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
3892 fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
3893 fma->definitions[0] = instr->definitions[0];
3894 fma->pass_flags = instr->pass_flags;
3895 instr = std::move(fma);
3896 ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
3897 decrease_uses(ctx, mul_instr);
/* Whether this instruction can become (or already is) a v_fma_mix* instruction. */
3903 can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3905 if (ctx.program->gfx_level < GFX9)
3908 /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */
3909 if (ctx.program->gfx_level == GFX9 && ctx.fp_mode.denorm16_64)
3912 switch (instr->opcode) {
3913 case aco_opcode::v_add_f32:
3914 case aco_opcode::v_sub_f32:
3915 case aco_opcode::v_subrev_f32:
3916 case aco_opcode::v_mul_f32:
3917 case aco_opcode::v_fma_f32: break;
3918 case aco_opcode::v_fma_mix_f32:
3919 case aco_opcode::v_fma_mixlo_f16: return true;
3920 default: return false;
/* When mad_mix is not fused on this chip, contracting a precise fma would
 * change rounding behavior. */
3923 if (instr->opcode == aco_opcode::v_fma_f32 && !ctx.program->dev.fused_mad_mix &&
3924 instr->definitions[0].isPrecise())
/* mad_mix has no omod and can't carry SDWA/DPP encodings. */
3927 return !instr->valu().omod && !instr->isSDWA() && !instr->isDPP();
/* Rewrite a plain f32 add/sub/mul/fma as v_fma_mix_f32 (a*b+c form). */
3931 to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* Adds map to 1.0*a + b, so their operands shift to slots 1..2. */
3933 bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
3935 aco_ptr<VALU_instruction> vop3p{
3936 create_instruction<VALU_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};
/* In mad_mix, neg_lo carries neg and neg_hi carries abs for each operand. */
3938 for (unsigned i = 0; i < instr->operands.size(); i++) {
3939 vop3p->operands[is_add + i] = instr->operands[i];
3940 vop3p->neg_lo[is_add + i] = instr->valu().neg[i];
3941 vop3p->neg_hi[is_add + i] = instr->valu().abs[i];
/* mul: a*b + (-0.0); add/sub: 1.0*a ± b with the sign on the proper operand. */
3943 if (instr->opcode == aco_opcode::v_mul_f32) {
3944 vop3p->operands[2] = Operand::zero();
3945 vop3p->neg_lo[2] = true;
3946 } else if (is_add) {
3947 vop3p->operands[0] = Operand::c32(0x3f800000);
3948 if (instr->opcode == aco_opcode::v_sub_f32)
3949 vop3p->neg_lo[2] ^= true;
3950 else if (instr->opcode == aco_opcode::v_subrev_f32)
3951 vop3p->neg_lo[1] ^= true;
3953 vop3p->definitions[0] = instr->definitions[0];
3954 vop3p->clamp = instr->valu().clamp;
3955 vop3p->pass_flags = instr->pass_flags;
3956 instr = std::move(vop3p);
/* Keep only labels that remain valid; a label_mul must point at the new instr. */
3958 ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
3959 if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
3960 ctx.info[instr->definitions[0].tempId()].instr = instr.get();
/* Fold a following f32->f16 conversion into the instruction by turning it into
 * v_fma_mixlo_f16: instr + v_cvt_f16_f32 -> v_fma_mixlo_f16. */
3964 combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3966 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3967 if (!def_info.is_f2f16())
3969 Instruction* conv = def_info.instr;
/* This def must be mad_mix-capable and used only by the conversion. */
3971 if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1)
3974 if (!ctx.uses[conv->definitions[0].tempId()])
3977 if (conv->usesModifiers())
3980 if (!instr->isVOP3P())
3981 to_mad_mix(ctx, instr)
/* mixlo writes an f16 result in the low half; take over the conv's temp. */
3983 instr->opcode = aco_opcode::v_fma_mixlo_f16;
3984 instr->definitions[0].swapTemp(conv->definitions[0]);
3985 if (conv->definitions[0].isPrecise())
3986 instr->definitions[0].setPrecise(true);
3987 ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
3988 ctx.uses[conv->definitions[0].tempId()]--;
/* Fold f16->f32 conversions on the operands into the instruction by turning it
 * into v_fma_mix*, which can read f16 sources directly via opsel_hi. */
3994 combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3996 if (!can_use_mad_mix(ctx, instr))
3999 for (unsigned i = 0; i < instr->operands.size(); i++) {
4000 if (!instr->operands[i].isTemp())
4002 Temp tmp = instr->operands[i].getTemp();
4003 if (!ctx.info[tmp.id()].is_f2f32())
4006 Instruction* conv = ctx.info[tmp.id()].instr;
/* The conversion must be a plain one: no clamp/omod, no sub-word SDWA
 * selections other than a 16-bit source, no DPP. */
4007 if (conv->valu().clamp || conv->valu().omod) {
4009 } else if (conv->isSDWA() &&
4010 (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2)) {
4012 } else if (conv->isDPP()) {
4016 if (get_operand_size(instr, i) != 32)
4019 /* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
4020 * check_vop3_operands(). */
4022 for (unsigned j = 0; j < instr->operands.size(); j++)
4023 op[j] = instr->operands[j];
4024 op[i] = conv->operands[0];
4025 if (!check_vop3_operands(ctx, instr->operands.size(), op))
4028 if (!instr->isVOP3P()) {
4030 instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
4031 to_mad_mix(ctx, instr);
/* If the conversion survives for other users, its source gains a use here. */
4035 if (--ctx.uses[tmp.id()])
4036 ctx.uses[conv->operands[0].tempId()]++;
4037 instr->operands[i].setTemp(conv->operands[0].getTemp());
4038 if (conv->definitions[0].isPrecise())
4039 instr->definitions[0].setPrecise(true);
/* opsel_hi marks the operand as f16; opsel_lo picks which half to read. */
4040 instr->valu().opsel_hi[i] = true;
4041 if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2)
4042 instr->valu().opsel_lo[i] = true;
4044 instr->valu().opsel_lo[i] = conv->valu().opsel[0];
/* Carry the conversion's neg/abs onto the operand; abs overrides an inner neg. */
4045 bool neg = conv->valu().neg[0];
4046 bool abs = conv->valu().abs[0];
4047 if (!instr->valu().abs[i]) {
4048 instr->valu().neg[i] ^= neg;
4049 instr->valu().abs[i] = abs;
4054 // TODO: we could possibly move the whole label_instruction pass to combine_instruction:
4055 // this would mean that we'd have to fix the instruction uses while value propagation
4057 /* also returns true for inf */
/* Returns true if the float operand is a power of two with magnitude >= 1.0
 * (exponent at or above the bias, zero mantissa) — per the comment above,
 * this also returns true for infinity. Looks through SSA temps that are
 * known constants/literals. Handles f32, f16 and f64 encodings by size.
 * NOTE(review): decimated excerpt — the `return false;` for non-constant
 * operands and some braces are not visible here.
 */
4059 is_pow_of_two(opt_ctx& ctx, Operand op)
/* A temp labelled as a constant is re-checked as its materialized constant. */
4061 if (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(op.bytes() * 8))
4062 return is_pow_of_two(ctx, get_constant_op(ctx, ctx.info[op.tempId()], op.bytes() * 8));
4063 else if (!op.isConstant())
4066 uint64_t val = op.constantValue64();
/* binary32: 8-bit exponent at bit 23, bias 127. */
4068 if (op.bytes() == 4) {
4069 uint32_t exponent = (val & 0x7f800000) >> 23;
4070 uint32_t fraction = val & 0x007fffff;
4071 return (exponent >= 127) && (fraction == 0);
/* binary16: 5-bit exponent at bit 10, bias 15. */
4072 } else if (op.bytes() == 2) {
4073 uint32_t exponent = (val & 0x7c00) >> 10;
4074 uint32_t fraction = val & 0x03ff;
4075 return (exponent >= 15) && (fraction == 0);
/* binary64: 11-bit exponent at bit 52, bias 1023. */
4077 assert(op.bytes() == 8);
4078 uint64_t exponent = (val & UINT64_C(0x7ff0000000000000)) >> 52;
4079 uint64_t fraction = val & UINT64_C(0x000fffffffffffff);
4080 return (exponent >= 1023) && (fraction == 0);
/* Second optimizer pass (per instruction): combines instructions using the
 * labels collected by label_instruction() — applies SDWA extracts, SGPR
 * propagation, mad-mix folding, omod/clamp, mul+add -> mad/fma fusion, and a
 * long dispatch of opcode-specific peepholes (or3/xor3/add3, lshl_add,
 * min/max/med3 clamp detection, SALU combines, ...).
 * NOTE(review): this excerpt is decimated — early returns, `continue`s and
 * many closing braces are not visible; comments below are based on the
 * visible lines only.
 */
4085 combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
/* Nothing to do for dead code or pure side-effect instructions. */
4087 if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
4090 if (instr->isVALU()) {
4091 /* Apply SDWA. Do this after label_instruction() so it can remove
4092 * label_extract if not all instructions can take SDWA. */
4093 for (unsigned i = 0; i < instr->operands.size(); i++) {
4094 Operand& op = instr->operands[i];
4097 ssa_info& info = ctx.info[op.tempId()];
4098 if (!info.is_extract())
4100 /* if there are that many uses, there are likely better combinations */
4101 // TODO: delay applying extract to a point where we know better
4102 if (ctx.uses[op.tempId()] > 4) {
4103 info.label &= ~label_extract;
4106 if (info.is_extract() &&
4107 (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
4108 instr->operands[i].getTemp().type() == RegType::sgpr) &&
4109 can_apply_extract(ctx, instr, i, info)) {
4110 /* Increase use count of the extract's operand if the extract still has uses. */
4111 apply_extract(ctx, instr, i, info);
4112 if (--ctx.uses[instr->operands[i].tempId()])
4113 ctx.uses[info.instr->operands[0].tempId()]++;
4114 instr->operands[i].setTemp(info.instr->operands[0].getTemp());
/* General VALU cleanups: SGPR propagation, fma_mix operand folding, then
 * alternate omod/clamp application with output-conversion folding until
 * neither makes progress, and finally insert (dst_sel) application. */
4118 if (can_apply_sgprs(ctx, instr))
4119 apply_sgprs(ctx, instr);
4120 combine_mad_mix(ctx, instr);
4121 while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
4123 apply_insert(ctx, instr);
/* Packed-math VOP3P (except the fma_mix opcodes) has its own combiner. */
4126 if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 &&
4127 instr->opcode != aco_opcode::v_fma_mixlo_f16)
4128 return combine_vop3p(ctx, instr);
/* The peepholes below don't handle SDWA/DPP encodings. */
4130 if (instr->isSDWA() || instr->isDPP())
/* extract(extract) -> single extract, when legal. */
4133 if (instr->opcode == aco_opcode::p_extract) {
4134 ssa_info& info = ctx.info[instr->operands[0].tempId()];
4135 if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
4136 apply_extract(ctx, instr, 0, info);
4137 if (--ctx.uses[instr->operands[0].tempId()])
4138 ctx.uses[info.instr->operands[0].tempId()]++;
4139 instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4142 apply_ds_extract(ctx, instr);
4145 if (instr->isVOPC()) {
4146 if (optimize_cmp_subgroup_invocation(ctx, instr))
4150 /* TODO: There are still some peephole optimizations that could be done:
4151 * - abs(a - b) -> s_absdiff_i32
4152 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
4153 * - patterns for v_alignbit_b32 and v_alignbyte_b32
4154 * These aren't probably too interesting though.
4155 * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
4156 * probably more useful than the previously mentioned optimizations.
4157 * The various comparison optimizations also currently only work with 32-bit
4160 /* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */
4161 if ((ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)) &&
4162 ctx.uses[instr->operands[1].tempId()] == 1) {
4163 Temp val = ctx.info[instr->definitions[0].tempId()].temp;
4165 if (!ctx.info[val.id()].is_mul())
4168 Instruction* mul_instr = ctx.info[val.id()].instr;
/* Bail out on encodings/modes where rebuilding the mul would change the
 * encoding constraints or the float semantics. */
4170 if (mul_instr->operands[0].isLiteral())
4172 if (mul_instr->valu().clamp)
4174 if (mul_instr->isSDWA() || mul_instr->isDPP())
4176 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 &&
4177 ctx.fp_mode.preserve_signed_zero_inf_nan32)
4179 if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes())
4182 /* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */
4183 ctx.uses[mul_instr->definitions[0].tempId()]--;
4184 Definition def = instr->definitions[0];
4185 bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg();
4186 bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
4187 uint32_t pass_flags = instr->pass_flags;
/* VOP2 can't encode input modifiers, so promote to VOP3 in that case. */
4188 Format format = mul_instr->format == Format::VOP2 ? asVOP3(Format::VOP2) : mul_instr->format;
4189 instr.reset(create_instruction<VALU_instruction>(mul_instr->opcode, format,
4190 mul_instr->operands.size(), 1));
4191 std::copy(mul_instr->operands.cbegin(), mul_instr->operands.cend(), instr->operands.begin());
4192 instr->pass_flags = pass_flags;
4193 instr->definitions[0] = def;
4194 VALU_instruction& new_mul = instr->valu();
4195 VALU_instruction& mul = mul_instr->valu();
4196 new_mul.neg = mul.neg;
4197 new_mul.abs = mul.abs;
4198 new_mul.omod = mul.omod;
4199 new_mul.opsel = mul.opsel;
4200 new_mul.opsel_lo = mul.opsel_lo;
4201 new_mul.opsel_hi = mul.opsel_hi;
/* abs(mul): force |a|*|b| (clears any neg on the sources). */
4203 new_mul.neg[0] = new_mul.neg[1] = false;
4204 new_mul.abs[0] = new_mul.abs[1] = true;
/* neg(mul): flip the sign of the first source. */
4206 new_mul.neg[0] ^= is_neg;
4207 new_mul.clamp = false;
/* The rebuilt mul is itself a mul — keep it eligible for mad fusion. */
4209 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
4213 /* combine mul+add -> mad */
/* is_add_mix: a v_fma_mix whose first source is the constant 1.0 (f32 or
 * f16 depending on opsel_hi), i.e. effectively an add of src1 and src2. */
4215 (instr->opcode == aco_opcode::v_fma_mix_f32 ||
4216 instr->opcode == aco_opcode::v_fma_mixlo_f16) &&
4217 !instr->valu().neg_lo[0] &&
4218 ((instr->operands[0].constantEquals(0x3f800000) && !instr->valu().opsel_hi[0]) ||
4219 (instr->operands[0].constantEquals(0x3C00) && instr->valu().opsel_hi[0] &&
4220 !instr->valu().opsel_lo[0]));
4221 bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
4222 instr->opcode == aco_opcode::v_subrev_f32;
4223 bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
4224 instr->opcode == aco_opcode::v_subrev_f16;
4225 bool mad64 = instr->opcode == aco_opcode::v_add_f64;
4226 if (is_add_mix || mad16 || mad32 || mad64) {
4227 Instruction* mul_instr = nullptr;
4228 unsigned add_op_idx = 0;
4229 uint32_t uses = UINT32_MAX;
4230 bool emit_fma = false;
4231 /* find the 'best' mul instruction to combine with the add */
4232 for (unsigned i = is_add_mix ? 1 : 0; i < instr->operands.size(); i++) {
4233 if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
4235 ssa_info& info = ctx.info[instr->operands[i].tempId()];
4237 /* no clamp/omod allowed between mul and add */
4238 if (info.instr->isVOP3() && (info.instr->valu().clamp || info.instr->valu().omod))
4240 if (info.instr->isVOP3P() && info.instr->valu().clamp)
4242 /* v_fma_mix_f32/etc can't do omod */
4243 if (info.instr->isVOP3P() && instr->isVOP3() && instr->valu().omod)
4245 /* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
4246 if (is_add_mix && info.instr->definitions[0].bytes() == 2)
4249 if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
4252 bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32;
4253 bool mad_mix = is_add_mix || info.instr->isVOP3P();
4255 /* Multiplication by power-of-two should never need rounding. 1/power-of-two also works,
4256 * but using fma removes denormal flushing (0xfffffe * 0.5 + 0x810001a2).
4258 bool is_fma_precise = is_pow_of_two(ctx, info.instr->operands[0]) ||
4259 is_pow_of_two(ctx, info.instr->operands[1]);
/* Hardware availability of the fused vs. the separately-rounded form
 * depends on opcode, gfx level and device capabilities. */
4261 bool has_fma = mad16 || mad64 || (legacy && ctx.program->gfx_level >= GFX10_3) ||
4262 (mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) ||
4263 (mad_mix && ctx.program->dev.fused_mad_mix);
4264 bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix
4265 : ((mad32 && ctx.program->gfx_level < GFX10_3) ||
4266 (mad16 && ctx.program->gfx_level <= GFX9));
/* fma is allowed when precision is preserved (or not requested); mad only
 * when denormals are flushed, since mad rounds the intermediate result. */
4269 (!(info.instr->definitions[0].isPrecise() || instr->definitions[0].isPrecise()) ||
4272 has_mad && (mad_mix || mad32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64) == 0;
4273 if (mad_mix && legacy)
4275 if (!can_use_fma && !can_use_mad)
/* The remaining (non-mul) operand becomes the addend. */
4278 unsigned candidate_add_op_idx = is_add_mix ? (3 - i) : (1 - i);
4279 Operand op[3] = {info.instr->operands[0], info.instr->operands[1],
4280 instr->operands[candidate_add_op_idx]};
4281 if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
4282 ctx.uses[instr->operands[i].tempId()] > uses)
4285 if (ctx.uses[instr->operands[i].tempId()] == uses) {
/* Tie-break on temp id so the choice is deterministic. */
4286 unsigned cur_idx = mul_instr->definitions[0].tempId();
4287 unsigned new_idx = info.instr->definitions[0].tempId();
4288 if (cur_idx > new_idx)
4292 mul_instr = info.instr;
4293 add_op_idx = candidate_add_op_idx;
4294 uses = ctx.uses[instr->operands[i].tempId()];
4295 emit_fma = !can_use_mad;
4299 /* turn mul+add into v_mad/v_fma */
4300 Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
4301 instr->operands[add_op_idx]};
/* If the mul result still has other users, its sources gain a use each. */
4302 ctx.uses[mul_instr->definitions[0].tempId()]--;
4303 if (ctx.uses[mul_instr->definitions[0].tempId()]) {
4305 ctx.uses[op[0].tempId()]++;
4307 ctx.uses[op[1].tempId()]++;
/* Gather modifiers: slots 0/1 from the mul, slot 2 from the addend. */
4310 bool neg[3] = {false, false, false};
4311 bool abs[3] = {false, false, false};
4314 bitarray8 opsel_lo = 0;
4315 bitarray8 opsel_hi = 0;
4316 bitarray8 opsel = 0;
4317 unsigned mul_op_idx = (instr->isVOP3P() ? 3 : 1) - add_op_idx;
4319 VALU_instruction& valu_mul = mul_instr->valu();
4320 neg[0] = valu_mul.neg[0];
4321 neg[1] = valu_mul.neg[1];
4322 abs[0] = valu_mul.abs[0];
4323 abs[1] = valu_mul.abs[1];
4324 opsel_lo = valu_mul.opsel_lo & 0x3;
4325 opsel_hi = valu_mul.opsel_hi & 0x3;
4326 opsel = valu_mul.opsel & 0x3;
4328 VALU_instruction& valu = instr->valu();
4329 neg[2] = valu.neg[add_op_idx];
4330 abs[2] = valu.abs[add_op_idx];
4331 opsel_lo[2] = valu.opsel_lo[add_op_idx];
4332 opsel_hi[2] = valu.opsel_hi[add_op_idx];
4333 opsel[2] = valu.opsel[add_op_idx];
4334 opsel[3] = valu.opsel[3];
4337 /* abs of the multiplication result */
4338 if (valu.abs[mul_op_idx]) {
4344 /* neg of the multiplication result */
/* A neg on the mul operand of the add is folded into one mul source. */
4345 neg[1] ^= valu.neg[mul_op_idx];
/* sub negates the addend (or the product, for subrev), depending on which
 * side of the subtraction the mul sat on. */
4347 if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
4348 neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
4349 else if (instr->opcode == aco_opcode::v_subrev_f32 ||
4350 instr->opcode == aco_opcode::v_subrev_f16)
4351 neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
/* Build the fused instruction; choose opcode by encoding and gfx level. */
4353 aco_ptr<Instruction> add_instr = std::move(instr);
4354 aco_ptr<VALU_instruction> mad;
4355 if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
4359 aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? aco_opcode::v_fma_mixlo_f16
4360 : aco_opcode::v_fma_mix_f32;
4361 mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3P, 3, 1));
4366 aco_opcode mad_op = emit_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
4367 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) {
4368 assert(emit_fma == (ctx.program->gfx_level >= GFX10_3));
4369 mad_op = emit_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32;
4371 mad_op = emit_fma ? (ctx.program->gfx_level == GFX8 ? aco_opcode::v_fma_legacy_f16
4372 : aco_opcode::v_fma_f16)
4373 : (ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_f16
4374 : aco_opcode::v_mad_f16);
4376 mad_op = aco_opcode::v_fma_f64;
4379 mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1));
4382 for (unsigned i = 0; i < 3; i++) {
4383 mad->operands[i] = op[i];
4384 mad->neg[i] = neg[i];
4385 mad->abs[i] = abs[i];
4389 mad->opsel_lo = opsel_lo;
4390 mad->opsel_hi = opsel_hi;
4392 mad->definitions[0] = add_instr->definitions[0];
4393 mad->definitions[0].setPrecise(add_instr->definitions[0].isPrecise() ||
4394 mul_instr->definitions[0].isPrecise());
4395 mad->pass_flags = add_instr->pass_flags;
4397 instr = std::move(mad);
4399 /* mark this ssa_def to be re-checked for profitability and literals */
4400 ctx.mad_infos.emplace_back(std::move(add_instr), mul_instr->definitions[0].tempId());
4401 ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
4405 /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
4406 else if (((instr->opcode == aco_opcode::v_mul_f32 &&
4407 !ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
4408 instr->opcode == aco_opcode::v_mul_legacy_f32) &&
4409 !instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) {
4410 for (unsigned i = 0; i < 2; i++) {
4411 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
4412 ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
4413 instr->operands[!i].getTemp().type() == RegType::vgpr) {
4414 ctx.uses[instr->operands[i].tempId()]--;
4415 ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
4417 aco_ptr<VALU_instruction> new_instr{
4418 create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
4419 new_instr->operands[0] = Operand::zero();
4420 new_instr->operands[1] = instr->operands[!i];
4421 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
4422 new_instr->definitions[0] = instr->definitions[0];
4423 new_instr->pass_flags = instr->pass_flags;
4424 instr = std::move(new_instr);
4425 ctx.info[instr->definitions[0].tempId()].label = 0;
/* Opcode-specific peephole dispatch follows. Each combine_* helper returns
 * whether it rewrote the instruction; the empty `{}` branches rely on that. */
4429 } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->gfx_level >= GFX9) {
4430 if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
4432 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
4434 } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4435 } else if (combine_v_andor_not(ctx, instr)) {
4437 } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) {
4438 if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
4440 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
4442 } else if (combine_xor_not(ctx, instr)) {
4444 } else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
4445 combine_not_xor(ctx, instr);
4446 } else if (instr->opcode == aco_opcode::v_add_u16) {
4447 combine_three_valu_op(
4448 ctx, instr, aco_opcode::v_mul_lo_u16,
4449 ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16,
4451 } else if (instr->opcode == aco_opcode::v_add_u16_e64) {
4452 combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120",
4454 } else if (instr->opcode == aco_opcode::v_add_u32) {
4455 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4456 } else if (combine_add_bcnt(ctx, instr)) {
4457 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4458 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4459 } else if (ctx.program->gfx_level >= GFX9 && !instr->usesModifiers()) {
4460 if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
4462 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
4464 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
4466 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
4468 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
4470 } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4473 } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
4474 instr->opcode == aco_opcode::v_add_co_u32_e64) {
/* Most combines are illegal if something reads the carry output. */
4475 bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
4476 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4477 } else if (!carry_out && combine_add_bcnt(ctx, instr)) {
4478 } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4479 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4480 } else if (!carry_out && combine_add_lshl(ctx, instr, false)) {
4482 } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
4483 instr->opcode == aco_opcode::v_sub_co_u32_e64) {
4485 instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0;
4486 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) {
4487 } else if (!carry_out && combine_add_lshl(ctx, instr, true)) {
4489 } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
4490 instr->opcode == aco_opcode::v_subrev_co_u32 ||
4491 instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
4492 combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
4493 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) {
4494 combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
4496 } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
4497 ctx.program->gfx_level >= GFX9) {
4498 combine_salu_lshl_add(ctx, instr);
4499 } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
4500 if (!combine_salu_not_bitwise(ctx, instr))
4501 combine_inverse_comparison(ctx, instr);
4502 } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
4503 instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
4504 if (combine_ordering_test(ctx, instr)) {
4505 } else if (combine_comparison_ordering(ctx, instr)) {
4506 } else if (combine_constant_comparison_ordering(ctx, instr)) {
4507 } else if (combine_salu_n2(ctx, instr)) {
4509 } else if (instr->opcode == aco_opcode::s_abs_i32) {
4510 combine_sabsdiff(ctx, instr);
4511 } else if (instr->opcode == aco_opcode::v_and_b32) {
4512 if (combine_and_subbrev(ctx, instr)) {
4513 } else if (combine_v_andor_not(ctx, instr)) {
4515 } else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) {
4516 /* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32.
4517 * since ctx.uses[mad_info::mul_temp_id] is always 0, we don't have to worry about
4518 * select_instruction() using mad_info::add_instr.
4520 ctx.mad_infos.emplace_back(nullptr, 0);
4521 ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
/* med3(x, 0, 1)-style clamp patterns become v_add with the clamp bit set. */
4522 } else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
4524 if (detect_clamp(instr.get(), &idx)) {
4525 instr->format = asVOP3(Format::VOP2);
4526 instr->operands[0] = instr->operands[idx];
4527 instr->operands[1] = Operand::zero();
4529 instr->opcode == aco_opcode::v_med3_f32 ? aco_opcode::v_add_f32 : aco_opcode::v_add_f16;
4530 instr->valu().clamp = true;
4531 instr->valu().abs = (uint8_t)instr->valu().abs[idx];
4532 instr->valu().neg = (uint8_t)instr->valu().neg[idx];
4533 instr->operands.pop_back();
/* Generic min/max family: min(min) -> min3 (and max3/med3/minmax forms). */
4536 aco_opcode min, max, min3, max3, med3, minmax;
4537 bool some_gfx9_only;
4538 if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
4540 (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
4541 if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
4542 instr->opcode == min ? min3 : max3, minmax)) {
4544 combine_clamp(ctx, instr, min, max, med3);
/* Rewrites a uniform (wave-wide) boolean bitwise instruction into its 32-bit
 * scalar form operating on per-wave bool temps, rewiring each operand either
 * to its labelled uniform-bool temp or to the SCC definition of the producing
 * instruction. Returns whether the transform was applied (the callers branch
 * on that — see the call site in select_instruction()).
 * NOTE(review): decimated excerpt — `return false;` paths, `continue`s and
 * several closing braces are not visible here.
 */
4551 to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4553 /* Check every operand to make sure they are suitable. */
4554 for (Operand& op : instr->operands) {
4557 if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
/* Map the 64-bit (or 32-bit) wave-mask opcode to the 32-bit scalar one.
 * xor becomes s_absdiff_i32, which also writes SCC = (result != 0). */
4561 switch (instr->opcode) {
4562 case aco_opcode::s_and_b32:
4563 case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
4564 case aco_opcode::s_or_b32:
4565 case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
4566 case aco_opcode::s_xor_b32:
4567 case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
4569 /* Don't transform other instructions. They are very unlikely to appear here. */
/* Rewire the operands; use counts move from the old temp to the new one. */
4573 for (Operand& op : instr->operands) {
4574 ctx.uses[op.tempId()]--;
4576 if (ctx.info[op.tempId()].is_uniform_bool()) {
4577 /* Just use the uniform boolean temp. */
4578 op.setTemp(ctx.info[op.tempId()].temp);
4579 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
4580 /* Use the SCC definition of the predecessor instruction.
4581 * This allows the predecessor to get picked up by the same optimization (if it has no
4582 * divergent users), and it also makes sure that the current instruction will keep working
4583 * even if the predecessor won't be transformed.
4585 Instruction* pred_instr = ctx.info[op.tempId()].instr;
4586 assert(pred_instr->definitions.size() >= 2);
4587 assert(pred_instr->definitions[1].isFixed() &&
4588 pred_instr->definitions[1].physReg() == scc);
4589 op.setTemp(pred_instr->definitions[1].getTemp());
4591 unreachable("Invalid operand on uniform bitwise instruction.");
4594 ctx.uses[op.tempId()]++;
/* Shrink the (possibly 64-bit wave mask) destination to a single SGPR. */
4597 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
4598 assert(instr->operands[0].regClass() == s1);
4599 assert(instr->operands[1].regClass() == s1);
4604 select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4606 const uint32_t threshold = 4;
4608 if (is_dead(ctx.uses, instr.get())) {
4613 /* convert split_vector into a copy or extract_vector if only one definition is ever used */
4614 if (instr->opcode == aco_opcode::p_split_vector) {
4615 unsigned num_used = 0;
4617 unsigned split_offset = 0;
4618 for (unsigned i = 0, offset = 0; i < instr->definitions.size();
4619 offset += instr->definitions[i++].bytes()) {
4620 if (ctx.uses[instr->definitions[i].tempId()]) {
4623 split_offset = offset;
4627 if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
4628 ctx.uses[instr->operands[0].tempId()] == 1) {
4629 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
4633 for (Operand& vec_op : vec->operands) {
4634 if (off == split_offset) {
4638 off += vec_op.bytes();
4640 if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
4641 ctx.uses[instr->operands[0].tempId()]--;
4642 for (Operand& vec_op : vec->operands) {
4643 if (vec_op.isTemp())
4644 ctx.uses[vec_op.tempId()]--;
4647 ctx.uses[op.tempId()]++;
4649 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4650 aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
4651 extract->operands[0] = op;
4652 extract->definitions[0] = instr->definitions[idx];
4653 instr = std::move(extract);
4659 if (!done && num_used == 1 &&
4660 instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
4661 split_offset % instr->definitions[idx].bytes() == 0) {
4662 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4663 aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
4664 extract->operands[0] = instr->operands[0];
4665 extract->operands[1] =
4666 Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
4667 extract->definitions[0] = instr->definitions[idx];
4668 instr = std::move(extract);
4672 mad_info* mad_info = NULL;
4673 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
4674 mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
4675 /* re-check mad instructions */
4676 if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
4677 ctx.uses[mad_info->mul_temp_id]++;
4678 if (instr->operands[0].isTemp())
4679 ctx.uses[instr->operands[0].tempId()]--;
4680 if (instr->operands[1].isTemp())
4681 ctx.uses[instr->operands[1].tempId()]--;
4682 instr.swap(mad_info->add_instr);
4685 /* check literals */
4686 else if (!instr->isDPP() && !instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_f64 &&
4687 instr->opcode != aco_opcode::v_mad_legacy_f32 &&
4688 instr->opcode != aco_opcode::v_fma_legacy_f32) {
4689 /* FMA can only take literals on GFX10+ */
4690 if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
4691 ctx.program->gfx_level < GFX10)
4693 /* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
4694 * literals (GFX10+), these instructions don't exist.
4696 if (instr->opcode == aco_opcode::v_fma_legacy_f16)
4699 uint32_t literal_mask = 0;
4700 uint32_t fp16_mask = 0;
4701 uint32_t sgpr_mask = 0;
4702 uint32_t vgpr_mask = 0;
4703 uint32_t literal_uses = UINT32_MAX;
4704 uint32_t literal_value = 0;
4706 /* Iterate in reverse to prefer v_madak/v_fmaak. */
4707 for (int i = 2; i >= 0; i--) {
4708 Operand& op = instr->operands[i];
4711 if (ctx.info[op.tempId()].is_literal(get_operand_size(instr, i))) {
4712 uint32_t new_literal = ctx.info[op.tempId()].val;
4713 float value = uif(new_literal);
4714 uint16_t fp16_val = _mesa_float_to_half(value);
4715 bool is_denorm = (fp16_val & 0x7fff) != 0 && (fp16_val & 0x7fff) <= 0x3ff;
4716 if (_mesa_half_to_float(fp16_val) == value &&
4717 (!is_denorm || (ctx.fp_mode.denorm16_64 & fp_denorm_keep_in)))
4718 fp16_mask |= 1 << i;
4720 if (!literal_mask || literal_value == new_literal) {
4721 literal_value = new_literal;
4722 literal_uses = MIN2(literal_uses, ctx.uses[op.tempId()]);
4723 literal_mask |= 1 << i;
4727 sgpr_mask |= op.isOfType(RegType::sgpr) << i;
4728 vgpr_mask |= op.isOfType(RegType::vgpr) << i;
4731 /* The constant bus limitations before GFX10 disallows SGPRs. */
4732 if (sgpr_mask && ctx.program->gfx_level < GFX10)
4735 /* Encoding needs a vgpr. */
4739 /* v_madmk/v_fmamk needs a vgpr in the third source. */
4740 if (!(literal_mask & 0b100) && !(vgpr_mask & 0b100))
4743 /* opsel with GFX11+ is the only modifier supported by fmamk/fmaak*/
4744 if (instr->valu().abs || instr->valu().neg || instr->valu().omod || instr->valu().clamp ||
4745 (instr->valu().opsel && ctx.program->gfx_level < GFX11))
4748 if (instr->valu().opsel & ~vgpr_mask)
4751 /* We can't use three unique fp16 literals */
4752 if (fp16_mask == 0b111)
4755 if ((instr->opcode == aco_opcode::v_fma_f32 ||
4756 (instr->opcode == aco_opcode::v_mad_f32 && !instr->definitions[0].isPrecise())) &&
4757 !instr->valu().omod && ctx.program->gfx_level >= GFX10 &&
4758 util_bitcount(fp16_mask) > std::max<uint32_t>(util_bitcount(literal_mask), 1)) {
4759 assert(ctx.program->dev.fused_mad_mix);
4760 u_foreach_bit (i, fp16_mask)
4761 ctx.uses[instr->operands[i].tempId()]--;
4762 mad_info->fp16_mask = fp16_mask;
4766 /* Limit the number of literals to apply to not increase the code
4767 * size too much, but always apply literals for v_mad->v_madak
4768 * because both instructions are 64-bit and this doesn't increase
4770 * TODO: try to apply the literals earlier to lower the number of
4771 * uses below threshold
4773 if (literal_mask && (literal_uses < threshold || (literal_mask & 0b100))) {
4774 u_foreach_bit (i, literal_mask)
4775 ctx.uses[instr->operands[i].tempId()]--;
4776 mad_info->literal_mask = literal_mask;
4782 /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
4783 * when it isn't beneficial */
4784 if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
4785 instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
4786 ctx.info[instr->operands[0].tempId()].set_scc_needed();
4788 } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
4789 instr->opcode == aco_opcode::s_cselect_b32) &&
4790 instr->operands[2].isTemp()) {
4791 ctx.info[instr->operands[2].tempId()].set_scc_needed();
4794 /* check for literals */
4795 if (!instr->isSALU() && !instr->isVALU())
4798 /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
4799 if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
4800 ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
4801 bool transform_done = to_uniform_bool_instr(ctx, instr);
4803 if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
4804 /* Swap the two definition IDs in order to avoid overusing the SCC.
4805 * This reduces extra moves generated by RA. */
4806 uint32_t def0_id = instr->definitions[0].getTemp().id();
4807 uint32_t def1_id = instr->definitions[1].getTemp().id();
4808 instr->definitions[0].setTemp(Temp(def1_id, s1));
4809 instr->definitions[1].setTemp(Temp(def0_id, s1));
4815 /* This optimization is done late in order to be able to apply otherwise
4816 * unsafe optimizations such as the inverse comparison optimization.
4818 if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
4819 if (instr->operands[0].isTemp() && fixed_to_exec(instr->operands[1]) &&
4820 ctx.uses[instr->operands[0].tempId()] == 1 &&
4821 ctx.uses[instr->definitions[1].tempId()] == 0 &&
4822 can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) {
4823 ctx.uses[instr->operands[0].tempId()]--;
4824 ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp(
4825 instr->definitions[0].getTemp());
4831 /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
4832 if (instr->isVALU() && !instr->isDPP()) {
4833 for (unsigned i = 0; i < instr->operands.size(); i++) {
4834 if (!instr->operands[i].isTemp())
4836 ssa_info info = ctx.info[instr->operands[i].tempId()];
4838 if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags)
4841 /* We won't eliminate the DPP mov if the operand is used twice */
4842 bool op_used_twice = false;
4843 for (unsigned j = 0; j < instr->operands.size(); j++)
4844 op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
4849 if (!can_swap_operands(instr, &instr->opcode, 0, i))
4851 instr->valu().swapOperands(0, i);
4854 if (!can_use_DPP(ctx.program->gfx_level, instr, info.is_dpp8()))
4857 bool dpp8 = info.is_dpp8();
4858 bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, 0) &&
4859 get_operand_size(instr, 0) == 32;
4860 bool mov_uses_mods = info.instr->valu().neg[0] || info.instr->valu().abs[0];
4861 if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
4864 convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
4867 DPP8_instruction* dpp = &instr->dpp8();
4868 for (unsigned j = 0; j < 8; ++j)
4869 dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
4871 instr->format = asVOP3(instr->format);
4873 DPP16_instruction* dpp = &instr->dpp16();
4874 dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
4875 dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
4878 instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
4879 instr->valu().abs[0] |= info.instr->valu().abs[0];
4881 if (--ctx.uses[info.instr->definitions[0].tempId()])
4882 ctx.uses[info.instr->operands[0].tempId()]++;
4883 instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4888 /* Use v_fma_mix for f2f32/f2f16 if it has higher throughput.
4889 * Do this late to not disturb other optimizations.
4891 if ((instr->opcode == aco_opcode::v_cvt_f32_f16 || instr->opcode == aco_opcode::v_cvt_f16_f32) &&
4892 ctx.program->gfx_level >= GFX11 && ctx.program->wave_size == 64 && !instr->valu().omod &&
4894 bool is_f2f16 = instr->opcode == aco_opcode::v_cvt_f16_f32;
4895 Instruction* fma = create_instruction<VALU_instruction>(
4896 is_f2f16 ? aco_opcode::v_fma_mixlo_f16 : aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1);
4897 fma->definitions[0] = instr->definitions[0];
4898 fma->operands[0] = instr->operands[0];
4899 fma->valu().opsel_hi[0] = !is_f2f16;
4900 fma->valu().opsel_lo[0] = instr->valu().opsel[0];
4901 fma->valu().clamp = instr->valu().clamp;
4902 fma->valu().abs[0] = instr->valu().abs[0];
4903 fma->valu().neg[0] = instr->valu().neg[0];
4904 fma->operands[1] = Operand::c32(fui(1.0f));
4905 fma->operands[2] = Operand::zero();
4906 /* fma_mix is only dual issued if dst and acc type match */
4907 fma->valu().opsel_hi[2] = is_f2f16;
4908 fma->valu().neg[2] = true;
4910 ctx.info[instr->definitions[0].tempId()].label = 0;
4913 if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
4914 (instr->isVOP3P() && ctx.program->gfx_level < GFX10))
4915 return; /* some encodings can't ever take literals */
4917 /* we do not apply the literals yet as we don't know if it is profitable */
4918 Operand current_literal(s1);
4920 unsigned literal_id = 0;
4921 unsigned literal_uses = UINT32_MAX;
4922 Operand literal(s1);
4923 unsigned num_operands = 1;
4924 if (instr->isSALU() || (ctx.program->gfx_level >= GFX10 &&
4925 (can_use_VOP3(ctx, instr) || instr->isVOP3P()) && !instr->isDPP()))
4926 num_operands = instr->operands.size();
4927 /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
4928 else if (instr->isVALU() && instr->operands.size() >= 3)
4931 unsigned sgpr_ids[2] = {0, 0};
4932 bool is_literal_sgpr = false;
4935 /* choose a literal to apply */
4936 for (unsigned i = 0; i < num_operands; i++) {
4937 Operand op = instr->operands[i];
4938 unsigned bits = get_operand_size(instr, i);
4940 if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
4941 op.tempId() != sgpr_ids[0])
4942 sgpr_ids[!!sgpr_ids[0]] = op.tempId();
4944 if (op.isLiteral()) {
4945 current_literal = op;
4947 } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
4951 if (!alu_can_accept_constant(instr, i))
4954 if (ctx.uses[op.tempId()] < literal_uses) {
4955 is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
4957 literal = Operand::c32(ctx.info[op.tempId()].val);
4958 literal_uses = ctx.uses[op.tempId()];
4959 literal_id = op.tempId();
4962 mask |= (op.tempId() == literal_id) << i;
4965 /* don't go over the constant bus limit */
4966 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
4967 instr->opcode == aco_opcode::v_lshrrev_b64 ||
4968 instr->opcode == aco_opcode::v_ashrrev_i64;
4969 unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
4970 if (ctx.program->gfx_level >= GFX10 && !is_shift64)
4971 const_bus_limit = 2;
4973 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
4974 if (num_sgprs == const_bus_limit && !is_literal_sgpr)
4977 if (literal_id && literal_uses < threshold &&
4978 (current_literal.isUndefined() ||
4979 (current_literal.size() == literal.size() &&
4980 current_literal.constantValue() == literal.constantValue()))) {
4981 /* mark the literal to be applied */
4983 unsigned i = u_bit_scan(&mask);
4984 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
4985 ctx.uses[instr->operands[i].tempId()]--;
/* Map an s_cmp_* (SOPC) 32-bit comparison to its s_cmpk_* (SOPK) form,
 * which encodes the constant as a 16-bit inline immediate.
 * Returns num_opcodes when the comparison has no SOPK equivalent. */
4991 sopk_opcode_for_sopc(aco_opcode opcode)
/* Each macro expansion covers both the signed and the unsigned variant. */
4994 case aco_opcode::s_cmp_##op##_i32: return aco_opcode::s_cmpk_##op##_i32; \
4995 case aco_opcode::s_cmp_##op##_u32: return aco_opcode::s_cmpk_##op##_u32;
/* No SOPK form for this opcode: signal "not convertible" to the caller. */
5003 default: return aco_opcode::num_opcodes;
/* Return whether the given SOPC comparison interprets its operands as
 * signed (_i32 -> true) rather than unsigned (_u32 -> false) integers. */
5009 sopc_is_signed(aco_opcode opcode)
5012 case aco_opcode::s_cmp_##op##_i32: return true; \
5013 case aco_opcode::s_cmp_##op##_u32: return false;
/* Callers must only pass SOPC compare opcodes. */
5021 default: unreachable("Not a valid SOPC instruction.");
/* Return the SOPC comparison that is equivalent to the given one with its
 * two source operands exchanged (used by try_convert_sopc_to_sopk after it
 * swaps operands to move a literal into the second slot).
 * Returns num_opcodes if no swapped form exists. */
5027 sopc_32_swapped(aco_opcode opcode)
/* op1 with exchanged operands behaves like op2, for both signednesses. */
5029 #define SOPC(op1, op2) \
5030 case aco_opcode::s_cmp_##op1##_i32: return aco_opcode::s_cmp_##op2##_i32; \
5031 case aco_opcode::s_cmp_##op1##_u32: return aco_opcode::s_cmp_##op2##_u32;
5039 default: return aco_opcode::num_opcodes;
/* Try to rewrite an SOPC compare-with-literal into the smaller SOPK form,
 * which carries the constant as a 16-bit immediate. Bails out (leaving the
 * instruction untouched) whenever the conversion is not possible. */
5045 try_convert_sopc_to_sopk(aco_ptr<Instruction>& instr)
5047 if (sopk_opcode_for_sopc(instr->opcode) == aco_opcode::num_opcodes)
/* SOPK takes the immediate in place of the second source, so move a literal
 * found in operand 0 over to operand 1 and use the operand-swapped opcode. */
5050 if (instr->operands[0].isLiteral()) {
5051 std::swap(instr->operands[0], instr->operands[1]);
5052 instr->opcode = sopc_32_swapped(instr->opcode);
5055 if (!instr->operands[1].isLiteral())
/* NOTE(review): physReg() >= 128 presumably rejects fixed non-SGPR/special
 * registers that SOPK cannot encode — confirm against the register file
 * layout used elsewhere in the backend. */
5058 if (instr->operands[0].isFixed() && instr->operands[0].physReg() >= 128)
5061 uint32_t value = instr->operands[1].constantValue();
5063 const uint32_t i16_mask = 0xffff8000u;
/* The literal must fit in 16 bits, either sign-extended (i16) or
 * zero-extended (u16); which one is acceptable depends on the compare. */
5065 bool value_is_i16 = (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
5066 bool value_is_u16 = !(value & 0xffff0000u);
5068 if (!value_is_i16 && !value_is_u16)
/* eq/lg give the same result for signed and unsigned operands, so their
 * signedness can be flipped to make the immediate encodable; only these two
 * comparisons are rewritten here. */
5071 if (!value_is_i16 && sopc_is_signed(instr->opcode)) {
5072 if (instr->opcode == aco_opcode::s_cmp_lg_i32)
5073 instr->opcode = aco_opcode::s_cmp_lg_u32;
5074 else if (instr->opcode == aco_opcode::s_cmp_eq_i32)
5075 instr->opcode = aco_opcode::s_cmp_eq_u32;
5078 } else if (!value_is_u16 && !sopc_is_signed(instr->opcode)) {
5079 if (instr->opcode == aco_opcode::s_cmp_lg_u32)
5080 instr->opcode = aco_opcode::s_cmp_lg_i32;
5081 else if (instr->opcode == aco_opcode::s_cmp_eq_u32)
5082 instr->opcode = aco_opcode::s_cmp_eq_i32;
/* The instruction object is reinterpreted in place as SOPK; this is only
 * valid because SOPK_instruction is no larger than SOPC_instruction. */
5087 static_assert(sizeof(SOPK_instruction) <= sizeof(SOPC_instruction),
5088 "Invalid direct instruction cast.");
5089 instr->format = Format::SOPK;
5090 SOPK_instruction* instr_sopk = &instr->sopk();
/* Encode the low 16 bits of the literal as the SOPK immediate and drop the
 * now-redundant literal operand. */
5092 instr_sopk->imm = instr_sopk->operands[1].constantValue() & 0xffff;
5093 instr_sopk->opcode = sopk_opcode_for_sopc(instr_sopk->opcode);
5094 instr_sopk->operands.pop_back();
/* Rewrite the literal operands of a v_pk_fma_f16 so they no longer need an
 * opsel swizzle: bake the selected 16-bit halves into the 32-bit literal
 * itself and reset the operand selection to the default layout. */
5098 unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5100 /* This opt is only beneficial for v_pk_fma_f16 because we can use v_pk_fmac_f16 if the
5101 * instruction doesn't use swizzles. */
5102 if (instr->opcode != aco_opcode::v_pk_fma_f16)
5105 VALU_instruction& vop3p = instr->valu();
/* First pass: every literal operand must use the same 2-bit swizzle
 * (bit 0 = opsel_lo, bit 1 = opsel_hi) for one rewrite to cover them all. */
5107 unsigned literal_swizzle = ~0u;
5108 for (unsigned i = 0; i < instr->operands.size(); i++) {
5109 if (!instr->operands[i].isLiteral())
5111 unsigned new_swizzle = vop3p.opsel_lo[i] | (vop3p.opsel_hi[i] << 1);
5112 if (literal_swizzle != ~0u && new_swizzle != literal_swizzle)
5113 return; /* Literal swizzles conflict. */
5114 literal_swizzle = new_swizzle;
/* 0b10 is the identity selection (lo from low half, hi from high half);
 * ~0u means no literal operand was seen. Nothing to do in either case. */
5117 if (literal_swizzle == 0b10 || literal_swizzle == ~0u)
5118 return; /* already unswizzled */
/* Second pass: shuffle each literal's halves to match the identity
 * selection, then reset its opsel bits to lo=false / hi=true. */
5120 for (unsigned i = 0; i < instr->operands.size(); i++) {
5121 if (!instr->operands[i].isLiteral())
5123 uint32_t literal = instr->operands[i].constantValue();
/* New low half = the 16 bits chosen by swizzle bit 0; new high half = the
 * bits chosen by swizzle bit 1 (note 8 * 0b10 == 16). */
5124 literal = (literal >> (16 * (literal_swizzle & 0x1)) & 0xffff) |
5125 (literal >> (8 * (literal_swizzle & 0x2)) << 16);
5126 instr->operands[i] = Operand::literal32(literal);
5127 vop3p.opsel_lo[i] = false;
5128 vop3p.opsel_hi[i] = true;
/* Phase-4 worker: materialize the literals chosen by the earlier selection
 * pass into the instruction stream, fold MADs into their VOP2 literal forms
 * (madmk/madak, fmamk/fmaak) or into v_fma_mix_f32, and apply SALU-specific
 * compactions (SOPK compares, s_add_i32 for s_addk). The finished
 * instruction is appended to ctx.instructions. */
5133 apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5135 /* Cleanup Dead Instructions */
5139 /* apply literals on MAD */
5140 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
5141 mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
/* Bit 2 of literal_mask marks the addend operand: a literal addend yields
 * the "madak" form, a literal multiplicand the "madmk" form. */
5142 const bool madak = (info->literal_mask & 0b100);
5143 bool has_dead_literal = false;
5144 u_foreach_bit (i, info->literal_mask | info->fp16_mask)
5145 has_dead_literal |= ctx.uses[instr->operands[i].tempId()] == 0;
/* Operands flagged in fp16_mask hold constants representable as half
 * floats: switch to v_fma_mix_f32 and inline them directly. */
5147 if (has_dead_literal && info->fp16_mask) {
5148 instr->format = Format::VOP3P;
5149 instr->opcode = aco_opcode::v_fma_mix_f32;
/* Pack up to two fp16 constants into one 32-bit literal; opsel_lo selects
 * which 16-bit half each operand reads. */
5151 uint32_t literal = 0;
5152 bool second = false;
5153 u_foreach_bit (i, info->fp16_mask) {
5154 float value = uif(ctx.info[instr->operands[i].tempId()].val);
5155 literal |= _mesa_float_to_half(value) << (second * 16);
5156 instr->valu().opsel_lo[i] = second;
5157 instr->valu().opsel_hi[i] = true;
5161 for (unsigned i = 0; i < 3; i++) {
5162 if (info->fp16_mask & (1 << i))
5163 instr->operands[i] = Operand::literal32(literal);
5166 ctx.instructions.emplace_back(std::move(instr));
5170 if (has_dead_literal || madak) {
/* Pick the VOP2 literal form matching the original opcode's precision and
 * mad-vs-fma flavor. */
5171 aco_opcode new_op = madak ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
5172 if (instr->opcode == aco_opcode::v_fma_f32)
5173 new_op = madak ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
5174 else if (instr->opcode == aco_opcode::v_mad_f16 ||
5175 instr->opcode == aco_opcode::v_mad_legacy_f16)
5176 new_op = madak ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
5177 else if (instr->opcode == aco_opcode::v_fma_f16)
5178 new_op = madak ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
/* ffs() is 1-based, so this reads the first literal operand's value. */
5180 uint32_t literal = ctx.info[instr->operands[ffs(info->literal_mask) - 1].tempId()].val;
5181 instr->format = Format::VOP2;
5182 instr->opcode = new_op;
5183 for (unsigned i = 0; i < 3; i++) {
5184 if (info->literal_mask & (1 << i))
5185 instr->operands[i] = Operand::literal32(literal);
/* Fix up the VOP2 operand order: madak wants a VGPR in src1, madmk wants
 * the literal as src1 and the addend moved to src2. */
5187 if (madak) { /* add literal -> madak */
5188 if (!instr->operands[1].isOfType(RegType::vgpr))
5189 instr->valu().swapOperands(0, 1);
5190 } else { /* mul literal -> madmk */
5191 if (!(info->literal_mask & 0b10))
5192 instr->valu().swapOperands(0, 1);
5193 instr->valu().swapOperands(1, 2);
5195 ctx.instructions.emplace_back(std::move(instr));
5200 /* apply literals on other SALU/VALU */
5201 if (instr->isSALU() || instr->isVALU()) {
5202 for (unsigned i = 0; i < instr->operands.size(); i++) {
5203 Operand op = instr->operands[i];
5204 unsigned bits = get_operand_size(instr, i);
/* Replace otherwise-dead constant temporaries with an inline literal; VALU
 * sources past src0 are promoted to VOP3 so the literal can be encoded. */
5205 if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
5206 Operand literal = Operand::literal32(ctx.info[op.tempId()].val);
5207 instr->format = withoutDPP(instr->format);
5208 if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
5209 instr->format = asVOP3(instr->format);
5210 instr->operands[i] = literal;
/* Compact SOPC compares with a literal into the SOPK immediate form. */
5215 if (instr->isSOPC())
5216 try_convert_sopc_to_sopk(instr);
5218 /* allow more s_addk_i32 optimizations if carry isn't used */
5219 if (instr->opcode == aco_opcode::s_add_u32 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
5220 (instr->operands[0].isLiteral() || instr->operands[1].isLiteral()))
5221 instr->opcode = aco_opcode::s_add_i32;
5223 if (instr->isVOP3P())
5224 unswizzle_vop3p_literals(ctx, instr);
5226 ctx.instructions.emplace_back(std::move(instr));
5230 optimize(Program* program)
5233 ctx.program = program;
5234 std::vector<ssa_info> info(program->peekAllocationId());
5235 ctx.info = info.data();
5237 /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
5238 for (Block& block : program->blocks) {
5239 ctx.fp_mode = block.fp_mode;
5240 for (aco_ptr<Instruction>& instr : block.instructions)
5241 label_instruction(ctx, instr);
5244 ctx.uses = dead_code_analysis(program);
5246 /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
5247 for (Block& block : program->blocks) {
5248 ctx.fp_mode = block.fp_mode;
5249 for (aco_ptr<Instruction>& instr : block.instructions)
5250 combine_instruction(ctx, instr);
5253 /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
5254 for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
5256 Block* block = &(*block_rit);
5257 ctx.fp_mode = block->fp_mode;
5258 for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
5260 select_instruction(ctx, *instr_rit);
5263 /* 4. Add literals to instructions */
5264 for (Block& block : program->blocks) {
5265 ctx.instructions.reserve(block.instructions.size());
5266 ctx.fp_mode = block.fp_mode;
5267 for (aco_ptr<Instruction>& instr : block.instructions)
5268 apply_literals(ctx, instr);
5269 block.instructions = std::move(ctx.instructions);