2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include "aco_builder.h"
28 #include "util/half_float.h"
29 #include "util/memstream.h"
39 perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
44 struct u_memstream mem;
45 u_memstream_open(&mem, &out, &outsize);
46 FILE* const memf = u_memstream_get(&mem);
48 fprintf(memf, "%s: ", msg);
49 aco_print_instr(program->gfx_level, instr, memf);
50 u_memstream_close(&mem);
52 aco_perfwarn(program, out);
55 if (debug_flags & DEBUG_PERFWARN)
62 * The optimizer works in 4 phases:
63 * (1) The first pass collects information for each ssa-def,
64 * propagates reg->reg operands of the same type, inline constants
65 * and neg/abs input modifiers.
66 * (2) The second pass combines instructions like mad, omod, clamp and
67 * propagates sgpr's on VALU instructions.
68 * This pass depends on information collected in the first pass.
69 * (3) The third pass goes backwards, and selects instructions,
70 * i.e. decides if a mad instruction is profitable and eliminates dead code.
71 * (4) The fourth pass cleans up the sequence: literals get applied and dead
72 * instructions are removed from the sequence.
76 aco_ptr<Instruction> add_instr;
78 uint16_t literal_mask;
81 mad_info(aco_ptr<Instruction> instr, uint32_t id)
82 : add_instr(std::move(instr)), mul_temp_id(id), literal_mask(0), fp16_mask(0)
88 label_constant_32bit = 1 << 1,
89 /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
90 * 32-bit operations but this shouldn't cause any issues because we don't
91 * look through any conversions */
96 label_literal = 1 << 6,
100 label_omod5 = 1 << 10,
101 label_clamp = 1 << 12,
102 label_undefined = 1 << 14,
105 label_add_sub = 1 << 17,
106 label_bitwise = 1 << 18,
107 label_minmax = 1 << 19,
108 label_vopc = 1 << 20,
109 label_uniform_bool = 1 << 21,
110 label_constant_64bit = 1 << 22,
111 label_uniform_bitwise = 1 << 23,
112 label_scc_invert = 1 << 24,
113 label_scc_needed = 1 << 26,
115 label_fcanonicalize = 1 << 28,
116 label_constant_16bit = 1 << 29,
117 label_usedef = 1 << 30, /* generic label */
118 label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */
119 label_canonicalized = 1ull << 32,
120 label_extract = 1ull << 33,
121 label_insert = 1ull << 34,
122 label_dpp16 = 1ull << 35,
123 label_dpp8 = 1ull << 36,
124 label_f2f32 = 1ull << 37,
125 label_f2f16 = 1ull << 38,
126 label_split = 1ull << 39,
127 label_subgroup_invocation = 1ull << 40,
/* Labels that all use ssa_info::instr to indicate the defining instruction. Adding one of
 * them does not clear the others in this group (see add_label()). */
130 static constexpr uint64_t instr_usedef_labels =
131 label_vec | label_mul | label_add_sub | label_vop3p | label_bitwise | label_uniform_bitwise |
132 label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | label_dpp8 |
133 label_f2f32 | label_subgroup_invocation;
/* Instruction labels that replace every other instruction label when added (see add_label()). */
134 static constexpr uint64_t instr_mod_labels =
135 label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
/* Union of all instruction-based label groups; must stay disjoint from temp_labels and
 * val_labels (checked by the static_asserts below). */
137 static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels | label_split;
138 static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
139 label_uniform_bool | label_scc_invert | label_b2i |
/* Labels of the value group (constants, literals and mad). Kept 64 bits wide like the other
 * label masks: with a 32-bit type, `label &= ~val_labels` zero-extends the complement and
 * silently clears every label at bit 32 and above. */
141 static constexpr uint64_t val_labels =
142 label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad;
144 static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
145 static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
146 static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
156 ssa_info() : label(0) {}
158 void add_label(Label new_label)
160 /* Since all the instr_usedef_labels use instr for the same thing
161 * (indicating the defining instruction), there is usually no need to
162 * clear any other instr labels. */
163 if (new_label & instr_usedef_labels)
164 label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */
166 if (new_label & instr_mod_labels) {
167 label &= ~instr_labels;
168 label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
171 if (new_label & temp_labels) {
172 label &= ~temp_labels;
173 label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
176 uint32_t const_labels =
177 label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
178 if (new_label & const_labels) {
179 label &= ~val_labels | const_labels;
180 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
181 } else if (new_label & val_labels) {
182 label &= ~val_labels;
183 label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
189 void set_vec(Instruction* vec)
191 add_label(label_vec);
195 bool is_vec() { return label & label_vec; }
197 void set_constant(amd_gfx_level gfx_level, uint64_t constant)
199 Operand op16 = Operand::c16(constant);
200 Operand op32 = Operand::get_const(gfx_level, constant, 4);
201 add_label(label_literal);
204 /* check that no upper bits are lost in case of packed 16bit constants */
205 if (gfx_level >= GFX8 && !op16.isLiteral() &&
206 op16.constantValue16(true) == ((constant >> 16) & 0xffff))
207 add_label(label_constant_16bit);
209 if (!op32.isLiteral())
210 add_label(label_constant_32bit);
212 if (Operand::is_constant_representable(constant, 8))
213 add_label(label_constant_64bit);
215 if (label & label_constant_64bit) {
216 val = Operand::c64(constant).constantValue();
218 label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
222 bool is_constant(unsigned bits)
225 case 8: return label & label_literal;
226 case 16: return label & label_constant_16bit;
227 case 32: return label & label_constant_32bit;
228 case 64: return label & label_constant_64bit;
233 bool is_literal(unsigned bits)
235 bool is_lit = label & label_literal;
237 case 8: return false;
/* A value is a literal for this width only if it carries label_literal and lacks the
 * matching constant label. This has to be a logical NOT: the bitwise '~' of a single-bit
 * mask result is always non-zero, so the check never rejected anything. */
238 case 16: return is_lit && !(label & label_constant_16bit);
239 case 32: return is_lit && !(label & label_constant_32bit);
240 case 64: return false;
245 bool is_constant_or_literal(unsigned bits)
248 return label & label_constant_64bit;
250 return label & label_literal;
253 void set_abs(Temp abs_temp)
255 add_label(label_abs);
259 bool is_abs() { return label & label_abs; }
261 void set_neg(Temp neg_temp)
263 add_label(label_neg);
267 bool is_neg() { return label & label_neg; }
269 void set_neg_abs(Temp neg_abs_temp)
271 add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
275 void set_mul(Instruction* mul)
277 add_label(label_mul);
281 bool is_mul() { return label & label_mul; }
283 void set_temp(Temp tmp)
285 add_label(label_temp);
289 bool is_temp() { return label & label_temp; }
291 void set_mad(uint32_t mad_info_idx)
293 add_label(label_mad);
297 bool is_mad() { return label & label_mad; }
299 void set_omod2(Instruction* mul)
301 add_label(label_omod2);
305 bool is_omod2() { return label & label_omod2; }
307 void set_omod4(Instruction* mul)
309 add_label(label_omod4);
313 bool is_omod4() { return label & label_omod4; }
315 void set_omod5(Instruction* mul)
317 add_label(label_omod5);
321 bool is_omod5() { return label & label_omod5; }
323 void set_clamp(Instruction* med3)
325 add_label(label_clamp);
329 bool is_clamp() { return label & label_clamp; }
331 void set_f2f16(Instruction* conv)
333 add_label(label_f2f16);
337 bool is_f2f16() { return label & label_f2f16; }
339 void set_undefined() { add_label(label_undefined); }
341 bool is_undefined() { return label & label_undefined; }
343 void set_vcc(Temp vcc_val)
345 add_label(label_vcc);
349 bool is_vcc() { return label & label_vcc; }
351 void set_b2f(Temp b2f_val)
353 add_label(label_b2f);
357 bool is_b2f() { return label & label_b2f; }
359 void set_add_sub(Instruction* add_sub_instr)
361 add_label(label_add_sub);
362 instr = add_sub_instr;
365 bool is_add_sub() { return label & label_add_sub; }
367 void set_bitwise(Instruction* bitwise_instr)
369 add_label(label_bitwise);
370 instr = bitwise_instr;
373 bool is_bitwise() { return label & label_bitwise; }
375 void set_uniform_bitwise() { add_label(label_uniform_bitwise); }
377 bool is_uniform_bitwise() { return label & label_uniform_bitwise; }
379 void set_minmax(Instruction* minmax_instr)
381 add_label(label_minmax);
382 instr = minmax_instr;
385 bool is_minmax() { return label & label_minmax; }
387 void set_vopc(Instruction* vopc_instr)
389 add_label(label_vopc);
393 bool is_vopc() { return label & label_vopc; }
395 void set_scc_needed() { add_label(label_scc_needed); }
397 bool is_scc_needed() { return label & label_scc_needed; }
399 void set_scc_invert(Temp scc_inv)
401 add_label(label_scc_invert);
405 bool is_scc_invert() { return label & label_scc_invert; }
407 void set_uniform_bool(Temp uniform_bool)
409 add_label(label_uniform_bool);
413 bool is_uniform_bool() { return label & label_uniform_bool; }
415 void set_b2i(Temp b2i_val)
417 add_label(label_b2i);
421 bool is_b2i() { return label & label_b2i; }
423 void set_usedef(Instruction* label_instr)
425 add_label(label_usedef);
429 bool is_usedef() { return label & label_usedef; }
431 void set_vop3p(Instruction* vop3p_instr)
433 add_label(label_vop3p);
437 bool is_vop3p() { return label & label_vop3p; }
439 void set_fcanonicalize(Temp tmp)
441 add_label(label_fcanonicalize);
445 bool is_fcanonicalize() { return label & label_fcanonicalize; }
447 void set_canonicalized() { add_label(label_canonicalized); }
449 bool is_canonicalized() { return label & label_canonicalized; }
451 void set_f2f32(Instruction* cvt)
453 add_label(label_f2f32);
457 bool is_f2f32() { return label & label_f2f32; }
459 void set_extract(Instruction* extract)
461 add_label(label_extract);
465 bool is_extract() { return label & label_extract; }
467 void set_insert(Instruction* insert)
469 add_label(label_insert);
473 bool is_insert() { return label & label_insert; }
475 void set_dpp16(Instruction* mov)
477 add_label(label_dpp16);
481 void set_dpp8(Instruction* mov)
483 add_label(label_dpp8);
487 bool is_dpp() { return label & (label_dpp16 | label_dpp8); }
488 bool is_dpp16() { return label & label_dpp16; }
489 bool is_dpp8() { return label & label_dpp8; }
491 void set_split(Instruction* split)
493 add_label(label_split);
497 bool is_split() { return label & label_split; }
499 void set_subgroup_invocation(Instruction* label_instr)
501 add_label(label_subgroup_invocation);
505 bool is_subgroup_invocation() { return label & label_subgroup_invocation; }
511 std::vector<aco_ptr<Instruction>> instructions;
513 std::pair<uint32_t, Temp> last_literal;
514 std::vector<mad_info> mad_infos;
515 std::vector<uint16_t> uses;
519 can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
524 if (instr->isVOP3P())
527 if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->gfx_level < GFX10)
533 if (instr->isDPP() && ctx.program->gfx_level < GFX11)
536 return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
537 instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
538 instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
539 instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
540 instr->opcode != aco_opcode::v_readlane_b32 &&
541 instr->opcode != aco_opcode::v_writelane_b32 &&
542 instr->opcode != aco_opcode::v_readfirstlane_b32;
546 pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
548 if (instr->definitions.empty())
552 instr->opcode == aco_opcode::p_as_uniform ||
553 std::all_of(instr->definitions.begin(), instr->definitions.end(),
554 [](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
556 /* don't propagate VGPRs into SGPR instructions */
557 if (temp.type() == RegType::vgpr && !vgpr)
560 bool can_accept_sgpr =
561 ctx.program->gfx_level >= GFX9 ||
562 std::none_of(instr->definitions.begin(), instr->definitions.end(),
563 [](const Definition& def) { return def.regClass().is_subdword(); });
565 switch (instr->opcode) {
566 case aco_opcode::p_phi:
567 case aco_opcode::p_linear_phi:
568 case aco_opcode::p_parallelcopy:
569 case aco_opcode::p_create_vector:
570 if (temp.bytes() != instr->operands[index].bytes())
573 case aco_opcode::p_extract_vector:
574 case aco_opcode::p_extract:
575 if (temp.type() == RegType::sgpr && !can_accept_sgpr)
578 case aco_opcode::p_split_vector: {
579 if (temp.type() == RegType::sgpr && !can_accept_sgpr)
581 /* don't increase the vector size */
582 if (temp.bytes() > instr->operands[index].bytes())
584 /* We can decrease the vector size as smaller temporaries are only
585 * propagated by p_as_uniform instructions.
586 * If this propagation leads to invalid IR or hits the assertion below,
587 * it means that some undefined bytes within a dword are being accessed
588 * and a bug in instruction_selection is likely. */
589 int decrease = instr->operands[index].bytes() - temp.bytes();
590 while (decrease > 0) {
591 decrease -= instr->definitions.back().bytes();
592 instr->definitions.pop_back();
594 assert(decrease == 0);
597 case aco_opcode::p_as_uniform:
598 if (temp.regClass() == instr->definitions[0].regClass())
599 instr->opcode = aco_opcode::p_parallelcopy;
601 default: return false;
604 instr->operands[index].setTemp(temp);
608 /* This expects the DPP modifier to be removed. */
610 can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
612 assert(instr->isVALU());
613 if (instr->isSDWA() && ctx.program->gfx_level < GFX9)
615 return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
616 instr->opcode != aco_opcode::v_readlane_b32 &&
617 instr->opcode != aco_opcode::v_readlane_b32_e64 &&
618 instr->opcode != aco_opcode::v_writelane_b32 &&
619 instr->opcode != aco_opcode::v_writelane_b32_e64 &&
620 instr->opcode != aco_opcode::v_permlane16_b32 &&
621 instr->opcode != aco_opcode::v_permlanex16_b32 &&
622 instr->opcode != aco_opcode::v_interp_p1_f32 &&
623 instr->opcode != aco_opcode::v_interp_p2_f32 &&
624 instr->opcode != aco_opcode::v_interp_mov_f32 &&
625 instr->opcode != aco_opcode::v_interp_p1ll_f16 &&
626 instr->opcode != aco_opcode::v_interp_p1lv_f16 &&
627 instr->opcode != aco_opcode::v_interp_p2_legacy_f16 &&
628 instr->opcode != aco_opcode::v_interp_p2_f16 &&
629 instr->opcode != aco_opcode::v_interp_p10_f32_inreg &&
630 instr->opcode != aco_opcode::v_interp_p2_f32_inreg &&
631 instr->opcode != aco_opcode::v_interp_p10_f16_f32_inreg &&
632 instr->opcode != aco_opcode::v_interp_p2_f16_f32_inreg &&
633 instr->opcode != aco_opcode::v_interp_p10_rtz_f16_f32_inreg &&
634 instr->opcode != aco_opcode::v_interp_p2_rtz_f16_f32_inreg;
638 is_operand_vgpr(Operand op)
640 return op.isTemp() && op.getTemp().type() == RegType::vgpr;
643 /* only covers special cases */
645 alu_can_accept_constant(const aco_ptr<Instruction>& instr, unsigned operand)
647 /* Fixed operands can't accept constants because we need them
648 * to be in their fixed register.
650 assert(instr->operands.size() > operand);
651 if (instr->operands[operand].isFixed())
654 /* SOPP instructions can't use constants. */
658 switch (instr->opcode) {
659 case aco_opcode::v_mac_f32:
660 case aco_opcode::v_writelane_b32:
661 case aco_opcode::v_writelane_b32_e64:
662 case aco_opcode::v_cndmask_b32: return operand != 2;
663 case aco_opcode::s_addk_i32:
664 case aco_opcode::s_mulk_i32:
665 case aco_opcode::p_wqm:
666 case aco_opcode::p_extract_vector:
667 case aco_opcode::p_split_vector:
668 case aco_opcode::v_readlane_b32:
669 case aco_opcode::v_readlane_b32_e64:
670 case aco_opcode::v_readfirstlane_b32:
671 case aco_opcode::p_extract:
672 case aco_opcode::p_insert: return operand != 0;
673 case aco_opcode::p_bpermute_gfx6:
674 case aco_opcode::p_bpermute_gfx10w64:
675 case aco_opcode::p_bpermute_gfx11w64:
676 case aco_opcode::p_interp_gfx11:
677 case aco_opcode::p_dual_src_export_gfx11:
678 case aco_opcode::v_interp_p1_f32:
679 case aco_opcode::v_interp_p2_f32:
680 case aco_opcode::v_interp_mov_f32:
681 case aco_opcode::v_interp_p1ll_f16:
682 case aco_opcode::v_interp_p1lv_f16:
683 case aco_opcode::v_interp_p2_legacy_f16:
684 case aco_opcode::v_interp_p10_f32_inreg:
685 case aco_opcode::v_interp_p2_f32_inreg:
686 case aco_opcode::v_interp_p10_f16_f32_inreg:
687 case aco_opcode::v_interp_p2_f16_f32_inreg:
688 case aco_opcode::v_interp_p10_rtz_f16_f32_inreg:
689 case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return false;
690 default: return true;
695 valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
697 if (instr->opcode == aco_opcode::v_readlane_b32 ||
698 instr->opcode == aco_opcode::v_readlane_b32_e64 ||
699 instr->opcode == aco_opcode::v_writelane_b32 ||
700 instr->opcode == aco_opcode::v_writelane_b32_e64)
702 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
703 instr->opcode == aco_opcode::v_permlanex16_b32)
708 /* check constant bus and literal limitations */
710 check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
712 int limit = ctx.program->gfx_level >= GFX10 ? 2 : 1;
713 Operand literal32(s1);
714 Operand literal64(s2);
715 unsigned num_sgprs = 0;
716 unsigned sgpr[] = {0, 0};
718 for (unsigned i = 0; i < num_operands; i++) {
719 Operand op = operands[i];
721 if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
722 /* two reads of the same SGPR count as 1 to the limit */
723 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
725 sgpr[num_sgprs++] = op.tempId();
730 } else if (op.isLiteral()) {
731 if (ctx.program->gfx_level < GFX10)
734 if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
736 if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
739 /* Any number of 32-bit literals counts as only 1 to the limit. Same
740 * (but separately) for 64-bit literals. */
741 if (op.size() == 1 && literal32.isUndefined()) {
744 } else if (op.size() == 2 && literal64.isUndefined()) {
758 parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
759 bool prevent_overflow)
761 Operand op = instr->operands[op_index];
765 Temp tmp = op.getTemp();
766 if (!ctx.info[tmp.id()].is_add_sub())
769 Instruction* add_instr = ctx.info[tmp.id()].instr;
773 switch (add_instr->opcode) {
774 case aco_opcode::v_add_u32:
775 case aco_opcode::v_add_co_u32:
776 case aco_opcode::v_add_co_u32_e64:
777 case aco_opcode::s_add_i32:
778 case aco_opcode::s_add_u32: break;
779 case aco_opcode::v_sub_u32:
780 case aco_opcode::v_sub_i32:
781 case aco_opcode::v_sub_co_u32:
782 case aco_opcode::v_sub_co_u32_e64:
783 case aco_opcode::s_sub_u32:
784 case aco_opcode::s_sub_i32:
788 case aco_opcode::v_subrev_u32:
789 case aco_opcode::v_subrev_co_u32:
790 case aco_opcode::v_subrev_co_u32_e64:
794 default: return false;
796 if (prevent_overflow && !add_instr->definitions[0].isNUW())
799 if (add_instr->usesModifiers())
802 u_foreach_bit (i, mask) {
803 if (add_instr->operands[i].isConstant()) {
804 *offset = add_instr->operands[i].constantValue() * (uint32_t)(is_sub ? -1 : 1);
805 } else if (add_instr->operands[i].isTemp() &&
806 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
807 *offset = ctx.info[add_instr->operands[i].tempId()].val * (uint32_t)(is_sub ? -1 : 1);
811 if (!add_instr->operands[!i].isTemp())
814 uint32_t offset2 = 0;
815 if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
818 *base = add_instr->operands[!i].getTemp();
827 skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
829 bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
830 if (soe && !smem->operands[1].isConstant())
832 /* We don't need to check the constant offset because the address seems to be calculated with
833 * (offset&-4 + const_offset&-4), not (offset+const_offset)&-4.
836 Operand& op = smem->operands[soe ? smem->operands.size() - 1 : 1];
837 if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise())
840 Instruction* bitwise_instr = ctx.info[op.tempId()].instr;
841 if (bitwise_instr->opcode != aco_opcode::s_and_b32)
844 if (bitwise_instr->operands[0].constantEquals(-4) &&
845 bitwise_instr->operands[1].isOfType(op.regClass().type()))
846 op.setTemp(bitwise_instr->operands[1].getTemp());
847 else if (bitwise_instr->operands[1].constantEquals(-4) &&
848 bitwise_instr->operands[0].isOfType(op.regClass().type()))
849 op.setTemp(bitwise_instr->operands[0].getTemp());
853 smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
855 /* skip &-4 before offset additions: load((a + 16) & -4, 0) */
856 if (!instr->operands.empty())
857 skip_smem_offset_align(ctx, &instr->smem());
859 /* propagate constants and combine additions */
860 if (!instr->operands.empty() && instr->operands[1].isTemp()) {
861 SMEM_instruction& smem = instr->smem();
862 ssa_info info = ctx.info[instr->operands[1].tempId()];
866 if (info.is_constant_or_literal(32) &&
867 ((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) ||
868 (ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) ||
869 (ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) {
870 instr->operands[1] = Operand::c32(info.val);
871 } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, true) &&
872 base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 &&
874 bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
876 if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&
877 ctx.info[smem.operands.back().tempId()].val == 0) {
878 smem.operands[1] = Operand::c32(offset);
879 smem.operands.back() = Operand(base);
882 SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
883 smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
884 new_instr->operands[0] = smem.operands[0];
885 new_instr->operands[1] = Operand::c32(offset);
886 if (smem.definitions.empty())
887 new_instr->operands[2] = smem.operands[2];
888 new_instr->operands.back() = Operand(base);
889 if (!smem.definitions.empty())
890 new_instr->definitions[0] = smem.definitions[0];
891 new_instr->sync = smem.sync;
892 new_instr->glc = smem.glc;
893 new_instr->dlc = smem.dlc;
894 new_instr->nv = smem.nv;
895 new_instr->disable_wqm = smem.disable_wqm;
896 instr.reset(new_instr);
901 /* skip &-4 after offset additions: load(a & -4, 16) */
902 if (!instr->operands.empty())
903 skip_smem_offset_align(ctx, &instr->smem());
907 get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
910 return Operand::c32_or_c64(info.val, true);
911 return Operand::get_const(ctx.program->gfx_level, info.val, bits / 8u);
915 propagate_constants_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned i)
917 if (!info.is_constant_or_literal(32))
920 assert(instr->operands[i].isTemp());
921 unsigned bits = get_operand_size(instr, i);
922 if (info.is_constant(bits)) {
923 instr->operands[i] = get_constant_op(ctx, info, bits);
927 /* The accumulation operand of dot product instructions ignores opsel. */
928 bool cannot_use_opsel =
929 (instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 ||
930 instr->opcode == aco_opcode::v_dot4_i32_iu8 || instr->opcode == aco_opcode::v_dot4_u32_u8 ||
931 instr->opcode == aco_opcode::v_dot2_u32_u16) &&
933 if (cannot_use_opsel)
936 /* try to fold inline constants */
937 VALU_instruction* vop3p = &instr->valu();
938 bool opsel_lo = vop3p->opsel_lo[i];
939 bool opsel_hi = vop3p->opsel_hi[i];
942 bool const_opsel[2] = {false, false};
943 for (unsigned j = 0; j < 2; j++) {
944 if ((unsigned)opsel_lo != j && (unsigned)opsel_hi != j)
945 continue; /* this half is unused */
947 uint16_t val = info.val >> (j ? 16 : 0);
948 Operand op = Operand::get_const(ctx.program->gfx_level, val, bits / 8u);
949 if (bits == 32 && op.isLiteral()) /* try sign extension */
950 op = Operand::get_const(ctx.program->gfx_level, val | 0xffff0000, 4);
951 if (bits == 32 && op.isLiteral()) { /* try shifting left */
952 op = Operand::get_const(ctx.program->gfx_level, val << 16, 4);
953 const_opsel[j] = true;
960 Operand const_lo = const_op[0];
961 Operand const_hi = const_op[1];
962 bool const_lo_opsel = const_opsel[0];
963 bool const_hi_opsel = const_opsel[1];
965 if (opsel_lo == opsel_hi) {
966 /* use the single 16bit value */
967 instr->operands[i] = opsel_lo ? const_hi : const_lo;
969 /* opsel must point the same for both halves */
970 opsel_lo = opsel_lo ? const_hi_opsel : const_lo_opsel;
972 } else if (const_lo == const_hi) {
973 /* both constants are the same */
974 instr->operands[i] = const_lo;
976 /* opsel must point the same for both halves */
977 opsel_lo = const_lo_opsel;
978 opsel_hi = const_lo_opsel;
979 } else if (const_lo.constantValue16(const_lo_opsel) ==
980 const_hi.constantValue16(!const_hi_opsel)) {
981 instr->operands[i] = const_hi;
983 /* redirect opsel selection */
984 opsel_lo = opsel_lo ? const_hi_opsel : !const_hi_opsel;
985 opsel_hi = opsel_hi ? const_hi_opsel : !const_hi_opsel;
986 } else if (const_hi.constantValue16(const_hi_opsel) ==
987 const_lo.constantValue16(!const_lo_opsel)) {
988 instr->operands[i] = const_lo;
990 /* redirect opsel selection */
991 opsel_lo = opsel_lo ? !const_lo_opsel : const_lo_opsel;
992 opsel_hi = opsel_hi ? !const_lo_opsel : const_lo_opsel;
993 } else if (bits == 16 && const_lo.constantValue() == (const_hi.constantValue() ^ (1 << 15))) {
994 assert(const_lo_opsel == false && const_hi_opsel == false);
996 /* const_lo == -const_hi */
997 if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
1000 instr->operands[i] = Operand::c16(const_lo.constantValue() & 0x7FFF);
1001 bool neg_lo = const_lo.constantValue() & (1 << 15);
1002 vop3p->neg_lo[i] ^= opsel_lo ^ neg_lo;
1003 vop3p->neg_hi[i] ^= opsel_hi ^ neg_lo;
1005 /* opsel must point to lo for both operands */
1010 vop3p->opsel_lo[i] = opsel_lo;
1011 vop3p->opsel_hi[i] = opsel_hi;
1015 fixed_to_exec(Operand op)
1017 return op.isFixed() && op.physReg() == exec;
1021 parse_extract(Instruction* instr)
1023 if (instr->opcode == aco_opcode::p_extract) {
1024 unsigned size = instr->operands[2].constantValue() / 8;
1025 unsigned offset = instr->operands[1].constantValue() * size;
1026 bool sext = instr->operands[3].constantEquals(1);
1027 return SubdwordSel(size, offset, sext);
1028 } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
1029 return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1030 } else if (instr->opcode == aco_opcode::p_extract_vector) {
1031 unsigned size = instr->definitions[0].bytes();
1032 unsigned offset = instr->operands[1].constantValue() * size;
1034 return SubdwordSel(size, offset, false);
1035 } else if (instr->opcode == aco_opcode::p_split_vector) {
1036 assert(instr->operands[0].bytes() == 4 && instr->definitions[1].bytes() == 2);
1037 return SubdwordSel(2, 2, false);
1040 return SubdwordSel();
1044 parse_insert(Instruction* instr)
1046 if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
1047 instr->operands[1].constantEquals(0)) {
1048 return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1049 } else if (instr->opcode == aco_opcode::p_insert) {
1050 unsigned size = instr->operands[2].constantValue() / 8;
1051 unsigned offset = instr->operands[1].constantValue() * size;
1052 return SubdwordSel(size, offset, false);
1054 return SubdwordSel();
1059 can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
1061 Temp tmp = info.instr->operands[0].getTemp();
1062 SubdwordSel sel = parse_extract(info.instr);
1066 } else if (sel.size() == 4) {
1068 } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
1070 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1071 sel.offset() == 0 &&
1072 ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1073 (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
1075 } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1076 !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1077 (instr->operands[!idx].is16bit() ||
1078 instr->operands[!idx].constantValue() <= UINT16_MAX)) {
1080 } else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1081 (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
1082 if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
1085 } else if (instr->isVALU() && sel.size() == 2 && !instr->valu().opsel[idx] &&
1086 can_use_opsel(ctx.program->gfx_level, instr->opcode, idx)) {
1088 } else if (instr->opcode == aco_opcode::p_extract) {
1089 SubdwordSel instrSel = parse_extract(instr.get());
1091 /* the outer offset must be within extracted range */
1092 if (instrSel.offset() >= sel.size())
1095 /* don't remove the sign-extension when increasing the size further */
1096 if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
1105 /* Combine a p_extract (or p_insert, in some cases) instruction with instr.
1106 * instr(p_extract(...)) -> instr()
// Applies the sub-dword selection of the extract recorded in `info` to operand `idx` of
// `instr`. The strategy is chosen from the selection (size, offset, sign extension) and
// from the opcode/encoding of `instr`.
//   ctx   - optimizer context; ctx.info and ctx.program->gfx_level are used.
//   instr - consumer instruction; may be re-encoded or have its opcode changed.
//   idx   - index of the operand of `instr` the selection is applied to.
//   info  - ssa_info whose `instr` member is the extract being applied.
// NOTE(review): the embedded line numbers in this chunk skip values, so some statements
// of this function are not visible here - confirm details against the complete file.
1109 apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
// tmp is the temporary read by the extract, sel the selection parsed from it.
1111 Temp tmp = info.instr->operands[0].getTemp();
1112 SubdwordSel sel = parse_extract(info.instr);
// Clear the 16-bit and 24-bit flags of the affected operand.
1115 instr->operands[idx].set16bit(false);
1116 instr->operands[idx].set24bit(false);
// Remove label_insert from the info of the extract's source temporary.
1118 ctx.info[tmp.id()].label &= ~label_insert;
1120 if (sel.size() == 4) {
1121 /* full dword selection */
1122 } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
// Unsigned single-byte selection: switch to the v_cvt_f32_ubyteN opcode whose N equals
// the selection offset.
1123 switch (sel.offset()) {
1124 case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
1125 case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
1126 case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
1127 case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
1129 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1130 sel.offset() == 0 &&
1131 ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1132 (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
// Constant left shift by at least 16 (2-byte selection) or 24 (1-byte selection) of a
// selection at offset 0.
// NOTE(review): the statement executed in this branch is not visible in this chunk.
1133 /* The undesirable upper bits are already shifted out. */
1135 } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1136 !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1137 (instr->operands[!idx].is16bit() ||
1138 instr->operands[!idx].constantValue() <= UINT16_MAX)) {
// GFX10+ v_mul_u32_u24 without modifiers and an unsigned 2-byte selection: build a
// VOP3 v_mad_u32_u16 with the same definition, operands and pass_flags, a zero third
// operand, and opsel[idx] taken from the selection offset.
// NOTE(review): constantValue() of the other operand is read without a visible
// isConstant() check - confirm that this is guaranteed for operands that are not 16-bit.
// NOTE(review): the declaration of `mad` and the statement that installs it are not
// visible in this chunk.
1140 create_instruction<VALU_instruction>(aco_opcode::v_mad_u32_u16, Format::VOP3, 3, 1);
1141 mad->definitions[0] = instr->definitions[0];
1142 mad->operands[0] = instr->operands[0];
1143 mad->operands[1] = instr->operands[1];
1144 mad->operands[2] = Operand::zero();
1145 mad->valu().opsel[idx] = sel.offset();
1146 mad->pass_flags = instr->pass_flags;
1148 } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1149 (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
// SDWA path (source is a VGPR, or gfx_level is at least GFX9): convert the instruction
// to SDWA and store the selection for operand idx.
1150 convert_to_SDWA(ctx.program->gfx_level, instr);
1151 instr->sdwa().sel[idx] = sel;
1152 } else if (instr->isVALU()) {
// Remaining VALU case: express the selection through opsel on operand idx.
// NOTE(review): a guard on the next statement may sit on a line not visible here.
1154 instr->valu().opsel[idx] = true;
1156 /* VOP12C cannot use opsel with SGPRs. */
1157 if (!instr->isVOP3() && !instr->isVINTERP_INREG() &&
1158 !info.instr->operands[0].isOfType(RegType::vgpr))
1159 instr->format = asVOP3(instr->format);
1161 } else if (instr->opcode == aco_opcode::p_extract) {
// p_extract consuming an extract: merge both selections into this p_extract.
1162 SubdwordSel instrSel = parse_extract(instr.get());
// Merged size is the smaller of the two; merged offset is the sum of both offsets.
1164 unsigned size = std::min(sel.size(), instrSel.size());
1165 unsigned offset = sel.offset() + instrSel.offset();
// Sign-extend only if this p_extract sign-extends and either the applied extract does
// too or this p_extract's size does not exceed the applied extract's size.
1166 unsigned sign_extend =
1167 instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
// Operand 1 receives offset / size, operand 2 the size multiplied by 8, operand 3 the
// sign-extension flag.
1169 instr->operands[1] = Operand::c32(offset / size);
1170 instr->operands[2] = Operand::c32(size * 8u);
1171 instr->operands[3] = Operand::c32(sign_extend);
1175 /* These are the only labels worth keeping at the moment. */
// Mask the labels of every definition down to the kept set; where a label from
// instr_usedef_labels remains, point the info's instr at the current instruction.
1176 for (Definition& def : instr->definitions) {
1177 ctx.info[def.tempId()].label &=
1178 (label_mul | label_minmax | label_usedef | label_vopc | label_f2f32 | instr_mod_labels);
1179 if (ctx.info[def.tempId()].label & instr_usedef_labels)
1180 ctx.info[def.tempId()].instr = instr.get();
/* Walks the operands of instr and, for each operand whose ssa_info is labelled as an
 * extract (limited to extracts that read a VGPR, or operands that are SGPR temporaries),
 * removes label_extract when can_apply_extract() reports that the extract cannot be
 * applied to this instruction. The label is cleared through a reference into ctx.info.
 * NOTE(review): embedded line numbers 1189-1190 are absent from this chunk; presumably
 * they guard the tempId() access for operands that are not temporaries - confirm. */
1185 check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1187 for (unsigned i = 0; i < instr->operands.size(); i++) {
1188 Operand op = instr->operands[i];
1191 ssa_info& info = ctx.info[op.tempId()];
1192 if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
1193 op.getTemp().type() == RegType::sgpr)) {
1194 if (!can_apply_extract(ctx, instr, i, info))
1195 info.label &= ~label_extract;
/* Reports whether opcode op is treated as flushing denormals.
 *  - the listed 32-bit min/max/med3/min3/max3 and 16-bit min/max opcodes: only when
 *    gfx_level is above GFX8;
 *  - the listed v_cndmask and v_mov opcodes: never;
 *  - every other opcode: always.
 * NOTE(review): the switch header is on a line not visible in this chunk; presumably it
 * switches on op - confirm. */
1201 does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
1204 case aco_opcode::v_min_f32:
1205 case aco_opcode::v_max_f32:
1206 case aco_opcode::v_med3_f32:
1207 case aco_opcode::v_min3_f32:
1208 case aco_opcode::v_max3_f32:
1209 case aco_opcode::v_min_f16:
1210 case aco_opcode::v_max_f16: return ctx.program->gfx_level > GFX8;
1211 case aco_opcode::v_cndmask_b32:
1212 case aco_opcode::v_cndmask_b16:
1213 case aco_opcode::v_mov_b32:
1214 case aco_opcode::v_mov_b16: return false;
1215 default: return true;
/* Decides whether a canonicalize of tmp that feeds operand idx of instr can be dropped.
 *   tmp - the temporary that would be used in place of the canonicalized value.
 *   idx - operand index of instr that would read tmp. */
1220 can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp, unsigned idx)
1222 float_mode* fp = &ctx.fp_mode;
/* First test: tmp is already labelled canonicalized, or the denorm mode for its size
 * (denorm32 for 4-byte temporaries, denorm16_64 otherwise) is fp_denorm_keep.
 * NOTE(review): the statement guarded by this condition is not visible in this chunk;
 * presumably it returns true - confirm. */
1223 if (ctx.info[tmp.id()].is_canonicalized() ||
1224 (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
/* Otherwise operand idx of this opcode must accept input modifiers and the opcode must
 * flush denormals. */
1227 aco_opcode op = instr->opcode;
1228 return can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, idx) &&
1229 does_fp_op_flush_denorms(ctx, op);
/* Recursively checks whether tmp was produced in a way that makes a following s_and
 * with exec superfluous, comparing the producers' pass_flags against pass_flags.
 * NOTE(review): several statements (the bodies of the two early-out checks and the
 * final fallback) are on lines not visible in this chunk; presumably they return
 * false - confirm. */
1233 can_eliminate_and_exec(opt_ctx& ctx, Temp tmp, unsigned pass_flags)
1235 if (ctx.info[tmp.id()].is_vopc()) {
1236 Instruction* vopc_instr = ctx.info[tmp.id()].instr;
1237 /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
1238 * already produces the same result */
1239 return vopc_instr->pass_flags == pass_flags;
1241 if (ctx.info[tmp.id()].is_bitwise()) {
1242 Instruction* instr = ctx.info[tmp.id()].instr;
/* Only two-operand bitwise producers with matching pass_flags are considered. */
1243 if (instr->operands.size() != 2 || instr->pass_flags != pass_flags)
/* Both operands must be temporaries so that the recursion below can inspect them. */
1245 if (!(instr->operands[0].isTemp() && instr->operands[1].isTemp()))
/* For s_and it is enough that one operand qualifies. */
1247 if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
1248 return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) ||
1249 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
/* For the other bitwise producers both operands must qualify. */
1251 return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) &&
1252 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
/* Returns true when info describes a value that can be treated as a copy of info.temp
 * for operand idx of instr: either info is labelled as a temp, or it is labelled as an
 * fcanonicalize and can_eliminate_fcanonicalize() accepts dropping it for this use. */
1259 is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned idx)
1261 return info.is_temp() ||
1262 (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp, idx));
/* Checks whether operand op can be regarded as canonicalized.
 * NOTE(review): the statement guarded by the first condition and the final fallback are
 * on lines not visible in this chunk; presumably they return true and false
 * respectively - confirm. */
1266 is_op_canonicalized(opt_ctx& ctx, Operand op)
1268 float_mode* fp = &ctx.fp_mode;
/* First test: op is a temporary labelled canonicalized, or the denorm mode for its size
 * (denorm32 for 4-byte operands, denorm16_64 otherwise) is fp_denorm_keep. */
1269 if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
1270 (op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
/* Known constants (inline operands, or temporaries labelled constant/literal) are
 * inspected bit-wise: with the sign bit masked off, the value must be zero or greater
 * than the largest pattern with an all-zero exponent field (0x3ff for 2-byte operands,
 * 0x7fffff for 4-byte operands), i.e. it must not be a denormal. */
1273 if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
1274 uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
1275 if (op.bytes() == 2)
1276 return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
1277 else if (op.bytes() == 4)
1278 return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
/* Checks whether the sum offset0 + offset1 is usable as the immediate offset of a
 * scratch instruction.
 *   instr   - the scratch instruction, or a null pointer (in which case no VGPR offset
 *             is assumed).
 *   offset0 - first offset term.
 *   offset1 - second offset term.
 * Returns true when the sum lies within
 * [dev.scratch_global_offset_min, dev.scratch_global_offset_max]. */
1284 is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1)
/* The workaround below applies only when gfx_level is exactly GFX10. */
1286 bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10;
1287 int32_t min = ctx.program->dev.scratch_global_offset_min;
1288 int32_t max = ctx.program->dev.scratch_global_offset_max;
/* The terms are added as 64-bit integers. */
1290 int64_t offset = offset0 + offset1;
/* A VGPR offset is considered present when instr is given and its operand 0 is not
 * undefined. */
1292 bool has_vgpr_offset = instr && !instr->operands[0].isUndefined();
/* NOTE(review): the statement guarded by this condition is not visible in this chunk;
 * presumably it rejects negative offsets that are not a multiple of 4 - confirm. */
1293 if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4)
1296 return offset >= min && offset <= max;
/* Inspects the three operands of instr, looking for a constant 0 and a constant 1.0
 * (0x3c00 when the opcode is v_med3_f16, 0x3f800000 otherwise), each without a neg
 * modifier, plus a remaining operand that is a temporary.
 *   instr       - instruction to inspect; its valu() modifiers are read.
 *   clamped_idx - output pointer; it is not written on any line visible in this chunk.
 * NOTE(review): the lines that declare idx, record found_zero/found_one, and produce the
 * results are not visible in this chunk - confirm against the complete file. */
1300 detect_clamp(Instruction* instr, unsigned* clamped_idx)
1302 VALU_instruction& valu = instr->valu();
/* Instructions that use omod or opsel are tested first; the guarded statement is not
 * visible here. */
1303 if (valu.omod != 0 || valu.opsel != 0)
1307 bool found_zero = false, found_one = false;
1308 bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
1309 for (unsigned i = 0; i < 3; i++) {
1310 if (!valu.neg[i] && instr->operands[i].constantEquals(0))
1312 else if (!valu.neg[i] &&
1313 instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
1318 if (found_zero && found_one && instr->operands[idx].isTemp()) {
1327 label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1329 if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
1330 ASSERTED bool all_const = false;
1331 for (Operand& op : instr->operands)
1333 all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
1334 perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
1336 ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
1337 instr->opcode == aco_opcode::s_mov_b64 ||
1338 instr->opcode == aco_opcode::v_mov_b32;
1339 perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
1343 if (instr->isSMEM())
1344 smem_combine(ctx, instr);
1346 for (unsigned i = 0; i < instr->operands.size(); i++) {
1347 if (!instr->operands[i].isTemp())
1350 ssa_info info = ctx.info[instr->operands[i].tempId()];
1351 /* propagate undef */
1352 if (info.is_undefined() && is_phi(instr))
1353 instr->operands[i] = Operand(instr->operands[i].regClass());
1354 /* propagate reg->reg of same type */
1355 while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
1356 instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
1357 info = ctx.info[info.temp.id()];
1360 /* PSEUDO: propagate temporaries */
1361 if (instr->isPseudo()) {
1362 while (info.is_temp()) {
1363 pseudo_propagate_temp(ctx, instr, info.temp, i);
1364 info = ctx.info[info.temp.id()];
1368 /* SALU / PSEUDO: propagate inline constants */
1369 if (instr->isSALU() || instr->isPseudo()) {
1370 unsigned bits = get_operand_size(instr, i);
1371 if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
1372 alu_can_accept_constant(instr, i)) {
1373 instr->operands[i] = get_constant_op(ctx, info, bits);
1378 /* VALU: propagate neg, abs & inline constants */
1379 else if (instr->isVALU()) {
1380 if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::vgpr &&
1381 valu_can_accept_vgpr(instr, i)) {
1382 instr->operands[i].setTemp(info.temp);
1383 info = ctx.info[info.temp.id()];
1385 /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
1386 if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
1387 instr->operands.size() == 1) {
1388 instr->format = withoutDPP(instr->format);
1389 instr->operands[i].setTemp(info.temp);
1390 info = ctx.info[info.temp.id()];
1393 /* for instructions other than v_cndmask_b32, the size of the instruction should match the
1395 unsigned can_use_mod =
1396 instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
1398 can_use_mod && can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i);
1400 if (instr->isSDWA())
1401 can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4;
1403 can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr));
1405 unsigned bits = get_operand_size(instr, i);
1406 bool mod_bitsize_compat = instr->operands[i].bytes() * 8 == bits;
1408 if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32 && mod_bitsize_compat) {
1409 instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
1410 instr->operands[i].setTemp(info.temp);
1411 } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16 && mod_bitsize_compat) {
1412 instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
1413 instr->operands[i].setTemp(info.temp);
1414 } else if (info.is_neg() && can_use_mod && mod_bitsize_compat &&
1415 can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1416 if (!instr->isDPP() && !instr->isSDWA())
1417 instr->format = asVOP3(instr->format);
1418 instr->operands[i].setTemp(info.temp);
1419 if (!instr->valu().abs[i])
1420 instr->valu().neg[i] = true;
1422 if (info.is_abs() && can_use_mod && mod_bitsize_compat &&
1423 can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1424 if (!instr->isDPP() && !instr->isSDWA())
1425 instr->format = asVOP3(instr->format);
1426 instr->operands[i] = Operand(info.temp);
1427 instr->valu().abs[i] = true;
1431 if (instr->isVOP3P()) {
1432 propagate_constants_vop3p(ctx, instr, info, i);
1436 if (info.is_constant(bits) && alu_can_accept_constant(instr, i) &&
1437 (!instr->isSDWA() || ctx.program->gfx_level >= GFX9) && (!instr->isDPP() || i != 1)) {
1438 Operand op = get_constant_op(ctx, info, bits);
1439 perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
1440 "v_cndmask_b32 with a constant selector", instr.get());
1441 if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 ||
1442 instr->opcode == aco_opcode::v_writelane_b32) {
1443 instr->format = withoutDPP(instr->format);
1444 instr->operands[i] = op;
1446 } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
1447 instr->operands[i] = op;
1448 instr->valu().swapOperands(0, i);
1450 } else if (can_use_VOP3(ctx, instr)) {
1451 instr->format = asVOP3(instr->format);
1452 instr->operands[i] = op;
1458 /* MUBUF: propagate constants and combine additions */
1459 else if (instr->isMUBUF()) {
1460 MUBUF_instruction& mubuf = instr->mubuf();
1463 while (info.is_temp())
1464 info = ctx.info[info.temp.id()];
1466 /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
1467 * overflow for scratch accesses works only on GFX9+ and saddr overflow
1468 * never works. Since swizzling is the only thing that separates
1469 * scratch accesses and other accesses and swizzling changing how
1470 * addressing works significantly, this probably applies to swizzled
1471 * MUBUF accesses. */
1472 bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
1474 if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
1475 info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1476 info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1477 mubuf.offset + info.instr->operands[1].constantValue() < 4096) {
1478 instr->operands[1] = info.instr->operands[0];
1479 mubuf.offset += info.instr->operands[1].constantValue();
1480 mubuf.offen = false;
1482 } else if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
1483 mubuf.offset + info.val < 4096) {
1484 assert(!mubuf.idxen);
1485 instr->operands[1] = Operand(v1);
1486 mubuf.offset += info.val;
1487 mubuf.offen = false;
1489 } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
1490 instr->operands[2] = Operand::c32(0);
1491 mubuf.offset += info.val;
1493 } else if (mubuf.offen && i == 1 &&
1494 parse_base_offset(ctx, instr.get(), i, &base, &offset,
1495 vaddr_prevent_overflow) &&
1496 base.regClass() == v1 && mubuf.offset + offset < 4096) {
1497 assert(!mubuf.idxen);
1498 instr->operands[1].setTemp(base);
1499 mubuf.offset += offset;
1501 } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1502 base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
1503 instr->operands[i].setTemp(base);
1504 mubuf.offset += offset;
1509 else if (instr->isMTBUF()) {
1510 MTBUF_instruction& mtbuf = instr->mtbuf();
1511 while (info.is_temp())
1512 info = ctx.info[info.temp.id()];
1514 if (mtbuf.offen && mtbuf.idxen && i == 1 && info.is_vec() &&
1515 info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1516 info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1517 mtbuf.offset + info.instr->operands[1].constantValue() < 4096) {
1518 instr->operands[1] = info.instr->operands[0];
1519 mtbuf.offset += info.instr->operands[1].constantValue();
1520 mtbuf.offen = false;
1525 /* SCRATCH: propagate constants and combine additions */
1526 else if (instr->isScratch()) {
1527 FLAT_instruction& scratch = instr->scratch();
1530 while (info.is_temp())
1531 info = ctx.info[info.temp.id()];
1533 /* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means
1534 * we can't combine the addition if the unsigned addition overflows and offset is
1535 * positive. In theory, there is also issues if
1536 * 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just
1537 * replaces an already out-of-bounds access with a larger one since 'saddr + offset'
1538 * would be larger than INT32_MAX.
1540 if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1541 base.regClass() == instr->operands[i].regClass() &&
1542 is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1543 instr->operands[i].setTemp(base);
1544 scratch.offset += (int32_t)offset;
1546 } else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1547 base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 &&
1548 is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1549 instr->operands[i].setTemp(base);
1550 scratch.offset += (int32_t)offset;
1552 } else if (i <= 1 && info.is_constant_or_literal(32) &&
1553 ctx.program->gfx_level >= GFX10_3 &&
1554 is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) {
1555 /* GFX10.3+ can disable both SADDR and ADDR. */
1556 instr->operands[i] = Operand(instr->operands[i].regClass());
1557 scratch.offset += (int32_t)info.val;
1562 /* DS: combine additions */
1563 else if (instr->isDS()) {
1565 DS_instruction& ds = instr->ds();
1568 bool has_usable_ds_offset = ctx.program->gfx_level >= GFX7;
1569 if (has_usable_ds_offset && i == 0 &&
1570 parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1571 base.regClass() == instr->operands[i].regClass() &&
1572 instr->opcode != aco_opcode::ds_swizzle_b32) {
1573 if (instr->opcode == aco_opcode::ds_write2_b32 ||
1574 instr->opcode == aco_opcode::ds_read2_b32 ||
1575 instr->opcode == aco_opcode::ds_write2_b64 ||
1576 instr->opcode == aco_opcode::ds_read2_b64 ||
1577 instr->opcode == aco_opcode::ds_write2st64_b32 ||
1578 instr->opcode == aco_opcode::ds_read2st64_b32 ||
1579 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1580 instr->opcode == aco_opcode::ds_read2st64_b64) {
1581 bool is64bit = instr->opcode == aco_opcode::ds_write2_b64 ||
1582 instr->opcode == aco_opcode::ds_read2_b64 ||
1583 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1584 instr->opcode == aco_opcode::ds_read2st64_b64;
1585 bool st64 = instr->opcode == aco_opcode::ds_write2st64_b32 ||
1586 instr->opcode == aco_opcode::ds_read2st64_b32 ||
1587 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1588 instr->opcode == aco_opcode::ds_read2st64_b64;
1589 unsigned shifts = (is64bit ? 3 : 2) + (st64 ? 6 : 0);
1590 unsigned mask = BITFIELD_MASK(shifts);
1592 if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
1593 ds.offset1 + (offset >> shifts) <= 255) {
1594 instr->operands[i].setTemp(base);
1595 ds.offset0 += offset >> shifts;
1596 ds.offset1 += offset >> shifts;
1599 if (ds.offset0 + offset <= 65535) {
1600 instr->operands[i].setTemp(base);
1601 ds.offset0 += offset;
1607 else if (instr->isBranch()) {
1608 if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1609 /* Flip the branch instruction to get rid of the scc_invert instruction */
1610 instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
1611 : aco_opcode::p_cbranch_z;
1612 instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
1617 /* if this instruction doesn't define anything, return */
1618 if (instr->definitions.empty()) {
1619 check_sdwa_extract(ctx, instr);
1623 if (instr->isVALU() || instr->isVINTRP()) {
1624 if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
1625 instr->opcode == aco_opcode::v_cndmask_b32) {
1626 bool canonicalized = true;
1627 if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
1628 unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
1629 for (unsigned i = 0; canonicalized && (i < ops); i++)
1630 canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
1633 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1636 if (instr->isVOPC()) {
1637 ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
1638 check_sdwa_extract(ctx, instr);
1641 if (instr->isVOP3P()) {
1642 ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
1647 switch (instr->opcode) {
1648 case aco_opcode::p_create_vector: {
1649 bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
1650 instr->operands[0].regClass() == instr->definitions[0].regClass();
1652 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1656 /* expand vector operands */
1657 std::vector<Operand> ops;
1658 unsigned offset = 0;
1659 for (const Operand& op : instr->operands) {
1660 /* ensure that any expanded operands are properly aligned */
1661 bool aligned = offset % 4 == 0 || op.bytes() < 4;
1662 offset += op.bytes();
1663 if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
1664 Instruction* vec = ctx.info[op.tempId()].instr;
1665 for (const Operand& vec_op : vec->operands)
1666 ops.emplace_back(vec_op);
1668 ops.emplace_back(op);
1672 /* combine expanded operands to new vector */
1673 if (ops.size() != instr->operands.size()) {
1674 assert(ops.size() > instr->operands.size());
1675 Definition def = instr->definitions[0];
1676 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
1677 Format::PSEUDO, ops.size(), 1));
1678 for (unsigned i = 0; i < ops.size(); i++) {
1679 if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
1680 ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
1681 ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
1682 instr->operands[i] = ops[i];
1684 instr->definitions[0] = def;
1686 for (unsigned i = 0; i < ops.size(); i++) {
1687 assert(instr->operands[i] == ops[i]);
1690 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1692 if (instr->operands.size() == 2) {
1693 /* check if this is created from split_vector */
1694 if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_split()) {
1695 Instruction* split = ctx.info[instr->operands[1].tempId()].instr;
1696 if (instr->operands[0].isTemp() &&
1697 instr->operands[0].getTemp() == split->definitions[0].getTemp())
1698 ctx.info[instr->definitions[0].tempId()].set_temp(split->operands[0].getTemp());
1703 case aco_opcode::p_split_vector: {
1704 ssa_info& info = ctx.info[instr->operands[0].tempId()];
1706 if (info.is_constant_or_literal(32)) {
1707 uint64_t val = info.val;
1708 for (Definition def : instr->definitions) {
1709 uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
1710 ctx.info[def.tempId()].set_constant(ctx.program->gfx_level, val & mask);
1711 val >>= def.bytes() * 8u;
1714 } else if (!info.is_vec()) {
1715 if (instr->definitions.size() == 2 && instr->operands[0].isTemp() &&
1716 instr->definitions[0].bytes() == instr->definitions[1].bytes()) {
1717 ctx.info[instr->definitions[1].tempId()].set_split(instr.get());
1718 if (instr->operands[0].bytes() == 4) {
1719 /* D16 subdword split */
1720 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1721 ctx.info[instr->definitions[1].tempId()].set_extract(instr.get());
1727 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1728 unsigned split_offset = 0;
1729 unsigned vec_offset = 0;
1730 unsigned vec_index = 0;
1731 for (unsigned i = 0; i < instr->definitions.size();
1732 split_offset += instr->definitions[i++].bytes()) {
1733 while (vec_offset < split_offset && vec_index < vec->operands.size())
1734 vec_offset += vec->operands[vec_index++].bytes();
1736 if (vec_offset != split_offset ||
1737 vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
1740 Operand vec_op = vec->operands[vec_index];
1741 if (vec_op.isConstant()) {
1742 ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->gfx_level,
1743 vec_op.constantValue64());
1744 } else if (vec_op.isUndefined()) {
1745 ctx.info[instr->definitions[i].tempId()].set_undefined();
1747 assert(vec_op.isTemp());
1748 ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1753 case aco_opcode::p_extract_vector: { /* mov */
1754 ssa_info& info = ctx.info[instr->operands[0].tempId()];
1755 const unsigned index = instr->operands[1].constantValue();
1756 const unsigned dst_offset = index * instr->definitions[0].bytes();
1758 if (info.is_vec()) {
1759 /* check if we index directly into a vector element */
1760 Instruction* vec = info.instr;
1761 unsigned offset = 0;
1763 for (const Operand& op : vec->operands) {
1764 if (offset < dst_offset) {
1765 offset += op.bytes();
1767 } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1770 instr->operands[0] = op;
1773 } else if (info.is_constant_or_literal(32)) {
1774 /* propagate constants */
1775 uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
1776 uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
1777 instr->operands[0] =
1778 Operand::get_const(ctx.program->gfx_level, val, instr->definitions[0].bytes());
1782 if (instr->operands[0].bytes() != instr->definitions[0].bytes()) {
1783 if (instr->operands[0].size() != 1)
1787 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1789 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
1793 /* convert this extract into a copy instruction */
1794 instr->opcode = aco_opcode::p_parallelcopy;
1795 instr->operands.pop_back();
1798 case aco_opcode::p_parallelcopy: /* propagate */
1799 if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
1800 instr->operands[0].regClass() != instr->definitions[0].regClass()) {
1801 /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
1802 * duplicate the vector instead.
1804 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1805 aco_ptr<Instruction> old_copy = std::move(instr);
1807 instr.reset(create_instruction<Pseudo_instruction>(
1808 aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
1809 instr->definitions[0] = old_copy->definitions[0];
1810 std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
1811 for (unsigned i = 0; i < vec->operands.size(); i++) {
1812 Operand& op = instr->operands[i];
1813 if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
1814 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
1815 op.setTemp(ctx.info[op.tempId()].temp);
1817 ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1821 case aco_opcode::p_as_uniform:
1822 if (instr->definitions[0].isFixed()) {
1823 /* don't copy-propagate copies into fixed registers */
1824 } else if (instr->operands[0].isConstant()) {
1825 ctx.info[instr->definitions[0].tempId()].set_constant(
1826 ctx.program->gfx_level, instr->operands[0].constantValue64());
1827 } else if (instr->operands[0].isTemp()) {
1828 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1829 if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
1830 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1832 assert(instr->operands[0].isFixed());
1835 case aco_opcode::v_mov_b32:
1836 if (instr->isDPP16()) {
1837 /* anything else doesn't make sense in SSA */
1838 assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf);
1839 ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get());
1840 } else if (instr->isDPP8()) {
1841 ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get());
1844 case aco_opcode::p_is_helper:
1845 if (!ctx.program->needs_wqm)
1846 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1848 case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break;
1849 case aco_opcode::v_mul_f16:
1850 case aco_opcode::v_mul_f32:
1851 case aco_opcode::v_mul_legacy_f32: { /* omod */
1852 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
1854 /* TODO: try to move the negate/abs modifier to the consumer instead */
1855 bool uses_mods = instr->usesModifiers();
1856 bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1858 for (unsigned i = 0; i < 2; i++) {
1859 if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1860 if (!instr->isDPP() && !instr->isSDWA() && !instr->valu().opsel &&
1861 (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) || /* 1.0 */
1862 instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
1863 bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
1865 VALU_instruction* vop3 = instr->isVOP3() ? &instr->valu() : NULL;
1866 if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
1869 bool abs = vop3 && vop3->abs[i];
1870 bool neg = neg1 ^ (vop3 && vop3->neg[i]);
1872 Temp other = instr->operands[i].getTemp();
1873 if (abs && neg && other.type() == RegType::vgpr)
1874 ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
1875 else if (abs && !neg && other.type() == RegType::vgpr)
1876 ctx.info[instr->definitions[0].tempId()].set_abs(other);
1877 else if (!abs && neg && other.type() == RegType::vgpr)
1878 ctx.info[instr->definitions[0].tempId()].set_neg(other);
1879 else if (!abs && !neg)
1880 ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
1881 } else if (uses_mods) {
1883 } else if (instr->operands[!i].constantValue() ==
1884 (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1885 ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
1886 } else if (instr->operands[!i].constantValue() ==
1887 (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1888 ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
1889 } else if (instr->operands[!i].constantValue() ==
1890 (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
1891 ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
1892 } else if (instr->operands[!i].constantValue() == 0u &&
1893 (!(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
1894 : ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
1895 instr->opcode == aco_opcode::v_mul_legacy_f32)) { /* 0.0 */
1896 ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1905 case aco_opcode::v_mul_lo_u16:
1906 case aco_opcode::v_mul_lo_u16_e64:
1907 case aco_opcode::v_mul_u32_u24:
1908 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
1910 case aco_opcode::v_med3_f16:
1911 case aco_opcode::v_med3_f32: { /* clamp */
1913 if (detect_clamp(instr.get(), &idx) && !instr->valu().abs && !instr->valu().neg)
1914 ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
1917 case aco_opcode::v_cndmask_b32:
1918 if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
1919 ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1920 else if (instr->operands[0].constantEquals(0) &&
1921 instr->operands[1].constantEquals(0x3f800000u))
1922 ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1923 else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
1924 ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1927 case aco_opcode::v_cmp_lg_u32:
1928 if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1929 instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
1930 ctx.info[instr->operands[1].tempId()].is_vcc())
1931 ctx.info[instr->definitions[0].tempId()].set_temp(
1932 ctx.info[instr->operands[1].tempId()].temp);
1934 case aco_opcode::p_linear_phi: {
1935 /* lower_bool_phis() can create phis like this */
1936 bool all_same_temp = instr->operands[0].isTemp();
1937 /* this check is needed when moving uniform loop counters out of a divergent loop */
1939 all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1940 for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1941 if (!instr->operands[i].isTemp() ||
1942 instr->operands[i].tempId() != instr->operands[0].tempId())
1943 all_same_temp = false;
1945 if (all_same_temp) {
1946 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1948 bool all_undef = instr->operands[0].isUndefined();
1949 for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1950 if (!instr->operands[i].isUndefined())
1954 ctx.info[instr->definitions[0].tempId()].set_undefined();
1958 case aco_opcode::v_add_u32:
1959 case aco_opcode::v_add_co_u32:
1960 case aco_opcode::v_add_co_u32_e64:
1961 case aco_opcode::s_add_i32:
1962 case aco_opcode::s_add_u32:
1963 case aco_opcode::v_subbrev_co_u32:
1964 case aco_opcode::v_sub_u32:
1965 case aco_opcode::v_sub_i32:
1966 case aco_opcode::v_sub_co_u32:
1967 case aco_opcode::v_sub_co_u32_e64:
1968 case aco_opcode::s_sub_u32:
1969 case aco_opcode::s_sub_i32:
1970 case aco_opcode::v_subrev_u32:
1971 case aco_opcode::v_subrev_co_u32:
1972 case aco_opcode::v_subrev_co_u32_e64:
1973 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1975 case aco_opcode::s_not_b32:
1976 case aco_opcode::s_not_b64:
1977 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1978 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1979 ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1980 ctx.info[instr->operands[0].tempId()].temp);
1981 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1982 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1983 ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1984 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
1986 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
1988 case aco_opcode::s_and_b32:
1989 case aco_opcode::s_and_b64:
1990 if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
1991 if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1992 /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
1993 * uniform bool into divergent */
1994 ctx.info[instr->definitions[1].tempId()].set_temp(
1995 ctx.info[instr->operands[0].tempId()].temp);
1996 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
1997 ctx.info[instr->operands[0].tempId()].temp);
1999 } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
2000 /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
2001 * already produces the same SCC */
2002 ctx.info[instr->definitions[1].tempId()].set_temp(
2003 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2004 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
2005 ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2007 } else if ((ctx.program->stage.num_sw_stages() > 1 ||
2008 ctx.program->stage.hw == HWStage::NGG) &&
2009 instr->pass_flags == 1) {
2010 /* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so
2011 * s_and is unnecessary. */
2012 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
2017 case aco_opcode::s_or_b32:
2018 case aco_opcode::s_or_b64:
2019 case aco_opcode::s_xor_b32:
2020 case aco_opcode::s_xor_b64:
2021 if (std::all_of(instr->operands.begin(), instr->operands.end(),
2022 [&ctx](const Operand& op)
2024 return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
2025 ctx.info[op.tempId()].is_uniform_bitwise());
2027 ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
2029 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2031 case aco_opcode::s_lshl_b32:
2032 case aco_opcode::v_or_b32:
2033 case aco_opcode::v_lshlrev_b32:
2034 case aco_opcode::v_bcnt_u32_b32:
2035 case aco_opcode::v_and_b32:
2036 case aco_opcode::v_xor_b32:
2037 case aco_opcode::v_not_b32:
2038 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2040 case aco_opcode::v_min_f32:
2041 case aco_opcode::v_min_f16:
2042 case aco_opcode::v_min_u32:
2043 case aco_opcode::v_min_i32:
2044 case aco_opcode::v_min_u16:
2045 case aco_opcode::v_min_i16:
2046 case aco_opcode::v_min_u16_e64:
2047 case aco_opcode::v_min_i16_e64:
2048 case aco_opcode::v_max_f32:
2049 case aco_opcode::v_max_f16:
2050 case aco_opcode::v_max_u32:
2051 case aco_opcode::v_max_i32:
2052 case aco_opcode::v_max_u16:
2053 case aco_opcode::v_max_i16:
2054 case aco_opcode::v_max_u16_e64:
2055 case aco_opcode::v_max_i16_e64:
2056 ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
2058 case aco_opcode::s_cselect_b64:
2059 case aco_opcode::s_cselect_b32:
2060 if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
2061 /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
2062 ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
2064 if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
2065 /* Flip the operands to get rid of the scc_invert instruction */
2066 std::swap(instr->operands[0], instr->operands[1]);
2067 instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
2070 case aco_opcode::p_wqm:
2071 if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
2072 ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
2075 case aco_opcode::s_mul_i32:
2076 /* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
2077 * This pattern is created from a uniform nir_op_b2f. */
2078 if (instr->operands[0].constantEquals(0x3f800000u))
2079 ctx.info[instr->definitions[0].tempId()].set_canonicalized();
2081 case aco_opcode::p_extract: {
2082 if (instr->definitions[0].bytes() == 4) {
2083 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2084 if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()))
2085 ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2089 case aco_opcode::p_insert: {
2090 if (instr->operands[0].bytes() == 4) {
2091 if (instr->operands[0].regClass() == v1)
2092 ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2093 if (parse_extract(instr.get()))
2094 ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2095 ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2099 case aco_opcode::ds_read_u8:
2100 case aco_opcode::ds_read_u8_d16:
2101 case aco_opcode::ds_read_u16:
2102 case aco_opcode::ds_read_u16_d16: {
2103 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2106 case aco_opcode::v_mbcnt_lo_u32_b32: {
2107 if (instr->operands[0].constantEquals(-1) && instr->operands[1].constantEquals(0)) {
2108 if (ctx.program->wave_size == 32)
2109 ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2111 ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2115 case aco_opcode::v_mbcnt_hi_u32_b32:
2116 case aco_opcode::v_mbcnt_hi_u32_b32_e64: {
2117 if (instr->operands[0].constantEquals(-1) && instr->operands[1].isTemp() &&
2118 ctx.info[instr->operands[1].tempId()].is_usedef()) {
2119 Instruction* usedef_instr = ctx.info[instr->operands[1].tempId()].instr;
2120 if (usedef_instr->opcode == aco_opcode::v_mbcnt_lo_u32_b32 &&
2121 usedef_instr->operands[0].constantEquals(-1) &&
2122 usedef_instr->operands[1].constantEquals(0))
2123 ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2127 case aco_opcode::v_cvt_f16_f32: {
2128 if (instr->operands[0].isTemp())
2129 ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get());
2132 case aco_opcode::v_cvt_f32_f16: {
2133 if (instr->operands[0].isTemp())
2134 ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get());
2140 /* Don't remove label_extract if we can't apply the extract to
2141 * neg/abs instructions because we'll likely combine it into another valu. */
2142 if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
2143 check_sdwa_extract(ctx, instr);
// Map a temporary to the id used for "same value" comparisons: when the
// ctx.info entry for tmp reports is_temp(), the id of the temp recorded in
// that entry is returned.
// NOTE(review): the fallback for the non-temp case is not visible in this
// listing; presumably it returns tmp.id() unchanged -- confirm.
2147 original_temp_id(opt_ctx& ctx, Temp tmp)
2149 if (ctx.info[tmp.id()].is_temp())
2150 return ctx.info[tmp.id()].temp.id();
// When is_dead(ctx.uses, instr) reports the instruction as dead, walk its
// operands and decrement the use count of the operand temporaries.
// NOTE(review): the line between the loop header and the decrement is not
// visible here; presumably it limits the decrement to temp operands -- confirm.
2156 decrease_op_uses_if_dead(opt_ctx& ctx, Instruction* instr)
2158 if (is_dead(ctx.uses, instr)) {
2159 for (const Operand& op : instr->operands) {
2161 ctx.uses[op.tempId()]--;
// Drop one use of instr's first definition, then hand the instruction to
// decrease_op_uses_if_dead() so its operands are released as well if the
// instruction is now dead.
2167 decrease_uses(opt_ctx& ctx, Instruction* instr)
2169 ctx.uses[instr->definitions[0].tempId()]--;
2170 decrease_op_uses_if_dead(ctx, instr);
// Account for an additional reference to an operand that is being reused in
// a newly built instruction: the use count of op's temporary is incremented.
// NOTE(review): the guard and the return statement are not visible in this
// listing; presumably only temp operands are counted and op is returned
// unchanged -- confirm.
2174 copy_operand(opt_ctx& ctx, Operand op)
2177 ctx.uses[op.tempId()]++;
// Find the instruction that produced `op`, using the instruction pointer kept
// in ctx.info for temporaries whose label intersects instr_usedef_labels.
//   op          - operand to trace back; must be a temporary to be followed.
//   ignore_uses - when false, a temporary with more than one use is rejected.
// Further checks look at a second definition that still has uses and at any
// operand of the producer that is fixed to exec.
// NOTE(review): the statements executed when a check fires and the final
// return are not visible in this listing; callers test the result against
// null, so presumably each rejection returns a null pointer -- confirm.
2182 follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
2184 if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
2186 if (!ignore_uses && ctx.uses[op.tempId()] > 1)
2189 Instruction* instr = ctx.info[op.tempId()].instr;
2191 if (instr->definitions.size() == 2) {
2192 assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
2193 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2197 for (Operand& operand : instr->operands) {
2198 if (fixed_to_exec(operand))
// Fuse an s_or / s_and of two floating-point self-compares into a single
// unordered / ordered compare, replacing `instr` with the new VOPC
// instruction.
// NOTE(review): the statements taken when a guard below fails are not visible
// in this listing (presumably `return false`), nor is the declaration of
// `op` -- confirm against the full file.
2205 /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
2206 * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
2208 combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Only lane-mask results whose second definition is unused are handled.
2210 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2212 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2215 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2217 bitarray8 opsel = 0;
2218 Instruction* op_instr[2];
2221 unsigned bitsize = 0;
2222 for (unsigned i = 0; i < 2; i++) {
// Trace each source back to its producer; use counts are ignored here.
2223 op_instr[i] = follow_operand(ctx, instr->operands[i], true);
// s_or expects neq self-compares, s_and expects eq self-compares.
2227 aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2228 unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
2230 if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
// Both compares must operate on the same bit size.
2232 if (bitsize && op_bitsize != bitsize)
2234 if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
2237 if (op_instr[i]->isSDWA() || op_instr[i]->isDPP())
// The two sides of a self-compare must carry identical neg/abs/opsel.
2240 VALU_instruction& valu = op_instr[i]->valu();
2241 if (valu.neg[0] != valu.neg[1] || valu.abs[0] != valu.abs[1] ||
2242 valu.opsel[0] != valu.opsel[1])
2244 opsel[i] = valu.opsel[0];
// Both operands must resolve to the same temporary id.
2246 Temp op0 = op_instr[i]->operands[0].getTemp();
2247 Temp op1 = op_instr[i]->operands[1].getTemp();
2248 if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
2252 bitsize = op_bitsize;
// Put an SGPR source first, keeping opsel bits attached to their operand.
2255 if (op[1].type() == RegType::sgpr) {
2256 std::swap(op[0], op[1]);
2257 opsel[0].swap(opsel[1]);
// At most one SGPR source before GFX10, at most two from GFX10 on.
2259 unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
2260 if (num_sgprs > (ctx.program->gfx_level >= GFX10 ? 2 : 1))
// Select the unordered (or) / ordered (and) compare for the bit size.
2263 aco_opcode new_op = aco_opcode::num_opcodes;
2265 case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
2266 case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
2267 case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
// The VOP3 form is requested for two SGPR sources or opsel on a non-VGPR
// first source.
2269 bool needs_vop3 = num_sgprs > 1 || (opsel[0] && op[0].type() != RegType::vgpr);
2270 VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2271 new_op, needs_vop3 ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2273 new_instr->opsel = opsel;
2274 new_instr->operands[0] = copy_operand(ctx, Operand(op[0]));
2275 new_instr->operands[1] = copy_operand(ctx, Operand(op[1]));
2276 new_instr->definitions[0] = instr->definitions[0];
2277 new_instr->pass_flags = instr->pass_flags;
// Release the references this instruction held on the two old compares.
2279 decrease_uses(ctx, op_instr[0]);
2280 decrease_uses(ctx, op_instr[1]);
// Reset the label of the result and record the new compare as its producer.
2282 ctx.info[instr->definitions[0].tempId()].label = 0;
2283 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2285 instr.reset(new_instr);
// Fold an explicit ordered/unordered test into the floating-point compare it
// is combined with, replacing `instr` by one compare that has the
// ordered/unordered variant of cmp's opcode.
// NOTE(review): the statements taken when a guard below fails are not visible
// in this listing (presumably `return false`) -- confirm.
2290 /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
2291 * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
2293 combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Only lane-mask results whose second definition is unused are handled.
2295 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2297 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2300 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2301 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
// Trace both sources back to their producers, ignoring use counts.
2303 Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2304 Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2305 if (!nan_test || !cmp)
2307 if (nan_test->isSDWA() || cmp->isSDWA())
// The ordering test may be either source; normalise so nan_test holds it.
2310 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2311 std::swap(nan_test, cmp);
2312 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
// cmp must be a floating-point compare of the same bit size.
2315 if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
2318 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2320 if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
// Each operand of cmp (temp id plus opsel) must match one operand of the
// ordering test, and cmp itself must not be a self-compare.
2323 unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
2324 unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
2325 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2326 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2327 VALU_instruction& cmp_valu = cmp->valu();
2328 VALU_instruction& nan_valu = nan_test->valu();
2329 if ((prop_cmp0 != prop_nan0 || cmp_valu.opsel[0] != nan_valu.opsel[0]) &&
2330 (prop_cmp0 != prop_nan1 || cmp_valu.opsel[0] != nan_valu.opsel[1]))
2332 if ((prop_cmp1 != prop_nan0 || cmp_valu.opsel[1] != nan_valu.opsel[0]) &&
2333 (prop_cmp1 != prop_nan1 || cmp_valu.opsel[1] != nan_valu.opsel[1]))
2335 if (prop_cmp0 == prop_cmp1 && cmp_valu.opsel[0] == cmp_valu.opsel[1])
// Build the replacement, carrying over cmp's modifiers and operands.
2338 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2339 VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2340 new_op, cmp->isVOP3() ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2341 new_instr->neg = cmp_valu.neg;
2342 new_instr->abs = cmp_valu.abs;
2343 new_instr->clamp = cmp_valu.clamp;
2344 new_instr->omod = cmp_valu.omod;
2345 new_instr->opsel = cmp_valu.opsel;
2346 new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2347 new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2348 new_instr->definitions[0] = instr->definitions[0];
2349 new_instr->pass_flags = instr->pass_flags;
// Release the references held on the two old instructions.
2351 decrease_uses(ctx, nan_test);
2352 decrease_uses(ctx, cmp);
2354 ctx.info[instr->definitions[0].tempId()].label = 0;
2355 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2357 instr.reset(new_instr);
// Replace an integer compare between a constant and a value labelled as the
// subgroup invocation with an instruction that materialises the resulting
// lane mask (first_bit / num_bits describe the run of set bits).
// NOTE(review): several lines are not visible in this listing: the switch
// header, the first_bit assignments of most cases, the early-exit statements
// and the tail that installs `cpy` -- confirm against the full file.
2362 /* Optimize v_cmp of constant with subgroup invocation to a constant mask.
2363 * Ideally, we can trade v_cmp for a constant (or literal).
2364 * In a less ideal case, we trade v_cmp for a SALU instruction, which is still a win.
2367 optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2369 /* This optimization only applies to VOPC with 2 operands. */
2370 if (instr->operands.size() != 2)
2373 /* Find the constant operand or return early if there isn't one. */
2374 const int const_op_idx = instr->operands[0].isConstant() ? 0
2375 : instr->operands[1].isConstant() ? 1
2377 if (const_op_idx == -1)
2380 /* Find the operand that has the subgroup invocation. */
2381 const int mbcnt_op_idx = 1 - const_op_idx;
2382 const Operand mbcnt_op = instr->operands[mbcnt_op_idx];
2383 if (!mbcnt_op.isTemp() || !ctx.info[mbcnt_op.tempId()].is_subgroup_invocation())
2386 /* Adjust opcode so we don't have to care about const_op_idx below. */
2387 const aco_opcode op = const_op_idx == 0 ? get_swapped(instr->opcode) : instr->opcode;
2388 const unsigned wave_size = ctx.program->wave_size;
2389 const unsigned val = instr->operands[const_op_idx].constantValue();
// NOTE(review): val is unsigned, and the signed _i32 opcodes below share the
// arithmetic of the _u32 ones, so a negative constant is treated as a large
// unsigned value (e.g. lt_i32 then yields num_bits == wave_size) -- confirm
// that this is intended.
2391 /* Find suitable constant bitmask corresponding to the value. */
2392 unsigned first_bit = 0, num_bits = 0;
// Equality: a single bit, or none when val is outside the wave.
2394 case aco_opcode::v_cmp_eq_u32:
2395 case aco_opcode::v_cmp_eq_i32:
2397 num_bits = val >= wave_size ? 0 : 1;
// Less-or-equal: val + 1 bits, saturating at wave_size.
2399 case aco_opcode::v_cmp_le_u32:
2400 case aco_opcode::v_cmp_le_i32:
2402 num_bits = val >= wave_size ? wave_size : (val + 1);
// Less-than: val bits, saturating at wave_size.
2404 case aco_opcode::v_cmp_lt_u32:
2405 case aco_opcode::v_cmp_lt_i32:
2407 num_bits = val >= wave_size ? wave_size : val;
// Greater-or-equal: the remaining wave_size - val bits.
2409 case aco_opcode::v_cmp_ge_u32:
2410 case aco_opcode::v_cmp_ge_i32:
2412 num_bits = val >= wave_size ? 0 : (wave_size - val);
// Greater-than: starts one bit above val.
2414 case aco_opcode::v_cmp_gt_u32:
2415 case aco_opcode::v_cmp_gt_i32:
2416 first_bit = val + 1;
2417 num_bits = val >= wave_size ? 0 : (wave_size - val - 1);
2419 default: return false;
2422 Instruction* cpy = NULL;
2423 const uint64_t mask = BITFIELD64_RANGE(first_bit, num_bits);
// In wave64, masks above 0x7fffffff other than all-ones are built with
// s_bfm_b64 from (num_bits, first_bit); everything else is copied directly.
2424 if (wave_size == 64 && mask > 0x7fffffff && mask != -1ull) {
2425 /* Mask can't be represented as a 64-bit constant or literal, use s_bfm_b64. */
2426 cpy = create_instruction<SOP2_instruction>(aco_opcode::s_bfm_b64, Format::SOP2, 2, 1);
2427 cpy->operands[0] = Operand::c32(num_bits);
2428 cpy->operands[1] = Operand::c32(first_bit);
2430 /* Copy mask as a literal constant. */
2432 create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
2433 cpy->operands[0] = wave_size == 32 ? Operand::c32((uint32_t)mask) : Operand::c64(mask);
// The copy takes over the compare's definition; the label of that definition
// is reset and the reference to the subgroup-invocation producer is released.
2436 cpy->definitions[0] = instr->definitions[0];
2437 ctx.info[instr->definitions[0].tempId()].label = 0;
2438 decrease_uses(ctx, ctx.info[mbcnt_op.tempId()].instr);
// Try to obtain the constant value behind an operand.
//   op       - operand to inspect; either an inline constant or a temporary.
//   bit_size - width passed to is_constant_or_literal() / get_constant_op().
//   value    - output; receives the 64-bit constant value on success.
// For temporaries the id is first resolved through original_temp_id() and the
// ctx.info entry of that id must describe a constant or literal.
// NOTE(review): the return statements are not visible in this listing;
// presumably true on success and false otherwise -- confirm.
2445 is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
2447 if (op.isConstant()) {
2448 *value = op.constantValue64();
2450 } else if (op.isTemp()) {
2451 unsigned id = original_temp_id(ctx, op.getTemp());
2452 if (!ctx.info[id].is_constant_or_literal(bit_size))
2454 *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
// Report whether `value`, read as a floating-point bit pattern of bit_size
// bits, is a NaN: every test below requires an all-ones exponent field
// together with a non-zero mantissa field.
// NOTE(review): the condition guarding the first return and the `else` before
// the last one are not visible in this listing; the field widths suggest
// bit_size == 16 for the first and a 64-bit fallback for the last -- confirm.
2461 is_constant_nan(uint64_t value, unsigned bit_size)
// 5-bit exponent at bit 10, 10-bit mantissa.
2464 return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff);
2465 else if (bit_size == 32)
// 8-bit exponent at bit 23, 23-bit mantissa.
2466 return ((value >> 23) & 0xff) == 0xff && (value & 0x7fffff);
// 11-bit exponent at bit 52, 52-bit mantissa.
2468 return ((value >> 52) & 0x7ff) == 0x7ff && (value & 0xfffffffffffff);
// Fold a self-compare of `a` into a compare of `a` against a constant that is
// known not to be NaN, replacing `instr` by one compare that has the
// ordered/unordered variant of cmp's opcode.
// NOTE(review): the statements taken when a guard below fails, and the
// continuation of the first multi-line guard, are not visible in this listing
// (presumably `return false`) -- confirm.
2471 /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
2472 * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
2474 combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Only lane-mask results whose second definition is unused are handled.
2476 if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2478 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2481 bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
// Trace both sources back to their producers, ignoring use counts.
2483 Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2484 Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2486 if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA() || nan_test->isDPP() ||
// The self-compare may be either source; normalise so nan_test holds it.
2490 aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2491 if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2492 std::swap(nan_test, cmp);
2493 else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
// cmp must be a floating-point compare of the same bit size.
2496 unsigned bit_size = get_cmp_bitsize(cmp->opcode);
2497 if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
2500 if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
// Unlike the nan_test check above, only one operand of cmp has to be a temp.
2502 if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
// nan_test must compare one value with itself, using identical modifiers.
2505 unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2506 unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2507 if (prop_nan0 != prop_nan1)
2510 VALU_instruction& vop3 = nan_test->valu();
2511 if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel[0] != vop3.opsel[1])
// Find the operand of cmp that matches the tested value (same temp id and
// opsel); the other operand is the candidate constant.
2514 int constant_operand = -1;
2515 for (unsigned i = 0; i < 2; i++) {
2516 if (cmp->operands[i].isTemp() &&
2517 original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0 &&
2518 cmp->valu().opsel[i] == nan_test->valu().opsel[0]) {
2519 constant_operand = !i;
2523 if (constant_operand == -1)
// The other operand must be a known constant; the NaN check is applied to
// the 16-bit half selected by opsel.
2526 uint64_t constant_value;
2527 if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
2529 if (is_constant_nan(constant_value >> (cmp->valu().opsel[constant_operand] * 16), bit_size))
// Build the replacement in cmp's format, carrying over its modifiers.
2532 aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2533 Instruction* new_instr = create_instruction<VALU_instruction>(new_op, cmp->format, 2, 1);
2534 new_instr->valu().neg = cmp->valu().neg;
2535 new_instr->valu().abs = cmp->valu().abs;
2536 new_instr->valu().clamp = cmp->valu().clamp;
2537 new_instr->valu().omod = cmp->valu().omod;
2538 new_instr->valu().opsel = cmp->valu().opsel;
2539 new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2540 new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2541 new_instr->definitions[0] = instr->definitions[0];
2542 new_instr->pass_flags = instr->pass_flags;
// Release the references held on the two old instructions.
2544 decrease_uses(ctx, nan_test);
2545 decrease_uses(ctx, cmp);
2547 ctx.info[instr->definitions[0].tempId()].label = 0;
2548 ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2550 instr.reset(new_instr);
// Fold a logical not into the compare that feeds it by switching the compare
// to its inverse opcode in place and letting it write this instruction's
// result.
// NOTE(review): the statements taken when a guard fails and the check on the
// result of follow_operand() are not visible in this listing -- confirm.
2555 /* s_not(cmp(a, b)) -> get_inverse(cmp)(a, b) */
2557 combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// The second definition must be unused and the source must have exactly one use.
2559 if (ctx.uses[instr->definitions[1].tempId()])
2561 if (!instr->operands[0].isTemp() || ctx.uses[instr->operands[0].tempId()] != 1)
2564 Instruction* cmp = follow_operand(ctx, instr->operands[0]);
// get_inverse() signals "no inverse available" with num_opcodes.
2568 aco_opcode new_opcode = get_inverse(cmp->opcode);
2569 if (new_opcode == aco_opcode::num_opcodes)
2572 /* Invert compare instruction and assign this instruction's definition */
2573 cmp->opcode = new_opcode;
// Copy the compare's info entry to this result, then exchange definitions so
// the compare produces it directly.
2574 ctx.info[instr->definitions[0].tempId()] = ctx.info[cmp->definitions[0].tempId()];
2575 std::swap(instr->definitions[0], cmp->definitions[0]);
// This instruction no longer counts as a user of its source.
2577 ctx.uses[instr->operands[0].tempId()]--;
// Match the two-instruction pattern described below and collect everything
// needed to build one three-operand instruction from it.
//   op1, op2     - expected opcodes of the outer and inner instruction.
//   op1_instr    - the outer instruction; operand [swap] must come from op2.
//   shuffle_str  - three digit characters; digit k at position p places
//                  source p (0 = op1's other operand, 1 and 2 = op2's
//                  operands) into slot k of operands / neg / abs / opsel.
//   op1_clamp, op1_omod - outputs: clamp and omod of op1 (false / 0 when op1
//                  is not a VALU instruction).
//   inbetween_*  - optional outputs: op1's modifier on the operand fed by op2.
//   precise      - output: set when either definition is marked precise.
// NOTE(review): the rejection statements, the declaration of `shuffle` and the
// `if (inbetween_neg)` / `if (inbetween_abs)` lines are not visible in this
// listing; the visible inbetween_opsel handling suggests a null pointer means
// the modifier is unsupported and causes a rejection -- confirm.
2581 /* op1(op2(1, 2), 0) if swap = false
2582 * op1(0, op2(1, 2)) if swap = true */
2584 match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
2585 const char* shuffle_str, Operand operands[3], bitarray8& neg, bitarray8& abs,
2586 bitarray8& opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
2587 bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
2590 if (op1_instr->opcode != op1)
// The inner instruction is looked up with use counts respected.
2593 Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
2594 if (!op2_instr || op2_instr->opcode != op2)
// Either instruction may be non-VALU, in which case it has no modifiers.
2597 VALU_instruction* op1_valu = op1_instr->isVALU() ? &op1_instr->valu() : NULL;
2598 VALU_instruction* op2_valu = op2_instr->isVALU() ? &op2_instr->valu() : NULL;
2600 if (op1_instr->isSDWA() || op2_instr->isSDWA())
2602 if (op1_instr->isDPP() || op2_instr->isDPP())
2605 /* don't support inbetween clamp/omod */
2606 if (op2_valu && (op2_valu->clamp || op2_valu->omod))
2609 /* get operands and modifiers and check inbetween modifiers */
2610 *op1_clamp = op1_valu ? (bool)op1_valu->clamp : false;
2611 *op1_omod = op1_valu ? (unsigned)op1_valu->omod : 0u;
2614 *inbetween_neg = op1_valu ? op1_valu->neg[swap] : false;
2615 else if (op1_valu && op1_valu->neg[swap])
2619 *inbetween_abs = op1_valu ? op1_valu->abs[swap] : false;
2620 else if (op1_valu && op1_valu->abs[swap])
2623 if (inbetween_opsel)
2624 *inbetween_opsel = op1_valu ? op1_valu->opsel[swap] : false;
2625 else if (op1_valu && op1_valu->opsel[swap])
2628 *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
// Decode the digit string into destination slots.
2631 shuffle[shuffle_str[0] - '0'] = 0;
2632 shuffle[shuffle_str[1] - '0'] = 1;
2633 shuffle[shuffle_str[2] - '0'] = 2;
// op1's operand that does not come from op2.
2635 operands[shuffle[0]] = op1_instr->operands[!swap];
2636 neg[shuffle[0]] = op1_valu ? op1_valu->neg[!swap] : false;
2637 abs[shuffle[0]] = op1_valu ? op1_valu->abs[!swap] : false;
2638 opsel[shuffle[0]] = op1_valu ? op1_valu->opsel[!swap] : false;
// The two operands of op2 with their modifiers.
2640 for (unsigned i = 0; i < 2; i++) {
2641 operands[shuffle[i + 1]] = op2_instr->operands[i];
2642 neg[shuffle[i + 1]] = op2_valu ? op2_valu->neg[i] : false;
2643 abs[shuffle[i + 1]] = op2_valu ? op2_valu->abs[i] : false;
2644 opsel[shuffle[i + 1]] = op2_valu ? op2_valu->opsel[i] : false;
2647 /* check operands */
2648 if (!check_vop3_operands(ctx, 3, operands))
// Build a three-operand VOP3 instruction with the given opcode, operands and
// modifiers, give it instr's definition and pass_flags, reset the label
// recorded for that definition, and replace `instr` with it.
// NOTE(review): the end of the parameter list is not visible in this listing;
// the body uses an `omod` value that is presumably the last parameter --
// confirm.
2655 create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
2656 Operand operands[3], uint8_t neg, uint8_t abs, uint8_t opsel, bool clamp,
2659 VALU_instruction* new_instr = create_instruction<VALU_instruction>(opcode, Format::VOP3, 3, 1);
2660 new_instr->neg = neg;
2661 new_instr->abs = abs;
2662 new_instr->clamp = clamp;
2663 new_instr->omod = omod;
2664 new_instr->opsel = opsel;
2665 new_instr->operands[0] = operands[0];
2666 new_instr->operands[1] = operands[1];
2667 new_instr->operands[2] = operands[2];
2668 new_instr->definitions[0] = instr->definitions[0];
2669 new_instr->pass_flags = instr->pass_flags;
// Any label previously attached to the result no longer applies.
2670 ctx.info[instr->definitions[0].tempId()].label = 0;
2672 instr.reset(new_instr);
// Try to merge `instr` with the producer of one of its first two operands
// into a single three-operand instruction `new_op`.
//   op2     - opcode the producer must have.
//   shuffle - operand placement string forwarded to match_op3_for_vop3().
//   ops     - bit mask of operand indices to try (bit 0 = operand 0,
//             bit 1 = operand 1).
// No inbetween modifiers are accepted (NULL is passed for all three).
// NOTE(review): the declaration of `omod` and the return statements are not
// visible in this listing -- confirm.
2676 combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
2677 const char* shuffle, uint8_t ops)
2679 for (unsigned swap = 0; swap < 2; swap++) {
// Skip operand positions the caller did not enable.
2680 if (!((1 << swap) & ops))
2683 Operand operands[3];
2684 bool clamp, precise;
2685 bitarray8 neg = 0, abs = 0, opsel = 0;
2687 if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
2688 abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
// The merged operand is no longer read by this instruction.
2689 ctx.uses[instr->operands[swap].tempId()]--;
2690 create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
// Combine a v_or_b32 or add with a preceding and / shift / p_extract /
// p_insert into one three-operand instruction. The instruction-level patterns
// are tried first through combine_three_valu_op(); the p_extract / p_insert
// patterns listed in the comment further down are handled afterwards.
// NOTE(review): this listing omits the continuation lines of the first two
// calls, the return statements, the declarations of `op` and `clamp`, and
// parts of the if / else chain inside the loop -- confirm against the full
// file.
// NOTE(review): the pattern comment below names v_lshl_add_b32 while the code
// selects aco_opcode::v_lshl_add_u32; the comment looks like a typo.
2697 /* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
2699 combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2701 bool is_or = instr->opcode == aco_opcode::v_or_b32;
2702 aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
2704 if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
2707 if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
2710 if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
2712 if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
2715 if (instr->isSDWA() || instr->isDPP())
2718 /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2719 * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2720 * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
2721 * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
2723 for (unsigned i = 0; i < 2; i++) {
2724 Instruction* extins = follow_operand(ctx, instr->operands[i]);
2729 Operand operands[3];
// A p_insert whose field ends at bit 32 ((index + 1) * size == 32) becomes a
// shift by index * size.
2731 if (extins->opcode == aco_opcode::p_insert &&
2732 (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
2735 Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
// A p_insert, or a p_extract whose fourth operand is 0, at index 0 becomes an
// and with 0xff (size 8) or 0xffff (any other size).
2737 (extins->opcode == aco_opcode::p_insert ||
2738 (extins->opcode == aco_opcode::p_extract &&
2739 extins->operands[3].constantEquals(0))) &&
2740 extins->operands[1].constantEquals(0)) {
2741 op = aco_opcode::v_and_or_b32;
2742 operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
// Slot 0 takes the extract/insert source, slot 2 the other operand of instr.
2747 operands[0] = extins->operands[0];
2748 operands[2] = instr->operands[!i];
2750 if (!check_vop3_operands(ctx, 3, operands))
2753 uint8_t neg = 0, abs = 0, opsel = 0, omod = 0;
2755 if (instr->isVOP3())
2756 clamp = instr->valu().clamp;
// The folded operand is no longer read by this instruction.
2758 ctx.uses[instr->operands[i].tempId()]--;
2759 create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
// Turn an xor whose operand comes from a not into v_xnor_b32, rewriting
// `instr` in place. Instructions using modifiers are not handled.
// NOTE(review): the first part of the rejection condition inside the loop and
// the statements following the guards are not visible in this listing --
// confirm.
2766 /* v_xor(a, s_not(b)) -> v_xnor(a, b)
2767 * v_xor(a, v_not(b)) -> v_xnor(a, b)
2770 combine_xor_not(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2772 if (instr->usesModifiers())
2775 for (unsigned i = 0; i < 2; i++) {
// The producer is looked up with use counts ignored.
2776 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
2778 (op_instr->opcode != aco_opcode::v_not_b32 &&
2779 op_instr->opcode != aco_opcode::s_not_b32) ||
2780 op_instr->usesModifiers() || op_instr->operands[0].isLiteral())
// Read the not's source directly and release the reference to the not.
2783 instr->opcode = aco_opcode::v_xnor_b32;
2784 instr->operands[i] = copy_operand(ctx, op_instr->operands[0]);
2785 decrease_uses(ctx, op_instr);
// Move a VGPR out of the first slot; if the second slot is still not a VGPR,
// switch to the VOP3 form.
2786 if (instr->operands[0].isOfType(RegType::vgpr))
2787 std::swap(instr->operands[0], instr->operands[1]);
2788 if (!instr->operands[1].isOfType(RegType::vgpr))
2789 instr->format = asVOP3(instr->format);
// Fold a not into the v_xor_b32 that feeds it: the xor is turned into
// v_xnor_b32 in place and takes over this instruction's definition.
// Instructions using modifiers and SDWA xors are not handled.
// NOTE(review): the statements following the guards and the tail of the
// function are not visible in this listing -- confirm.
2797 /* v_not(v_xor(a, b)) -> v_xnor(a, b) */
2799 combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2801 if (instr->usesModifiers())
// The producer is looked up with use counts respected.
2804 Instruction* op_instr = follow_operand(ctx, instr->operands[0]);
2805 if (!op_instr || op_instr->opcode != aco_opcode::v_xor_b32 || op_instr->isSDWA())
// This instruction stops using the xor result; the definitions are exchanged
// so the rewritten xor produces the final value.
2808 ctx.uses[instr->operands[0].tempId()]--;
2809 std::swap(instr->definitions[0], op_instr->definitions[0]);
2810 op_instr->opcode = aco_opcode::v_xnor_b32;
// Merge nested min/max instructions into a three-operand instruction.
//   opposite - the opcode of the opposite operation (max for min, and so on),
//              used for the second group of patterns.
//   op3src   - three-source opcode used for the min3 / max3 results.
// A further opcode named `minmax` is used for the gfx11 maxmin / minmax
// results; aco_opcode::num_opcodes appears to mean "not available".
// The first loop handles an inner instruction with the same opcode, the second
// loop one with the opposite opcode; a negation between the two instructions
// selects which of op3src / minmax is emitted.
// NOTE(review): this listing omits the end of the parameter list, the
// declarations of `omod` and `inbetween_neg`, parts of both conditions, the
// statements inside `if (inbetween_neg)` and the return statements -- confirm
// against the full file.
2816 combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
2819 /* TODO: this can handle SDWA min/max instructions by using opsel */
2821 /* min(min(a, b), c) -> min3(a, b, c)
2822 * max(max(a, b), c) -> max3(a, b, c)
2823 * gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
2824 * gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
2826 for (unsigned swap = 0; swap < 2; swap++) {
2827 Operand operands[3];
2828 bool clamp, precise;
2829 bitarray8 opsel = 0, neg = 0, abs = 0;
2832 if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
2833 neg, abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
2836 (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
// The merged operand is no longer read by this instruction.
2837 ctx.uses[instr->operands[swap].tempId()]--;
2838 if (inbetween_neg) {
2841 create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
2843 create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2849 /* min(-max(a, b), c) -> min3(-a, -b, c)
2850 * max(-min(a, b), c) -> max3(-a, -b, c)
2851 * gfx11: min(max(a, b), c) -> maxmin(a, b, c)
2852 * gfx11: max(min(a, b), c) -> minmax(a, b, c)
2854 for (unsigned swap = 0; swap < 2; swap++) {
2855 Operand operands[3];
2856 bool clamp, precise;
2857 bitarray8 opsel = 0, neg = 0, abs = 0;
2860 if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
2861 abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
2863 (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
// The merged operand is no longer read by this instruction.
2864 ctx.uses[instr->operands[swap].tempId()]--;
2865 if (inbetween_neg) {
2868 create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2870 create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
// Fold a scalar not into the and / or / xor that feeds it by rewriting the
// producer in place to the negated opcode and handing it both of this
// instruction's definitions.
// NOTE(review): the statements following the guards, the null check on the
// result of follow_operand() and the tail of the second switch are not visible
// in this listing -- confirm.
2878 /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
2879 * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
2880 * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
2881 * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
2882 * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
2883 * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
2885 combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2888 if (!instr->operands[0].isTemp())
// The second definition of the not must be unused.
2890 if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
// The producer is looked up with use counts respected.
2893 Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
// Only the six bitwise opcodes below are accepted.
2896 switch (op2_instr->opcode) {
2897 case aco_opcode::s_and_b32:
2898 case aco_opcode::s_or_b32:
2899 case aco_opcode::s_xor_b32:
2900 case aco_opcode::s_and_b64:
2901 case aco_opcode::s_or_b64:
2902 case aco_opcode::s_xor_b64: break;
2903 default: return false;
2906 /* create instruction */
// Exchange both definitions so the producer writes the final results, drop
// this instruction's use of its source and reset the label recorded for the
// producer's new first definition.
2907 std::swap(instr->definitions[0], op2_instr->definitions[0]);
2908 std::swap(instr->definitions[1], op2_instr->definitions[1]);
2909 ctx.uses[instr->operands[0].tempId()]--;
2910 ctx.info[op2_instr->definitions[0].tempId()].label = 0;
// Switch the producer to the negated opcode.
2912 switch (op2_instr->opcode) {
2913 case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
2914 case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
2915 case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
2916 case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
2917 case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
2918 case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
// Fold a scalar not feeding an and / or into the andn2 / orn2 form, rewriting
// `instr` in place. Results labelled as uniform bool are excluded by the
// first check.
// NOTE(review): the statements following the guards and the tail of the
// switch are not visible in this listing -- confirm.
2925 /* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
2926 * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
2927 * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
2928 * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
2930 combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2932 if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
2935 for (unsigned i = 0; i < 2; i++) {
// The producer is looked up with use counts respected and must be a not.
2936 Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
2937 if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
2938 op2_instr->opcode != aco_opcode::s_not_b64))
// The second definition of the not must be unused.
2940 if (ctx.uses[op2_instr->definitions[1].tempId()])
// Two different literals in the combined instruction are rejected.
2943 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2944 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
// Drop the use of the not result; the untouched operand goes to slot 0 and
// the not's source to slot 1.
2947 ctx.uses[instr->operands[i].tempId()]--;
2948 instr->operands[0] = instr->operands[!i];
2949 instr->operands[1] = op2_instr->operands[0];
2950 ctx.info[instr->definitions[0].tempId()].label = 0;
2952 switch (instr->opcode) {
2953 case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
2954 case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
2955 case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
2956 case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
// Fold a constant left shift by 1..4 feeding a scalar add into the matching
// s_lshl<n>_add_u32 opcode, rewriting `instr` in place.
// NOTE(review): the statements following the guards are not visible in this
// listing (presumably return / continue) -- confirm.
2965 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
2967 combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// For s_add_i32 the second definition must be unused.
2969 if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
2972 for (unsigned i = 0; i < 2; i++) {
// The producer is looked up with use counts ignored; it must be s_lshl_b32
// with an unused second definition.
2973 Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
2974 if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
2975 ctx.uses[op2_instr->definitions[1].tempId()])
2977 if (!op2_instr->operands[1].isConstant())
// Only shift amounts 1 to 4 have a fused opcode in the table below.
2980 uint32_t shift = op2_instr->operands[1].constantValue();
2981 if (shift < 1 || shift > 4)
// Two different literals in the combined instruction are rejected.
2984 if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2985 instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
// The shifted value goes to slot 0, the other addend to slot 1.
2988 instr->operands[1] = instr->operands[!i];
2989 instr->operands[0] = copy_operand(ctx, op2_instr->operands[0]);
2990 decrease_uses(ctx, op2_instr);
2991 ctx.info[instr->definitions[0].tempId()].label = 0;
// Opcode table indexed by shift - 1.
2993 instr->opcode = std::array<aco_opcode, 4>{
2994 aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
2995 aco_opcode::s_lshl4_add_u32}[shift - 1];
3002 /* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b)
3003 * s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b)
// Rewrites the add/sub feeding an s_abs_i32 into s_absdiff_i32 and lets it take
// over the definitions of the s_abs_i32. For an add, one operand has to be a
// known 32-bit constant, which is replaced by its negation.
3006 combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Operand 0 has to be a temporary labelled as add/sub.
3008 if (!instr->operands[0].isTemp() || !ctx.info[instr->operands[0].tempId()].is_add_sub())
3011 Instruction* op_instr = follow_operand(ctx, instr->operands[0], false);
// Add case: look for a constant operand that can be negated.
3015 if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) {
3016 for (unsigned i = 0; i < 2; i++) {
// Tests that the other operand is not a literal and that operand i resolves to a constant.
3018 if (op_instr->operands[!i].isLiteral() ||
3019 !is_operand_constant(ctx, op_instr->operands[i], 32, &constant))
// If the constant came from a temporary, that temporary loses one use.
3022 if (op_instr->operands[i].isTemp())
3023 ctx.uses[op_instr->operands[i].tempId()]--;
// Rebuild the operands as (other, -constant) so the add becomes a difference.
3024 op_instr->operands[0] = op_instr->operands[!i];
3025 op_instr->operands[1] = Operand::c32(-int32_t(constant));
// The producer becomes the absdiff and exchanges both definitions with instr,
// so it now writes the results that users of the s_abs_i32 read.
3032 op_instr->opcode = aco_opcode::s_absdiff_i32;
3033 std::swap(instr->definitions[0], op_instr->definitions[0]);
3034 std::swap(instr->definitions[1], op_instr->definitions[1]);
// Decrement the use count of the temporary read by operand 0 of instr.
3035 ctx.uses[instr->operands[0].tempId()]--;
3040 /* s_cmp_{lg,eq}(s_and(a, s_lshl(1, b)), 0) -> s_bitcmp[10](a, b)*/
// Rewrites a scalar compare of (a & single_bit) into s_bitcmp0/s_bitcmp1 in
// 32-bit or 64-bit form. The single bit comes either from s_lshl(1, b) or from
// a constant mask that is a non-zero power of two.
3042 combine_s_bitcmp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Classify the compare: lg is set for the not-equal forms, b64 for the 64-bit forms.
3046 switch (instr->opcode) {
3047 case aco_opcode::s_cmp_lg_i32:
3048 case aco_opcode::s_cmp_lg_u32: lg = true; break;
3049 case aco_opcode::s_cmp_eq_i32:
3050 case aco_opcode::s_cmp_eq_u32: break;
3051 case aco_opcode::s_cmp_lg_u64: lg = true; FALLTHROUGH;
3052 case aco_opcode::s_cmp_eq_u64: b64 = true; break;
3053 default: return false;
// Select the and/shift opcodes that match the compare width.
3056 aco_opcode s_and = b64 ? aco_opcode::s_and_b64 : aco_opcode::s_and_b32;
3057 aco_opcode s_lshl = b64 ? aco_opcode::s_lshl_b64 : aco_opcode::s_lshl_b32;
// Either compare operand may be the s_and result.
3059 for (unsigned cmp_idx = 0; cmp_idx < 2; cmp_idx++) {
3060 Instruction* and_instr = follow_operand(ctx, instr->operands[cmp_idx], false);
3061 if (!and_instr || and_instr->opcode != s_and)
// Either s_and operand may supply the bit.
3064 for (unsigned and_idx = 0; and_idx < 2; and_idx++) {
3065 Instruction* lshl_instr = follow_operand(ctx, and_instr->operands[and_idx], true);
// Drop the shift candidate unless it shifts the constant 1; also drop it when the
// shift amount and the other s_and operand are both literals.
3066 if (!lshl_instr || lshl_instr->opcode != s_lshl ||
3067 !lshl_instr->operands[0].constantEquals(1) ||
3068 (lshl_instr->operands[1].isLiteral() && and_instr->operands[!and_idx].isLiteral()))
3069 lshl_instr = nullptr;
3073 (!is_operand_constant(ctx, and_instr->operands[and_idx], b64 ? 64 : 32, &constant) ||
3074 !util_is_power_of_two_or_zero64(constant) || constant == 0))
// The other compare operand is either zero or the very same shifted bit; in the
// latter case the compare no longer reads the shift result, so its use count drops.
3078 if (instr->operands[!cmp_idx].constantEquals(0)) {
3080 } else if (lshl_instr && instr->operands[!cmp_idx].isTemp() &&
3081 instr->operands[!cmp_idx].tempId() == lshl_instr->definitions[0].tempId()) {
3083 ctx.uses[lshl_instr->definitions[0].tempId()]--;
// Pick the opcode from test1 (bit expected set or clear) and the width.
3089 instr->opcode = aco_opcode::s_bitcmp1_b64;
3090 else if (!test1 && b64)
3091 instr->opcode = aco_opcode::s_bitcmp0_b64;
3092 else if (test1 && !b64)
3093 instr->opcode = aco_opcode::s_bitcmp1_b32;
3095 instr->opcode = aco_opcode::s_bitcmp0_b32;
// Operand 0 becomes the tested value, operand 1 the bit index.
3097 instr->operands[0] = copy_operand(ctx, and_instr->operands[!and_idx]);
3098 decrease_uses(ctx, and_instr);
3100 instr->operands[1] = copy_operand(ctx, lshl_instr->operands[1]);
3101 decrease_op_uses_if_dead(ctx, lshl_instr);
// Constant mask: ffsll returns a 1-based position of the lowest set bit, hence the -1.
3103 instr->operands[1] = Operand::c32(ffsll(constant) - 1);
// Replaces an add/sub that has a single-use, b2i-labelled operand by new_op,
// passing the temporary recorded for that b2i as third operand (presumably the
// carry/borrow input of new_op -- TODO confirm). ops is a bitmask: bit i says
// whether operand i is considered as the b2i operand.
3113 combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
3115 if (instr->usesModifiers())
3118 for (unsigned i = 0; i < 2; i++) {
// Tests bit i of the ops mask.
3119 if (!((1 << i) & ops))
// The b2i result must be used by this instruction only.
3121 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
3122 ctx.uses[instr->operands[i].tempId()] == 1) {
// Encoding choice: plain VOP2 when the other operand is a VGPR temporary; the VOP3
// form when the target is GFX10+ or the other operand is a non-literal constant.
3124 aco_ptr<Instruction> new_instr;
3125 if (instr->operands[!i].isTemp() &&
3126 instr->operands[!i].getTemp().type() == RegType::vgpr) {
3127 new_instr.reset(create_instruction<VALU_instruction>(new_op, Format::VOP2, 3, 2));
3128 } else if (ctx.program->gfx_level >= GFX10 ||
3129 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3131 create_instruction<VALU_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
// The b2i temporary is no longer read by this instruction.
3135 ctx.uses[instr->operands[i].tempId()]--;
3136 new_instr->definitions[0] = instr->definitions[0];
// Reuse an existing second definition; otherwise allocate a lane-mask temporary for it.
3137 if (instr->definitions.size() == 2) {
3138 new_instr->definitions[1] = instr->definitions[1];
3140 new_instr->definitions[1] =
3141 Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
3142 /* Make sure the uses vector is large enough and the number of
3143 * uses properly initialized to 0.
3145 ctx.uses.push_back(0);
// Operands: zero, the untouched source, and the temporary recorded for the b2i.
3147 new_instr->operands[0] = Operand::zero();
3148 new_instr->operands[1] = instr->operands[!i];
3149 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
3150 new_instr->pass_flags = instr->pass_flags;
// instr now owns the replacement; label its result as add/sub.
3151 instr = std::move(new_instr);
3152 ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
// Folds an add of v_bcnt_u32_b32(a, 0) and b into a single v_bcnt_u32_b32(a, b),
// using the second operand of the bit count as the addend.
3161 combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3163 if (instr->usesModifiers())
// Try each of the two operands as the bit-count result.
3166 for (unsigned i = 0; i < 2; i++) {
3167 Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
// Requires a modifier-free v_bcnt_u32_b32 of a VGPR temporary whose own addend is zero.
3168 if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
3169 !op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
3170 op_instr->operands[0].getTemp().type() == RegType::vgpr &&
3171 op_instr->operands[1].constantEquals(0)) {
3172 aco_ptr<Instruction> new_instr{
3173 create_instruction<VALU_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
// The old bit-count result is no longer read by this instruction.
3174 ctx.uses[instr->operands[i].tempId()]--;
// Operand 0 is the value whose bits are counted, operand 1 the former other addend.
3175 new_instr->operands[0] = op_instr->operands[0];
3176 new_instr->operands[1] = instr->operands[!i];
3177 new_instr->definitions[0] = instr->definitions[0];
3178 new_instr->pass_flags = instr->pass_flags;
3179 instr = std::move(new_instr);
// Labels recorded for the old add are cleared.
3180 ctx.info[instr->definitions[0].tempId()].label = 0;
// Looks up the opcode family of a VALU min or max opcode. For a recognised op
// the out-parameters receive the min, max, med3, min3 and max3 opcodes of the
// same type. minmax receives v_maxmin when op is the min opcode and v_minmax
// otherwise (MINMAX types), or aco_opcode::num_opcodes for the 16-bit integer
// types instantiated below. some_gfx9_only is set from the macro argument.
// Opcodes that match no case reach the default and return false.
3190 get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
3191 aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only)
3194 #define MINMAX(type, gfx9) \
3195 case aco_opcode::v_min_##type: \
3196 case aco_opcode::v_max_##type: \
3197 *min = aco_opcode::v_min_##type; \
3198 *max = aco_opcode::v_max_##type; \
3199 *med3 = aco_opcode::v_med3_##type; \
3200 *min3 = aco_opcode::v_min3_##type; \
3201 *max3 = aco_opcode::v_max3_##type; \
3202 *minmax = op == *min ? aco_opcode::v_maxmin_##type : aco_opcode::v_minmax_##type; \
3203 *some_gfx9_only = gfx9; \
3205 #define MINMAX_INT16(type, gfx9) \
3206 case aco_opcode::v_min_##type: \
3207 case aco_opcode::v_max_##type: \
3208 *min = aco_opcode::v_min_##type; \
3209 *max = aco_opcode::v_max_##type; \
3210 *med3 = aco_opcode::v_med3_##type; \
3211 *min3 = aco_opcode::v_min3_##type; \
3212 *max3 = aco_opcode::v_max3_##type; \
3213 *minmax = aco_opcode::num_opcodes; \
3214 *some_gfx9_only = gfx9; \
3216 #define MINMAX_INT16_E64(type, gfx9) \
3217 case aco_opcode::v_min_##type##_e64: \
3218 case aco_opcode::v_max_##type##_e64: \
3219 *min = aco_opcode::v_min_##type##_e64; \
3220 *max = aco_opcode::v_max_##type##_e64; \
3221 *med3 = aco_opcode::v_med3_##type; \
3222 *min3 = aco_opcode::v_min3_##type; \
3223 *max3 = aco_opcode::v_max3_##type; \
3224 *minmax = aco_opcode::num_opcodes; \
3225 *some_gfx9_only = gfx9; \
3231 MINMAX_INT16(u16, true)
3232 MINMAX_INT16(i16, true)
3233 MINMAX_INT16_E64(u16, true)
3234 MINMAX_INT16_E64(i16, true)
3235 #undef MINMAX_INT16_E64
3238 default: return false;
3243 * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
3244 * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
// Combines min(max(a, lb), ub) or max(min(a, ub), lb) with two constant bounds
// into a single med3. min and max are the opcodes of the type being handled;
// instr is the outer min/max and is replaced on success.
3247 combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
3250 /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
3251 * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
3252 * minVal > maxVal, which means we can always select it to a v_med3_f32 */
// other_op is the opcode expected for the inner instruction (chosen from instr->opcode).
3253 aco_opcode other_op;
3254 if (instr->opcode == min)
3256 else if (instr->opcode == max)
// Try each of the two operands of instr as the inner min/max.
3261 for (unsigned swap = 0; swap < 2; swap++) {
3262 Operand operands[3];
3263 bool clamp, precise;
3264 bitarray8 opsel = 0, neg = 0, abs = 0;
3266 if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
3267 abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
3268 /* max(min(src, upper), lower) returns upper if src is NaN, but
3269 * med3(src, lower, upper) returns lower.
3271 if (precise && instr->opcode != min &&
3272 (min == aco_opcode::v_min_f16 || min == aco_opcode::v_min_f32))
// Find the two constant operands among the three; const0/const1 hold their raw bits.
3275 int const0_idx = -1, const1_idx = -1;
3276 uint32_t const0 = 0, const1 = 0;
3277 for (int i = 0; i < 3; i++) {
// When opsel selects the high half, the upper 16 bits of the constant are used.
3279 bool hi16 = opsel & (1 << i);
3280 if (operands[i].isConstant()) {
3281 val = hi16 ? operands[i].constantValue16(true) : operands[i].constantValue();
3282 } else if (operands[i].isTemp() &&
3283 ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
3284 val = ctx.info[operands[i].tempId()].val >> (hi16 ? 16 : 0);
3288 if (const0_idx >= 0) {
// Both bounds have to be constants.
3296 if (const0_idx < 0 || const1_idx < 0)
// Decide which constant is the lower bound, comparing in the type selected by min.
3299 int lower_idx = const0_idx;
3301 case aco_opcode::v_min_f32:
3302 case aco_opcode::v_min_f16: {
3303 float const0_f, const1_f;
3304 if (min == aco_opcode::v_min_f32) {
3305 memcpy(&const0_f, &const0, 4);
3306 memcpy(&const1_f, &const1, 4);
3308 const0_f = _mesa_half_to_float(const0);
3309 const1_f = _mesa_half_to_float(const1);
// Apply the abs and neg input modifiers (abs first) before comparing the values.
3311 if (abs[const0_idx])
3312 const0_f = fabsf(const0_f);
3313 if (abs[const1_idx])
3314 const1_f = fabsf(const1_f);
3315 if (neg[const0_idx])
3316 const0_f = -const0_f;
3317 if (neg[const1_idx])
3318 const1_f = -const1_f;
3319 lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
3322 case aco_opcode::v_min_u32: {
3323 lower_idx = const0 < const1 ? const0_idx : const1_idx;
3326 case aco_opcode::v_min_u16:
3327 case aco_opcode::v_min_u16_e64: {
3328 lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
3331 case aco_opcode::v_min_i32: {
// Signed cases: values with the sign bit set are rebuilt as the type minimum plus
// the remaining low bits.
3333 const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
3335 const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
3336 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3339 case aco_opcode::v_min_i16:
3340 case aco_opcode::v_min_i16_e64: {
3341 int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
3342 int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
3343 lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3348 int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
// Check which bound sits at operand index 0; the expected placement depends on
// whether instr is the min or the max.
3350 if (instr->opcode == min) {
3351 if (upper_idx != 0 || lower_idx == 0)
3354 if (upper_idx == 0 || lower_idx != 0)
// The inner result loses one use, then instr is rebuilt as the med3.
3358 ctx.uses[instr->operands[swap].tempId()]--;
3359 create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
// Replaces operands that are known copies (or extracts) of SGPRs by the SGPR
// itself, as long as the number of distinct SGPRs read stays within max_sgprs.
// May swap operands or change the instruction format to make the SGPR legal.
3369 apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// 64-bit shifts are excluded from the GFX10+ case tested further down.
3371 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
3372 instr->opcode == aco_opcode::v_lshrrev_b64 ||
3373 instr->opcode == aco_opcode::v_ashrrev_i64;
3375 /* find candidates and create the set of sgprs already read */
3376 unsigned sgpr_ids[2] = {0, 0};
3377 uint32_t operand_mask = 0;
3378 bool has_literal = false;
3379 for (unsigned i = 0; i < instr->operands.size(); i++) {
3380 if (instr->operands[i].isLiteral())
3382 if (!instr->operands[i].isTemp())
3384 if (instr->operands[i].getTemp().type() == RegType::sgpr) {
// Record the SGPR id in the first free slot; id 0 doubles as the empty marker.
3385 if (instr->operands[i].tempId() != sgpr_ids[0])
3386 sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
// Bit i of operand_mask marks operand i as a candidate for SGPR substitution.
3388 ssa_info& info = ctx.info[instr->operands[i].tempId()];
3389 if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::sgpr)
3390 operand_mask |= 1u << i;
3391 if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
3392 operand_mask |= 1u << i;
3394 unsigned max_sgprs = 1;
3395 if (ctx.program->gfx_level >= GFX10 && !is_shift64)
// Number of distinct SGPRs the instruction already reads (0, 1 or 2).
3400 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
3402 /* keep on applying sgprs until there is nothing left to be done */
3403 while (operand_mask) {
3404 uint32_t sgpr_idx = 0;
3405 uint32_t sgpr_info_id = 0;
3406 uint32_t mask = operand_mask;
// Among the candidates, select the one whose temporary has the fewest uses.
3409 unsigned i = u_bit_scan(&mask);
3410 uint16_t uses = ctx.uses[instr->operands[i].tempId()];
3411 if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
3413 sgpr_info_id = instr->operands[i].tempId();
// The selected operand is removed from the candidate set before it is processed.
3416 operand_mask &= ~(1u << sgpr_idx);
3418 ssa_info& info = ctx.info[sgpr_info_id];
3420 /* Applying two sgprs require making it VOP3, so don't do it unless it's
3421 * definitively beneficial.
3422 * TODO: this is too conservative because later the use count could be reduced to 1 */
3423 if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
3424 !instr->isSDWA() && instr->format != Format::VOP3P)
// For an extract the SGPR is the source of the extract, otherwise the copied temporary.
3427 Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
// An SGPR that is already read does not count against the limit again.
3428 bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
3429 if (new_sgpr && num_sgprs >= max_sgprs)
3433 instr->format = withoutDPP(instr->format);
3435 if (sgpr_idx == 1 && instr->isDPP())
// Three ways to place the SGPR: directly, by swapping it into operand 0, or by
// converting the instruction to VOP3.
3438 if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
3439 info.is_extract()) {
3440 /* can_apply_extract() checks SGPR encoding restrictions */
3441 if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
3442 apply_extract(ctx, instr, sgpr_idx, info);
3443 else if (info.is_extract())
3445 instr->operands[sgpr_idx] = Operand(sgpr);
3446 } else if (can_swap_operands(instr, &instr->opcode) && !instr->valu().opsel[sgpr_idx]) {
3447 instr->operands[sgpr_idx] = instr->operands[0];
3448 instr->operands[0] = Operand(sgpr);
3449 instr->valu().opsel[0].swap(instr->valu().opsel[sgpr_idx]);
3450 /* swap bits using a 4-entry LUT */
3451 uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
3452 operand_mask = (operand_mask & ~0x3) | swapped;
3453 } else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
3454 instr->format = asVOP3(instr->format);
3455 instr->operands[sgpr_idx] = Operand(sgpr);
// Bookkeeping: remember the SGPR and move one use from the old temporary to it.
3461 sgpr_ids[num_sgprs++] = sgpr.id();
3462 ctx.uses[sgpr_info_id]--;
3463 ctx.uses[sgpr.id()]++;
3465 /* TODO: handle when it's a VGPR */
// If the substituted SGPR itself carries an extract/temp label with an SGPR temp,
// the operand is queued again for another round.
3466 if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
3467 ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
3468 operand_mask |= 1u << sgpr_idx;
3472 /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
// Folds a following omod (x2, x4, x0.5 per the omod2/omod4/omod5 labels) or
// clamp into instr as an output modifier and lets instr write the result of
// the instruction that carried the modifier.
3474 apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Needs a result with exactly one use and an opcode that supports output modifiers.
3476 if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
3477 !instr_info.can_use_output_modifiers[(int)instr->opcode])
3480 bool can_vop3 = can_use_VOP3(ctx, instr);
3482 instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
3483 if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
3486 /* omod flushes -0 to +0 and has no effect if denormals are enabled. SDWA omod is GFX9+. */
3487 bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
// The float-mode fields consulted depend on the result size: the 32-bit ones for
// 4-byte results, the 16/64-bit ones otherwise.
3488 if (instr->definitions[0].bytes() == 4)
3490 can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;
3492 can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
3493 !ctx.fp_mode.preserve_signed_zero_inf_nan16_64;
3495 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
// The result must be labelled as clamp, or as one of the omod kinds when omod is usable.
3497 uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
3498 if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
3500 /* if the omod/clamp instruction is dead, then the single user of this
3501 * instruction is a different instruction */
3502 if (!ctx.uses[def_info.instr->definitions[0].tempId()])
// Compares the result size of the modifier instruction with that of instr.
3505 if (def_info.instr->definitions[0].bytes() != instr->definitions[0].bytes())
3508 /* MADs/FMAs are created later, so we don't have to update the original add */
3509 assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
3511 if (!instr->isSDWA() && !instr->isVOP3P())
3512 instr->format = asVOP3(instr->format);
// An omod is being applied (not a clamp) while instr already has clamp or omod set.
3514 if (!def_info.is_clamp() && (instr->valu().clamp || instr->valu().omod))
// Translate the label into the omod field value (1, 2 or 3) or set the clamp bit.
3517 if (def_info.is_omod2())
3518 instr->valu().omod = 1;
3519 else if (def_info.is_omod4())
3520 instr->valu().omod = 2;
3521 else if (def_info.is_omod5())
3522 instr->valu().omod = 3;
3523 else if (def_info.is_clamp())
3524 instr->valu().clamp = true;
// instr takes over the result temporary of the modifier instruction; only the
// clamp, insert and f2f16 labels are kept on the new result.
3526 instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3527 ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
3528 ctx.uses[def_info.instr->definitions[0].tempId()]--;
3533 /* Combine a p_insert (or p_extract, in some cases) instruction with instr.
3534 * p_insert(instr(...)) -> instr_insert().
// Folds a following insert into instr by converting instr to SDWA and using
// the insert selection as its destination selection; instr then writes the
// result temporary of the insert.
3537 apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Needs a result with exactly one use ...
3539 if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
// ... and that result must be labelled as feeding an insert.
3542 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3543 if (!def_info.is_insert())
3545 /* if the insert instruction is dead, then the single user of this
3546 * instruction is a different instruction */
3547 if (!ctx.uses[def_info.instr->definitions[0].tempId()])
3550 /* MADs/FMAs are created later, so we don't have to update the original add */
3551 assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
3553 SubdwordSel sel = parse_insert(def_info.instr);
3556 if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
3559 convert_to_SDWA(ctx.program->gfx_level, instr);
// Tests whether the destination selection covers the full 4 bytes.
3560 if (instr->sdwa().dst_sel.size() != 4)
3562 instr->sdwa().dst_sel = sel;
// instr takes over the result temporary of the insert; its labels are cleared
// and the insert result loses one use.
3564 instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3565 ctx.info[instr->definitions[0].tempId()].label = 0;
3566 ctx.uses[def_info.instr->definitions[0].tempId()]--;
3571 /* Remove superfluous extract after ds_read like so:
3572 * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
// Folds an unsigned p_extract at index 0 into the DS load that produces its
// operand, shrinking the load to 8 or 16 bits when fewer bits are extracted,
// and lets the load write the result of the extract.
3575 apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
3577 /* Check if p_extract has a usedef operand and is the only user. */
3578 if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
3579 ctx.uses[extract->operands[0].tempId()] > 1)
3582 /* Check if the usedef is a DS instruction. */
3583 Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
3584 if (ds->format != Format::DS)
// p_extract operands 1..3 are read as index, bit count and sign-extension flag.
3587 unsigned extract_idx = extract->operands[1].constantValue();
3588 unsigned bits_extracted = extract->operands[2].constantValue();
3589 unsigned sign_ext = extract->operands[3].constantValue();
3590 unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
3592 /* TODO: These are doable, but probably don't occur too often. */
3593 if (extract_idx || sign_ext || dst_bitsize != 32)
// Determine how many bits the load provides from its opcode (u8 or u16 variants).
3596 unsigned bits_loaded = 0;
3597 if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
3599 else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
3604 /* Shrink the DS load if the extracted bit size is smaller. */
3605 bits_loaded = MIN2(bits_loaded, bits_extracted);
3607 /* Change the DS opcode so it writes the full register. */
3608 if (bits_loaded == 8)
3609 ds->opcode = aco_opcode::ds_read_u8;
3610 else if (bits_loaded == 16)
3611 ds->opcode = aco_opcode::ds_read_u16;
3613 unreachable("Forgot to add DS opcode above.");
3615 /* The DS now produces the exact same thing as the extract, remove the extract. */
// After the swap the extract holds the old load result, whose use count is zeroed,
// and the labels of the new load result are cleared.
3616 std::swap(ds->definitions[0], extract->definitions[0]);
3617 ctx.uses[extract->definitions[0].tempId()] = 0;
3618 ctx.info[ds->definitions[0].tempId()].label = 0;
3622 /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
// Replaces v_and(a, v_subbrev_co(0, 0, cond)) by v_cndmask_b32(0, a, cond).
3624 combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3626 if (instr->usesModifiers())
// Try each of the two operands as the v_subbrev_co_u32 result.
3629 for (unsigned i = 0; i < 2; i++) {
3630 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
// Both value operands of the subbrev must be zero and it must not use modifiers.
3631 if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
3632 op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
3633 !op_instr->usesModifiers()) {
// Encoding choice: plain VOP2 when the other operand is a VGPR temporary; the VOP3
// form when the target is GFX10+ or the other operand is a non-literal constant.
3635 aco_ptr<Instruction> new_instr;
3636 if (instr->operands[!i].isTemp() &&
3637 instr->operands[!i].getTemp().type() == RegType::vgpr) {
3639 create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
3640 } else if (ctx.program->gfx_level >= GFX10 ||
3641 (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3642 new_instr.reset(create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32,
3643 asVOP3(Format::VOP2), 3, 1));
// Operands: zero, the other source of the and, and the third operand of the subbrev.
3648 new_instr->operands[0] = Operand::zero();
3649 new_instr->operands[1] = instr->operands[!i];
3650 new_instr->operands[2] = copy_operand(ctx, op_instr->operands[2]);
3651 new_instr->definitions[0] = instr->definitions[0];
3652 new_instr->pass_flags = instr->pass_flags;
3653 instr = std::move(new_instr);
3654 decrease_uses(ctx, op_instr);
// Labels recorded for the old and are cleared.
3655 ctx.info[instr->definitions[0].tempId()].label = 0;
3663 /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
3664 * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
3665 * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
3666 * v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
// Replaces an add (or, with is_sub, a subtract) of a constant left shift by a
// 24-bit multiply-add: v_mad_u32_u24 for adds, v_mad_i32_i24 with a negated
// multiplier for subtracts.
3669 combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
3671 if (instr->usesModifiers())
3674 /* Substractions: start at operand 1 to avoid mixup such as
3675 * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
3677 unsigned start_op_idx = is_sub ? 1 : 0;
3679 /* Don't allow 24-bit operands on subtraction because
3680 * v_mad_i32_i24 applies a sign extension.
3682 bool allow_24bit = !is_sub;
3684 for (unsigned i = start_op_idx; i < 2; i++) {
3685 Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
3689 if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
3690 op_instr->opcode != aco_opcode::v_lshlrev_b32)
// The shift amount is operand 1 for s_lshl_b32 and operand 0 for v_lshlrev_b32.
3693 int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
// Constant shift amount; the shifted value must be flagged 16-bit, or 24-bit when allowed.
3695 if (op_instr->operands[shift_op_idx].isConstant() &&
3696 ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
3697 op_instr->operands[!shift_op_idx].is16bit())) {
// The shift becomes a multiplication by 1 << (amount mod 32).
3698 uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
3700 multiplier = -multiplier;
// Range check on the multiplier: above 0xffffff for adds, below 0xff800000 for
// the negated multiplier of subtracts.
3701 if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
3705 op_instr->operands[!shift_op_idx],
3706 Operand::c32(multiplier),
3707 instr->operands[!i],
3709 if (!check_vop3_operands(ctx, 3, ops))
// The shift result is no longer read by this instruction.
3712 ctx.uses[instr->operands[i].tempId()]--;
3714 aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
3715 aco_ptr<VALU_instruction> new_instr{
3716 create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1)};
3717 for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
3718 new_instr->operands[op_idx] = ops[op_idx];
3719 new_instr->definitions[0] = instr->definitions[0];
3720 new_instr->pass_flags = instr->pass_flags;
3721 instr = std::move(new_instr);
// Labels recorded for the old add/sub are cleared.
3722 ctx.info[instr->definitions[0].tempId()].label = 0;
// Applies a swizzle of the result of instr to its operands instead. opsel_lo
// and opsel_hi describe which half of the result the consumer reads for its
// low and high lane; instr is modified in place.
3731 propagate_swizzles(VALU_instruction* instr, bool opsel_lo, bool opsel_hi)
3733 /* propagate swizzles which apply to a result down to the instruction's operands:
3734 * result = a.xy + b.xx -> result.yx = a.yx + b.xx */
// Snapshot the current selections and negations so both branches read the old values.
3735 uint8_t tmp_lo = instr->opsel_lo;
3736 uint8_t tmp_hi = instr->opsel_hi;
3737 uint8_t neg_lo = instr->neg_lo;
3738 uint8_t neg_hi = instr->neg_hi;
// Low lane reads the high half: the low lane takes over the high-lane settings.
3739 if (opsel_lo == 1) {
3740 instr->opsel_lo = tmp_hi;
3741 instr->neg_lo = neg_hi;
// High lane reads the low half: the high lane takes over the low-lane settings.
3743 if (opsel_hi == 0) {
3744 instr->opsel_hi = tmp_lo;
3745 instr->neg_hi = neg_lo;
// Peephole combines for packed (VOP3P) instructions, in three parts:
//  - a clamping v_pk_mul_f16 by 1.0 is folded into its producer as a clamp,
//  - v_pk_mul_f16 by 1.0 producers are folded into operands as neg modifiers,
//  - a packed add of a mul result becomes v_pk_fma_f16 / v_pk_mad_u16.
3750 combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3752 VALU_instruction* vop3p = &instr->valu();
// Part 1: instr is a clamped multiply by 0x3C00 (1.0 in half precision) of a single-use temporary.
3755 if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
3756 vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 &&
3757 !vop3p->opsel_lo[1] && !vop3p->opsel_hi[1]) {
3759 ssa_info& info = ctx.info[instr->operands[0].tempId()];
3760 if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
// The producer gets the clamp and the swizzle of the multiply, and takes over its result.
3761 VALU_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->valu();
3762 candidate->clamp = true;
3763 propagate_swizzles(candidate, vop3p->opsel_lo[0], vop3p->opsel_hi[0]);
3764 instr->definitions[0].swapTemp(candidate->definitions[0]);
3765 ctx.info[candidate->definitions[0].tempId()].instr = candidate;
3766 ctx.uses[instr->definitions[0].tempId()]--;
3771 /* check for fneg modifiers */
3772 for (unsigned i = 0; i < instr->operands.size(); i++) {
3773 if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
3775 Operand& op = instr->operands[i];
// Part 2: the operand is produced by a v_pk_mul_f16 whose second source is 0x3C00.
3779 ssa_info& info = ctx.info[op.tempId()];
3780 if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
3781 info.instr->operands[1].constantEquals(0x3C00)) {
3783 VALU_instruction* fneg = &info.instr->valu();
3785 if (fneg->opsel_lo[1] || fneg->opsel_hi[1])
// Check that the operand set is still encodable with the source of the multiply substituted.
3789 for (unsigned j = 0; j < instr->operands.size(); j++)
3790 ops[j] = instr->operands[j];
3791 ops[i] = info.instr->operands[0];
3792 if (!check_vop3_operands(ctx, instr->operands.size(), ops))
3797 instr->operands[i] = fneg->operands[0];
3799 /* opsel_lo/hi is either 0 or 1:
3800 * if 0 - pick selection from fneg->lo
3801 * if 1 - pick selection from fneg->hi
3803 bool opsel_lo = vop3p->opsel_lo[i];
3804 bool opsel_hi = vop3p->opsel_hi[i];
// Effective negation of each half of the multiply result: xor of both source negations.
3805 bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1];
3806 bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1];
3807 vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
3808 vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
3809 vop3p->opsel_lo[i] ^= opsel_lo ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
3810 vop3p->opsel_hi[i] ^= opsel_hi ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
// If the multiply stays alive for other users, its source gains this instruction as a user.
3812 if (--ctx.uses[fneg->definitions[0].tempId()])
3813 ctx.uses[fneg->operands[0].tempId()]++;
// Part 3: packed add combined with a mul producer.
3817 if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
3818 bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
3819 if (fadd && instr->definitions[0].isPrecise())
3822 Instruction* mul_instr = nullptr;
3823 unsigned add_op_idx = 0;
3824 bitarray8 mul_neg_lo = 0, mul_neg_hi = 0, mul_opsel_lo = 0, mul_opsel_hi = 0;
3825 uint32_t uses = UINT32_MAX;
3827 /* find the 'best' mul instruction to combine with the add */
3828 for (unsigned i = 0; i < 2; i++) {
3829 Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
// Case A: the producer is itself a packed instruction.
3833 if (ctx.info[instr->operands[i].tempId()].is_vop3p()) {
3835 if (op_instr->opcode != aco_opcode::v_pk_mul_f16 ||
3836 op_instr->definitions[0].isPrecise())
3839 if (op_instr->opcode != aco_opcode::v_pk_mul_lo_u16)
// A candidate is tested against the fewest uses seen so far and for encodable operands.
3843 Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
3844 if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3847 /* no clamp allowed between mul and add */
3848 if (op_instr->valu().clamp)
3851 mul_instr = op_instr;
3853 uses = ctx.uses[instr->operands[i].tempId()];
3854 mul_neg_lo = mul_instr->valu().neg_lo;
3855 mul_neg_hi = mul_instr->valu().neg_hi;
3856 mul_opsel_lo = mul_instr->valu().opsel_lo;
3857 mul_opsel_hi = mul_instr->valu().opsel_hi;
// Case B: a 2-byte operand produced by a non-packed 16-bit multiply.
3858 } else if (instr->operands[i].bytes() == 2) {
3859 if ((fadd && (op_instr->opcode != aco_opcode::v_mul_f16 ||
3860 op_instr->definitions[0].isPrecise())) ||
3861 (!fadd && op_instr->opcode != aco_opcode::v_mul_lo_u16 &&
3862 op_instr->opcode != aco_opcode::v_mul_lo_u16_e64))
3865 if (op_instr->valu().clamp || op_instr->valu().omod || op_instr->valu().abs)
3868 if (op_instr->isDPP() || (op_instr->isSDWA() && (op_instr->sdwa().sel[0].size() < 2 ||
3869 op_instr->sdwa().sel[1].size() < 2)))
3872 Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
3873 if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3876 mul_instr = op_instr;
3878 uses = ctx.uses[instr->operands[i].tempId()];
// The scalar multiply has one set of modifiers, which is used for both lanes.
3879 mul_neg_lo = mul_instr->valu().neg;
3880 mul_neg_hi = mul_instr->valu().neg;
3881 if (mul_instr->isSDWA()) {
3882 for (unsigned j = 0; j < 2; j++)
3883 mul_opsel_lo[j] = mul_instr->sdwa().sel[j].offset();
3885 mul_opsel_lo = mul_instr->valu().opsel;
3887 mul_opsel_hi = mul_opsel_lo;
3894 /* turn mul + packed add into v_pk_fma_f16 */
3895 aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
3896 aco_ptr<VALU_instruction> fma{create_instruction<VALU_instruction>(mad, Format::VOP3P, 3, 1)};
// Operands 0 and 1 come from the multiply, operand 2 is the remaining addend.
3897 fma->operands[0] = copy_operand(ctx, mul_instr->operands[0]);
3898 fma->operands[1] = copy_operand(ctx, mul_instr->operands[1]);
3899 fma->operands[2] = instr->operands[add_op_idx];
3900 fma->clamp = vop3p->clamp;
3901 fma->neg_lo = mul_neg_lo;
3902 fma->neg_hi = mul_neg_hi;
3903 fma->opsel_lo = mul_opsel_lo;
3904 fma->opsel_hi = mul_opsel_hi;
// The swizzle the add applied to the mul result is pushed down onto the mul sources.
3905 propagate_swizzles(fma.get(), vop3p->opsel_lo[1 - add_op_idx],
3906 vop3p->opsel_hi[1 - add_op_idx]);
// The addend keeps the modifiers it had on the add.
3907 fma->opsel_lo[2] = vop3p->opsel_lo[add_op_idx];
3908 fma->opsel_hi[2] = vop3p->opsel_hi[add_op_idx];
3909 fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
3910 fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
// A negation the add applied to the mul result is folded into the second mul source.
3911 fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
3912 fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
3913 fma->definitions[0] = instr->definitions[0];
3914 fma->pass_flags = instr->pass_flags;
3915 instr = std::move(fma);
3916 ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
3917 decrease_uses(ctx, mul_instr);
// Tells whether instr already is, or may be converted to, a v_fma_mix
// instruction. Existing v_fma_mix_f32/v_fma_mixlo_f16 return true; opcodes
// outside the switch return false; the 32-bit add/sub/mul/fma opcodes are
// accepted when they use neither omod, SDWA nor DPP.
3923 can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// Tests for targets older than GFX9.
3925 if (ctx.program->gfx_level < GFX9)
3928 /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */
3929 if (ctx.program->gfx_level == GFX9 && ctx.fp_mode.denorm16_64)
3932 switch (instr->opcode) {
3933 case aco_opcode::v_add_f32:
3934 case aco_opcode::v_sub_f32:
3935 case aco_opcode::v_subrev_f32:
3936 case aco_opcode::v_mul_f32:
3937 case aco_opcode::v_fma_f32: break;
3938 case aco_opcode::v_fma_mix_f32:
3939 case aco_opcode::v_fma_mixlo_f16: return true;
3940 default: return false;
// Tests for a precise v_fma_f32 on a device without fused mad_mix.
3943 if (instr->opcode == aco_opcode::v_fma_f32 && !ctx.program->dev.fused_mad_mix &&
3944 instr->definitions[0].isPrecise())
3947 return !instr->valu().omod && !instr->isSDWA() && !instr->isDPP();
// Replaces instr by an equivalent v_fma_mix_f32. Adds and subtracts become
// 1.0 * a + b (operands shifted up by one slot), a multiply becomes a * b
// plus a negated zero. The neg modifiers are carried in neg_lo and the abs
// modifiers in neg_hi of the new instruction.
3951 to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
// is_add is true for every opcode other than v_mul_f32 and v_fma_f32; it also
// serves as the slot offset (0 or 1) for the copied operands.
3953 bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
3955 aco_ptr<VALU_instruction> vop3p{
3956 create_instruction<VALU_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};
3958 for (unsigned i = 0; i < instr->operands.size(); i++) {
3959 vop3p->operands[is_add + i] = instr->operands[i];
3960 vop3p->neg_lo[is_add + i] = instr->valu().neg[i];
3961 vop3p->neg_hi[is_add + i] = instr->valu().abs[i];
// Multiply: the missing addend is a zero with the neg bit set.
3963 if (instr->opcode == aco_opcode::v_mul_f32) {
3964 vop3p->operands[2] = Operand::zero();
3965 vop3p->neg_lo[2] = true;
3966 } else if (is_add) {
// Add/sub: the first factor is 1.0f (0x3f800000); sub negates the second original
// operand, subrev the first.
3967 vop3p->operands[0] = Operand::c32(0x3f800000);
3968 if (instr->opcode == aco_opcode::v_sub_f32)
3969 vop3p->neg_lo[2] ^= true;
3970 else if (instr->opcode == aco_opcode::v_subrev_f32)
3971 vop3p->neg_lo[1] ^= true;
3973 vop3p->definitions[0] = instr->definitions[0];
3974 vop3p->clamp = instr->valu().clamp;
3975 vop3p->pass_flags = instr->pass_flags;
3976 instr = std::move(vop3p);
// Keep only the f2f16, clamp and mul labels; a mul label must point at the new instruction.
3978 ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
3979 if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
3980 ctx.info[instr->definitions[0].tempId()].instr = instr.get();
// Folds a following conversion (the instruction recorded for an f2f16-labelled
// definition) into `instr` by turning `instr` into v_fma_mixlo_f16.
// NOTE(review): this listing omits the return type, braces and the statements
// guarded by the bare `if` lines (presumably `return false;`), as well as the
// final return statement. The function's result is used as a bool by the caller.
3984 combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3986 ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3987 if (!def_info.is_f2f16())
3989 Instruction* conv = def_info.instr;
// Preconditions checked below: instr must qualify for mad_mix and its result must
// have exactly one use; the conversion's result use count and the conversion's
// modifiers are also inspected.
3991 if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1)
3994 if (!ctx.uses[conv->definitions[0].tempId()])
3997 if (conv->usesModifiers())
4000 if (!instr->isVOP3P())
4001 to_mad_mix(ctx, instr);
// Switch to the f16-result opcode and exchange the definition temps of instr and
// conv (presumably swapTemp swaps them in both directions -- confirm).
4003 instr->opcode = aco_opcode::v_fma_mixlo_f16;
4004 instr->definitions[0].swapTemp(conv->definitions[0]);
4005 if (conv->definitions[0].isPrecise())
4006 instr->definitions[0].setPrecise(true);
// Only label_clamp is kept on the new definition; the use count of the temp now
// held by conv's definition is decremented.
4007 ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
4008 ctx.uses[conv->definitions[0].tempId()]--;
// For each temp operand of `instr` that is labelled f2f32, tries to read the
// conversion's source directly by using the mix form of the instruction.
// NOTE(review): this listing omits the return type, braces, the statements guarded
// by the bare `if` / `else if` lines (presumably `return` / `continue`), and a few
// short declarations. Confirm details against the full file.
4014 combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4016 if (!can_use_mad_mix(ctx, instr))
4019 for (unsigned i = 0; i < instr->operands.size(); i++) {
4020 if (!instr->operands[i].isTemp())
4022 Temp tmp = instr->operands[i].getTemp();
4023 if (!ctx.info[tmp.id()].is_f2f32())
// conv is the instruction recorded for the f2f32 label. Conversions with clamp or
// omod, SDWA conversions whose dst_sel size is not 4 or whose source sel size is
// not 2, and DPP conversions are singled out by the conditions below.
4026 Instruction* conv = ctx.info[tmp.id()].instr;
4027 if (conv->valu().clamp || conv->valu().omod) {
4029 } else if (conv->isSDWA() &&
4030 (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2)) {
4032 } else if (conv->isDPP()) {
4036 if (get_operand_size(instr, i) != 32)
4039 /* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
4040 * check_vop3_operands(). */
// Build a candidate operand list in which slot i is replaced by the conversion's
// source, and validate it. The declaration of `op` is not visible in this listing.
4042 for (unsigned j = 0; j < instr->operands.size(); j++)
4043 op[j] = instr->operands[j];
4044 op[i] = conv->operands[0];
4045 if (!check_vop3_operands(ctx, instr->operands.size(), op))
// NOTE(review): the line that receives the expression below is not visible; it
// presumably declares an is_add flag used to adjust `i` after the conversion.
4048 if (!instr->isVOP3P()) {
4050 instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
4051 to_mad_mix(ctx, instr);
// Use-count bookkeeping: the converted temp loses one use; if it still has uses,
// the conversion's source gains one.
4055 if (--ctx.uses[tmp.id()])
4056 ctx.uses[conv->operands[0].tempId()]++;
4057 instr->operands[i].setTemp(conv->operands[0].getTemp());
4058 if (conv->definitions[0].isPrecise())
4059 instr->definitions[0].setPrecise(true);
// opsel_hi is set for the operand. opsel_lo is set when an SDWA conversion selects
// offset 2; the assignment from conv's opsel[0] is presumably the else branch
// (the `else` line is not visible here).
4060 instr->valu().opsel_hi[i] = true;
4061 if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2)
4062 instr->valu().opsel_lo[i] = true;
4064 instr->valu().opsel_lo[i] = conv->valu().opsel[0];
// Merge the conversion's neg/abs into the operand, but only when the operand does
// not already carry abs.
4065 bool neg = conv->valu().neg[0];
4066 bool abs = conv->valu().abs[0];
4067 if (!instr->valu().abs[i]) {
4068 instr->valu().neg[i] ^= neg;
4069 instr->valu().abs[i] = abs;
4074 // TODO: we could possibly move the whole label_instruction pass to combine_instruction:
4075 // this would mean that we'd have to fix the instruction uses while value propagation
// Tests whether `op` is a floating-point constant whose fraction bits are all zero
// and whose biased exponent is at least the bias (127 / 15 / 1023 for 4 / 2 / 8
// byte operands). The sign bit is not part of either mask, so it is ignored.
// NOTE(review): the return type, braces and the statement guarded by
// `else if (!op.isConstant())` (presumably `return false;`) are not visible here.
4077 /* also returns true for inf */
4079 is_pow_of_two(opt_ctx& ctx, Operand op)
// A temp that is labelled as constant or literal is resolved to its constant
// operand and tested recursively.
4081 if (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(op.bytes() * 8))
4082 return is_pow_of_two(ctx, get_constant_op(ctx, ctx.info[op.tempId()], op.bytes() * 8));
4083 else if (!op.isConstant())
4086 uint64_t val = op.constantValue64();
// 32-bit layout: 8 exponent bits starting at bit 23, 23 fraction bits.
4088 if (op.bytes() == 4) {
4089 uint32_t exponent = (val & 0x7f800000) >> 23;
4090 uint32_t fraction = val & 0x007fffff;
4091 return (exponent >= 127) && (fraction == 0);
// 16-bit layout: 5 exponent bits starting at bit 10, 10 fraction bits.
4092 } else if (op.bytes() == 2) {
4093 uint32_t exponent = (val & 0x7c00) >> 10;
4094 uint32_t fraction = val & 0x03ff;
4095 return (exponent >= 15) && (fraction == 0);
// 64-bit layout: 11 exponent bits starting at bit 52, 52 fraction bits.
4097 assert(op.bytes() == 8);
4098 uint64_t exponent = (val & UINT64_C(0x7ff0000000000000)) >> 52;
4099 uint64_t fraction = val & UINT64_C(0x000fffffffffffff);
4100 return (exponent >= 1023) && (fraction == 0);
// Per-instruction combining step. The visible code applies extract/SDWA operands,
// SGPR propagation, mad_mix folding, omod/clamp and output conversions to VALU
// instructions, then handles neg/abs of a multiply, mul+add fusion, and a long
// chain of opcode-specific combines.
// NOTE(review): this listing omits short lines (return type, braces, `return`,
// `continue`, `else`, some declarations and some comment terminators). Statements
// guarded by bare `if` lines below are therefore not visible; confirm control flow
// against the full file before relying on these notes.
4105 combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4107 if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
4110 if (instr->isVALU()) {
4111 /* Apply SDWA. Do this after label_instruction() so it can remove
4112 * label_extract if not all instructions can take SDWA. */
4113 for (unsigned i = 0; i < instr->operands.size(); i++) {
4114 Operand& op = instr->operands[i];
4117 ssa_info& info = ctx.info[op.tempId()];
4118 if (!info.is_extract())
4120 /* if there are that many uses, there are likely better combinations */
4121 // TODO: delay applying extract to a point where we know better
4122 if (ctx.uses[op.tempId()] > 4) {
4123 info.label &= ~label_extract;
// The extract is applied only when its source is a VGPR or the current operand is
// an SGPR, and can_apply_extract() agrees.
4126 if (info.is_extract() &&
4127 (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
4128 instr->operands[i].getTemp().type() == RegType::sgpr) &&
4129 can_apply_extract(ctx, instr, i, info)) {
4130 /* Increase use count of the extract's operand if the extract still has uses. */
4131 apply_extract(ctx, instr, i, info);
4132 if (--ctx.uses[instr->operands[i].tempId()])
4133 ctx.uses[info.instr->operands[0].tempId()]++;
4134 instr->operands[i].setTemp(info.instr->operands[0].getTemp());
// Remaining VALU-only steps: SGPR application, mad_mix folding, then omod/clamp
// and output-conversion folding repeated until neither reports a change.
4138 if (can_apply_sgprs(ctx, instr))
4139 apply_sgprs(ctx, instr);
4140 combine_mad_mix(ctx, instr);
4141 while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
4143 apply_insert(ctx, instr);
// VOP3P instructions other than the two mix opcodes are delegated to combine_vop3p().
4146 if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 &&
4147 instr->opcode != aco_opcode::v_fma_mixlo_f16)
4148 return combine_vop3p(ctx, instr);
4150 if (instr->isSDWA() || instr->isDPP())
// p_extract: fold an extract feeding this extract, with the same use-count
// bookkeeping as in the VALU loop above, then try apply_ds_extract().
4153 if (instr->opcode == aco_opcode::p_extract) {
4154 ssa_info& info = ctx.info[instr->operands[0].tempId()];
4155 if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
4156 apply_extract(ctx, instr, 0, info);
4157 if (--ctx.uses[instr->operands[0].tempId()])
4158 ctx.uses[info.instr->operands[0].tempId()]++;
4159 instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4162 apply_ds_extract(ctx, instr);
4165 if (instr->isVOPC()) {
4166 if (optimize_cmp_subgroup_invocation(ctx, instr))
4170 /* TODO: There are still some peephole optimizations that could be done:
4171 * - abs(a - b) -> s_absdiff_i32
4172 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
4173 * - patterns for v_alignbit_b32 and v_alignbyte_b32
4174 * These aren't probably too interesting though.
4175 * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
4176 * probably more useful than the previously mentioned optimizations.
4177 * The various comparison optimizations also currently only work with 32-bit
4180 /* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */
4181 if ((ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)) &&
4182 ctx.uses[instr->operands[1].tempId()] == 1) {
4183 Temp val = ctx.info[instr->definitions[0].tempId()].temp;
4185 if (!ctx.info[val.id()].is_mul())
4188 Instruction* mul_instr = ctx.info[val.id()].instr;
4190 if (mul_instr->operands[0].isLiteral())
4192 if (mul_instr->valu().clamp)
4194 if (mul_instr->isSDWA() || mul_instr->isDPP())
4196 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 &&
4197 ctx.fp_mode.preserve_signed_zero_inf_nan32)
4199 if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes())
4202 /* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */
// The replacement multiply copies the operands and modifiers of mul_instr, keeps
// the definition and pass_flags of the neg/abs instruction, is created in VOP3
// form when the multiply was VOP2, and is labelled as a mul at the end.
4203 ctx.uses[mul_instr->definitions[0].tempId()]--;
4204 Definition def = instr->definitions[0];
4205 bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg();
4206 bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
4207 uint32_t pass_flags = instr->pass_flags;
4208 Format format = mul_instr->format == Format::VOP2 ? asVOP3(Format::VOP2) : mul_instr->format;
4209 instr.reset(create_instruction<VALU_instruction>(mul_instr->opcode, format,
4210 mul_instr->operands.size(), 1));
4211 std::copy(mul_instr->operands.cbegin(), mul_instr->operands.cend(), instr->operands.begin());
4212 instr->pass_flags = pass_flags;
4213 instr->definitions[0] = def;
4214 VALU_instruction& new_mul = instr->valu();
4215 VALU_instruction& mul = mul_instr->valu();
4216 new_mul.neg = mul.neg;
4217 new_mul.abs = mul.abs;
4218 new_mul.omod = mul.omod;
4219 new_mul.opsel = mul.opsel;
4220 new_mul.opsel_lo = mul.opsel_lo;
4221 new_mul.opsel_hi = mul.opsel_hi;
// NOTE(review): the next two assignments are presumably guarded by is_abs (the
// guarding line is not visible in this listing).
4223 new_mul.neg[0] = new_mul.neg[1] = false;
4224 new_mul.abs[0] = new_mul.abs[1] = true;
4226 new_mul.neg[0] ^= is_neg;
4227 new_mul.clamp = false;
4229 ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
// NOTE(review): the declaration receiving the expression below is not visible; the
// value is used afterwards under the name is_add_mix. It holds for a mix-opcode
// instruction whose first source is an un-negated 1.0 constant (0x3f800000 read as
// f32, or 0x3C00 read as the low f16 half).
4233 /* combine mul+add -> mad */
4235 (instr->opcode == aco_opcode::v_fma_mix_f32 ||
4236 instr->opcode == aco_opcode::v_fma_mixlo_f16) &&
4237 !instr->valu().neg_lo[0] &&
4238 ((instr->operands[0].constantEquals(0x3f800000) && !instr->valu().opsel_hi[0]) ||
4239 (instr->operands[0].constantEquals(0x3C00) && instr->valu().opsel_hi[0] &&
4240 !instr->valu().opsel_lo[0]));
4241 bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
4242 instr->opcode == aco_opcode::v_subrev_f32;
4243 bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
4244 instr->opcode == aco_opcode::v_subrev_f16;
4245 bool mad64 = instr->opcode == aco_opcode::v_add_f64;
4246 if (is_add_mix || mad16 || mad32 || mad64) {
4247 Instruction* mul_instr = nullptr;
4248 unsigned add_op_idx = 0;
4249 uint32_t uses = UINT32_MAX;
4250 bool emit_fma = false;
4251 /* find the 'best' mul instruction to combine with the add */
4252 for (unsigned i = is_add_mix ? 1 : 0; i < instr->operands.size(); i++) {
4253 if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
4255 ssa_info& info = ctx.info[instr->operands[i].tempId()];
4257 /* no clamp/omod allowed between mul and add */
4258 if (info.instr->isVOP3() && (info.instr->valu().clamp || info.instr->valu().omod))
4260 if (info.instr->isVOP3P() && info.instr->valu().clamp)
4262 /* v_fma_mix_f32/etc can't do omod */
4263 if (info.instr->isVOP3P() && instr->isVOP3() && instr->valu().omod)
4265 /* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
4266 if (is_add_mix && info.instr->definitions[0].bytes() == 2)
4269 if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
4272 bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32;
4273 bool mad_mix = is_add_mix || info.instr->isVOP3P();
4275 /* Multiplication by power-of-two should never need rounding. 1/power-of-two also works,
4276 * but using fma removes denormal flushing (0xfffffe * 0.5 + 0x810001a2).
4278 bool is_fma_precise = is_pow_of_two(ctx, info.instr->operands[0]) ||
4279 is_pow_of_two(ctx, info.instr->operands[1]);
4281 bool has_fma = mad16 || mad64 || (legacy && ctx.program->gfx_level >= GFX10_3) ||
4282 (mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) ||
4283 (mad_mix && ctx.program->dev.fused_mad_mix);
4284 bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix
4285 : ((mad32 && ctx.program->gfx_level < GFX10_3) ||
4286 (mad16 && ctx.program->gfx_level <= GFX9));
4289 (!(info.instr->definitions[0].isPrecise() || instr->definitions[0].isPrecise()) ||
4292 has_mad && (mad_mix || mad32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64) == 0;
4293 if (mad_mix && legacy)
4295 if (!can_use_fma && !can_use_mad)
4298 unsigned candidate_add_op_idx = is_add_mix ? (3 - i) : (1 - i);
4299 Operand op[3] = {info.instr->operands[0], info.instr->operands[1],
4300 instr->operands[candidate_add_op_idx]};
4301 if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
4302 ctx.uses[instr->operands[i].tempId()] > uses)
4305 if (ctx.uses[instr->operands[i].tempId()] == uses) {
4306 unsigned cur_idx = mul_instr->definitions[0].tempId();
4307 unsigned new_idx = info.instr->definitions[0].tempId();
4308 if (cur_idx > new_idx)
4312 mul_instr = info.instr;
4313 add_op_idx = candidate_add_op_idx;
4314 uses = ctx.uses[instr->operands[i].tempId()];
4315 emit_fma = !can_use_mad;
4319 /* turn mul+add into v_mad/v_fma */
// Sources of the fused instruction: the multiply's two operands followed by the
// add's remaining operand. The multiply's result loses one use; when it is still
// used elsewhere the multiply's operands gain uses (guards not fully visible).
4320 Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
4321 instr->operands[add_op_idx]};
4322 ctx.uses[mul_instr->definitions[0].tempId()]--;
4323 if (ctx.uses[mul_instr->definitions[0].tempId()]) {
4325 ctx.uses[op[0].tempId()]++;
4327 ctx.uses[op[1].tempId()]++;
// Collect modifiers: slots 0 and 1 come from the multiply, slot 2 from the add's
// remaining operand.
4330 bool neg[3] = {false, false, false};
4331 bool abs[3] = {false, false, false};
4334 bitarray8 opsel_lo = 0;
4335 bitarray8 opsel_hi = 0;
4336 bitarray8 opsel = 0;
4337 unsigned mul_op_idx = (instr->isVOP3P() ? 3 : 1) - add_op_idx;
4339 VALU_instruction& valu_mul = mul_instr->valu();
4340 neg[0] = valu_mul.neg[0];
4341 neg[1] = valu_mul.neg[1];
4342 abs[0] = valu_mul.abs[0];
4343 abs[1] = valu_mul.abs[1];
4344 opsel_lo = valu_mul.opsel_lo & 0x3;
4345 opsel_hi = valu_mul.opsel_hi & 0x3;
4346 opsel = valu_mul.opsel & 0x3;
4348 VALU_instruction& valu = instr->valu();
4349 neg[2] = valu.neg[add_op_idx];
4350 abs[2] = valu.abs[add_op_idx];
4351 opsel_lo[2] = valu.opsel_lo[add_op_idx];
4352 opsel_hi[2] = valu.opsel_hi[add_op_idx];
4353 opsel[2] = valu.opsel[add_op_idx];
4354 opsel[3] = valu.opsel[3];
4357 /* abs of the multiplication result */
4358 if (valu.abs[mul_op_idx]) {
4364 /* neg of the multiplication result */
4365 neg[1] ^= valu.neg[mul_op_idx];
// Subtractions are expressed by toggling neg on the appropriate fused source.
4367 if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
4368 neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
4369 else if (instr->opcode == aco_opcode::v_subrev_f32 ||
4370 instr->opcode == aco_opcode::v_subrev_f16)
4371 neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
// The old add is kept alive in add_instr; the fused instruction is built in `mad`.
// A VOP3P add or multiply yields a mix opcode chosen by the result size, otherwise
// a VOP3 mad/fma opcode is selected (the branch lines choosing between the 32-bit,
// legacy, 16-bit and 64-bit variants are only partly visible).
4373 aco_ptr<Instruction> add_instr = std::move(instr);
4374 aco_ptr<VALU_instruction> mad;
4375 if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
4379 aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? aco_opcode::v_fma_mixlo_f16
4380 : aco_opcode::v_fma_mix_f32;
4381 mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3P, 3, 1));
4386 aco_opcode mad_op = emit_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
4387 if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) {
4388 assert(emit_fma == (ctx.program->gfx_level >= GFX10_3));
4389 mad_op = emit_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32;
4391 mad_op = emit_fma ? (ctx.program->gfx_level == GFX8 ? aco_opcode::v_fma_legacy_f16
4392 : aco_opcode::v_fma_f16)
4393 : (ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_f16
4394 : aco_opcode::v_mad_f16);
4396 mad_op = aco_opcode::v_fma_f64;
4399 mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1));
4402 for (unsigned i = 0; i < 3; i++) {
4403 mad->operands[i] = op[i];
4404 mad->neg[i] = neg[i];
4405 mad->abs[i] = abs[i];
4409 mad->opsel_lo = opsel_lo;
4410 mad->opsel_hi = opsel_hi;
// The fused result is precise when either the add or the multiply was precise.
4412 mad->definitions[0] = add_instr->definitions[0];
4413 mad->definitions[0].setPrecise(add_instr->definitions[0].isPrecise() ||
4414 mul_instr->definitions[0].isPrecise());
4415 mad->pass_flags = add_instr->pass_flags;
4417 instr = std::move(mad);
4419 /* mark this ssa_def to be re-checked for profitability and literals */
4420 ctx.mad_infos.emplace_back(std::move(add_instr), mul_instr->definitions[0].tempId());
4421 ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
4425 /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
4426 else if (((instr->opcode == aco_opcode::v_mul_f32 &&
4427 !ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
4428 instr->opcode == aco_opcode::v_mul_legacy_f32) &&
4429 !instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) {
4430 for (unsigned i = 0; i < 2; i++) {
4431 if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
4432 ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
4433 instr->operands[!i].getTemp().type() == RegType::vgpr) {
4434 ctx.uses[instr->operands[i].tempId()]--;
4435 ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
4437 aco_ptr<VALU_instruction> new_instr{
4438 create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
4439 new_instr->operands[0] = Operand::zero();
4440 new_instr->operands[1] = instr->operands[!i];
4441 new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
4442 new_instr->definitions[0] = instr->definitions[0];
4443 new_instr->pass_flags = instr->pass_flags;
4444 instr = std::move(new_instr);
4445 ctx.info[instr->definitions[0].tempId()].label = 0;
4449 } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->gfx_level >= GFX9) {
4450 if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
4452 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
4454 } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4456 } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) {
4457 if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
4459 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
4461 } else if (combine_xor_not(ctx, instr)) {
4463 } else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
4464 combine_not_xor(ctx, instr);
4465 } else if (instr->opcode == aco_opcode::v_add_u16) {
4466 combine_three_valu_op(
4467 ctx, instr, aco_opcode::v_mul_lo_u16,
4468 ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16,
4470 } else if (instr->opcode == aco_opcode::v_add_u16_e64) {
4471 combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120",
4473 } else if (instr->opcode == aco_opcode::v_add_u32) {
4474 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4475 } else if (combine_add_bcnt(ctx, instr)) {
4476 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4477 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4478 } else if (ctx.program->gfx_level >= GFX9 && !instr->usesModifiers()) {
4479 if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
4481 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
4483 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
4485 } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
4487 } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
4489 } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4492 } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
4493 instr->opcode == aco_opcode::v_add_co_u32_e64) {
// carry_out records whether the second definition has any uses; several combines
// below are only attempted when it does not.
4494 bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
4495 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4496 } else if (!carry_out && combine_add_bcnt(ctx, instr)) {
4497 } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4498 aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4499 } else if (!carry_out && combine_add_lshl(ctx, instr, false)) {
4501 } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
4502 instr->opcode == aco_opcode::v_sub_co_u32_e64) {
4504 instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0;
4505 if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) {
4506 } else if (!carry_out && combine_add_lshl(ctx, instr, true)) {
4508 } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
4509 instr->opcode == aco_opcode::v_subrev_co_u32 ||
4510 instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
4511 combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
4512 } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) {
4513 combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
4515 } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
4516 ctx.program->gfx_level >= GFX9) {
4517 combine_salu_lshl_add(ctx, instr);
4518 } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
4519 if (!combine_salu_not_bitwise(ctx, instr))
4520 combine_inverse_comparison(ctx, instr);
4521 } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
4522 instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
4523 if (combine_ordering_test(ctx, instr)) {
4524 } else if (combine_comparison_ordering(ctx, instr)) {
4525 } else if (combine_constant_comparison_ordering(ctx, instr)) {
4526 } else if (combine_salu_n2(ctx, instr)) {
4528 } else if (instr->opcode == aco_opcode::s_abs_i32) {
4529 combine_sabsdiff(ctx, instr);
4530 } else if (instr->opcode == aco_opcode::s_cmp_lg_i32 ||
4531 instr->opcode == aco_opcode::s_cmp_lg_u32 ||
4532 instr->opcode == aco_opcode::s_cmp_lg_u64 ||
4533 instr->opcode == aco_opcode::s_cmp_eq_i32 ||
4534 instr->opcode == aco_opcode::s_cmp_eq_u32 ||
4535 instr->opcode == aco_opcode::s_cmp_eq_u64) {
4536 combine_s_bitcmp(ctx, instr);
4537 } else if (instr->opcode == aco_opcode::v_and_b32) {
4538 combine_and_subbrev(ctx, instr);
4539 } else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) {
4540 /* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32.
4541 * since ctx.uses[mad_info::mul_temp_id] is always 0, we don't have to worry about
4542 * select_instruction() using mad_info::add_instr.
4544 ctx.mad_infos.emplace_back(nullptr, 0);
4545 ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
4546 } else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
// When detect_clamp() reports a clamp pattern, the med3 is rewritten in place as a
// clamped two-operand add of operand `idx` and zero; the abs/neg bit of that
// operand moves to slot 0 and the third operand is dropped. The declaration of
// `idx` and the line receiving the opcode expression are not visible here.
4548 if (detect_clamp(instr.get(), &idx)) {
4549 instr->format = asVOP3(Format::VOP2);
4550 instr->operands[0] = instr->operands[idx];
4551 instr->operands[1] = Operand::zero();
4553 instr->opcode == aco_opcode::v_med3_f32 ? aco_opcode::v_add_f32 : aco_opcode::v_add_f16;
4554 instr->valu().clamp = true;
4555 instr->valu().abs = (uint8_t)instr->valu().abs[idx];
4556 instr->valu().neg = (uint8_t)instr->valu().neg[idx];
4557 instr->operands.pop_back();
// Fallback for min/max opcodes: try combine_minmax() with the opposite operation
// and the matching three-operand opcode, and combine_clamp() with min/max/med3.
4560 aco_opcode min, max, min3, max3, med3, minmax;
4561 bool some_gfx9_only;
4562 if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
4564 (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
4565 if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
4566 instr->opcode == min ? min3 : max3, minmax)) {
4568 combine_clamp(ctx, instr, min, max, med3);
// Rewrites a bitwise and/or/xor SALU instruction so that it works on s1 operands:
// the opcode is replaced, every operand is redirected to an s1 temp, and the first
// definition is retyped as s1. The caller stores the result in a bool.
// NOTE(review): this listing omits the return type, braces, `return` statements,
// the `default:` label of the switch, an `else` line and the guards inside the
// loops; confirm control flow against the full file.
4575 to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4577 /* Check every operand to make sure they are suitable. */
4578 for (Operand& op : instr->operands) {
4581 if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
// Opcode mapping: both widths of and/or map to the 32-bit opcode, and both widths
// of xor map to s_absdiff_i32.
4585 switch (instr->opcode) {
4586 case aco_opcode::s_and_b32:
4587 case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
4588 case aco_opcode::s_or_b32:
4589 case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
4590 case aco_opcode::s_xor_b32:
4591 case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
4593 /* Don't transform other instructions. They are very unlikely to appear here. */
// Operand rewrite: the old temp loses a use, the operand is redirected, and the
// new temp gains a use.
4597 for (Operand& op : instr->operands) {
4598 ctx.uses[op.tempId()]--;
4600 if (ctx.info[op.tempId()].is_uniform_bool()) {
4601 /* Just use the uniform boolean temp. */
4602 op.setTemp(ctx.info[op.tempId()].temp);
4603 } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
4604 /* Use the SCC definition of the predecessor instruction.
4605 * This allows the predecessor to get picked up by the same optimization (if it has no
4606 * divergent users), and it also makes sure that the current instruction will keep working
4607 * even if the predecessor won't be transformed.
4609 Instruction* pred_instr = ctx.info[op.tempId()].instr;
4610 assert(pred_instr->definitions.size() >= 2);
4611 assert(pred_instr->definitions[1].isFixed() &&
4612 pred_instr->definitions[1].physReg() == scc);
4613 op.setTemp(pred_instr->definitions[1].getTemp());
4615 unreachable("Invalid operand on uniform bitwise instruction.");
4618 ctx.uses[op.tempId()]++;
// The first definition keeps its temp id but is retyped as s1; both operands are
// asserted to be s1 afterwards.
4621 instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
4622 assert(instr->operands[0].regClass() == s1);
4623 assert(instr->operands[1].regClass() == s1);
4628 select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4630 const uint32_t threshold = 4;
4632 if (is_dead(ctx.uses, instr.get())) {
4637 /* convert split_vector into a copy or extract_vector if only one definition is ever used */
4638 if (instr->opcode == aco_opcode::p_split_vector) {
4639 unsigned num_used = 0;
4641 unsigned split_offset = 0;
4642 for (unsigned i = 0, offset = 0; i < instr->definitions.size();
4643 offset += instr->definitions[i++].bytes()) {
4644 if (ctx.uses[instr->definitions[i].tempId()]) {
4647 split_offset = offset;
4651 if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
4652 ctx.uses[instr->operands[0].tempId()] == 1) {
4653 Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
4657 for (Operand& vec_op : vec->operands) {
4658 if (off == split_offset) {
4662 off += vec_op.bytes();
4664 if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
4665 ctx.uses[instr->operands[0].tempId()]--;
4666 for (Operand& vec_op : vec->operands) {
4667 if (vec_op.isTemp())
4668 ctx.uses[vec_op.tempId()]--;
4671 ctx.uses[op.tempId()]++;
4673 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4674 aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
4675 extract->operands[0] = op;
4676 extract->definitions[0] = instr->definitions[idx];
4677 instr = std::move(extract);
4683 if (!done && num_used == 1 &&
4684 instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
4685 split_offset % instr->definitions[idx].bytes() == 0) {
4686 aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4687 aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
4688 extract->operands[0] = instr->operands[0];
4689 extract->operands[1] =
4690 Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
4691 extract->definitions[0] = instr->definitions[idx];
4692 instr = std::move(extract);
4696 mad_info* mad_info = NULL;
4697 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
4698 mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
4699 /* re-check mad instructions */
4700 if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
4701 ctx.uses[mad_info->mul_temp_id]++;
4702 if (instr->operands[0].isTemp())
4703 ctx.uses[instr->operands[0].tempId()]--;
4704 if (instr->operands[1].isTemp())
4705 ctx.uses[instr->operands[1].tempId()]--;
4706 instr.swap(mad_info->add_instr);
4709 /* check literals */
4710 else if (!instr->isDPP() && !instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_f64 &&
4711 instr->opcode != aco_opcode::v_mad_legacy_f32 &&
4712 instr->opcode != aco_opcode::v_fma_legacy_f32) {
4713 /* FMA can only take literals on GFX10+ */
4714 if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
4715 ctx.program->gfx_level < GFX10)
4717 /* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
4718 * literals (GFX10+), these instructions don't exist.
4720 if (instr->opcode == aco_opcode::v_fma_legacy_f16)
4723 uint32_t literal_mask = 0;
4724 uint32_t fp16_mask = 0;
4725 uint32_t sgpr_mask = 0;
4726 uint32_t vgpr_mask = 0;
4727 uint32_t literal_uses = UINT32_MAX;
4728 uint32_t literal_value = 0;
4730 /* Iterate in reverse to prefer v_madak/v_fmaak. */
4731 for (int i = 2; i >= 0; i--) {
4732 Operand& op = instr->operands[i];
4735 if (ctx.info[op.tempId()].is_literal(get_operand_size(instr, i))) {
4736 uint32_t new_literal = ctx.info[op.tempId()].val;
4737 float value = uif(new_literal);
4738 uint16_t fp16_val = _mesa_float_to_half(value);
4739 bool is_denorm = (fp16_val & 0x7fff) != 0 && (fp16_val & 0x7fff) <= 0x3ff;
4740 if (_mesa_half_to_float(fp16_val) == value &&
4741 (!is_denorm || (ctx.fp_mode.denorm16_64 & fp_denorm_keep_in)))
4742 fp16_mask |= 1 << i;
4744 if (!literal_mask || literal_value == new_literal) {
4745 literal_value = new_literal;
4746 literal_uses = MIN2(literal_uses, ctx.uses[op.tempId()]);
4747 literal_mask |= 1 << i;
4751 sgpr_mask |= op.isOfType(RegType::sgpr) << i;
4752 vgpr_mask |= op.isOfType(RegType::vgpr) << i;
4755 /* The constant bus limitations before GFX10 disallows SGPRs. */
4756 if (sgpr_mask && ctx.program->gfx_level < GFX10)
4759 /* Encoding needs a vgpr. */
4763 /* v_madmk/v_fmamk needs a vgpr in the third source. */
4764 if (!(literal_mask & 0b100) && !(vgpr_mask & 0b100))
4767 /* opsel with GFX11+ is the only modifier supported by fmamk/fmaak*/
4768 if (instr->valu().abs || instr->valu().neg || instr->valu().omod || instr->valu().clamp ||
4769 (instr->valu().opsel && ctx.program->gfx_level < GFX11))
4772 if (instr->valu().opsel & ~vgpr_mask)
4775 /* We can't use three unique fp16 literals */
4776 if (fp16_mask == 0b111)
4779 if ((instr->opcode == aco_opcode::v_fma_f32 ||
4780 (instr->opcode == aco_opcode::v_mad_f32 && !instr->definitions[0].isPrecise())) &&
4781 !instr->valu().omod && ctx.program->gfx_level >= GFX10 &&
4782 util_bitcount(fp16_mask) > std::max<uint32_t>(util_bitcount(literal_mask), 1)) {
4783 assert(ctx.program->dev.fused_mad_mix);
4784 u_foreach_bit (i, fp16_mask)
4785 ctx.uses[instr->operands[i].tempId()]--;
4786 mad_info->fp16_mask = fp16_mask;
4790 /* Limit the number of literals to apply to not increase the code
4791 * size too much, but always apply literals for v_mad->v_madak
4792 * because both instructions are 64-bit and this doesn't increase
4794 * TODO: try to apply the literals earlier to lower the number of
4795 * uses below threshold
4797 if (literal_mask && (literal_uses < threshold || (literal_mask & 0b100))) {
4798 u_foreach_bit (i, literal_mask)
4799 ctx.uses[instr->operands[i].tempId()]--;
4800 mad_info->literal_mask = literal_mask;
4806 /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
4807 * when it isn't beneficial */
4808 if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
4809 instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
4810 ctx.info[instr->operands[0].tempId()].set_scc_needed();
4812 } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
4813 instr->opcode == aco_opcode::s_cselect_b32) &&
4814 instr->operands[2].isTemp()) {
4815 ctx.info[instr->operands[2].tempId()].set_scc_needed();
4816 } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
4817 ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
4818 /* Propagate label so it is correctly detected by the uniform bool transform */
4819 ctx.info[instr->operands[0].tempId()].set_scc_needed();
4821 /* Fix definition to SCC, this will prevent RA from adding superfluous moves */
4822 instr->definitions[0].setFixed(scc);
4825 /* check for literals */
4826 if (!instr->isSALU() && !instr->isVALU())
4829 /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
4830 if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
4831 ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
4832 bool transform_done = to_uniform_bool_instr(ctx, instr);
4834 if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
4835 /* Swap the two definition IDs in order to avoid overusing the SCC.
4836 * This reduces extra moves generated by RA. */
4837 uint32_t def0_id = instr->definitions[0].getTemp().id();
4838 uint32_t def1_id = instr->definitions[1].getTemp().id();
4839 instr->definitions[0].setTemp(Temp(def1_id, s1));
4840 instr->definitions[1].setTemp(Temp(def0_id, s1));
4846 /* This optimization is done late in order to be able to apply otherwise
4847 * unsafe optimizations such as the inverse comparison optimization.
4849 if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
4850 if (instr->operands[0].isTemp() && fixed_to_exec(instr->operands[1]) &&
4851 ctx.uses[instr->operands[0].tempId()] == 1 &&
4852 ctx.uses[instr->definitions[1].tempId()] == 0 &&
4853 can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) {
4854 ctx.uses[instr->operands[0].tempId()]--;
4855 ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp(
4856 instr->definitions[0].getTemp());
4862 /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
4863 if (instr->isVALU() && !instr->isDPP()) {
4864 for (unsigned i = 0; i < instr->operands.size(); i++) {
4865 if (!instr->operands[i].isTemp())
4867 ssa_info info = ctx.info[instr->operands[i].tempId()];
4869 if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags)
4872 /* We won't eliminate the DPP mov if the operand is used twice */
4873 bool op_used_twice = false;
4874 for (unsigned j = 0; j < instr->operands.size(); j++)
4875 op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
4880 if (!can_swap_operands(instr, &instr->opcode, 0, i))
4882 instr->valu().swapOperands(0, i);
4885 if (!can_use_DPP(ctx.program->gfx_level, instr, info.is_dpp8()))
4888 bool dpp8 = info.is_dpp8();
4889 bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, 0) &&
4890 get_operand_size(instr, 0) == 32;
4891 bool mov_uses_mods = info.instr->valu().neg[0] || info.instr->valu().abs[0];
4892 if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
4895 convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
4898 DPP8_instruction* dpp = &instr->dpp8();
4899 for (unsigned j = 0; j < 8; ++j)
4900 dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
4902 instr->format = asVOP3(instr->format);
4904 DPP16_instruction* dpp = &instr->dpp16();
4905 dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
4906 dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
4909 instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
4910 instr->valu().abs[0] |= info.instr->valu().abs[0];
4912 if (--ctx.uses[info.instr->definitions[0].tempId()])
4913 ctx.uses[info.instr->operands[0].tempId()]++;
4914 instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4919 /* Use v_fma_mix for f2f32/f2f16 if it has higher throughput.
4920 * Do this late to not disturb other optimizations.
4922 if ((instr->opcode == aco_opcode::v_cvt_f32_f16 || instr->opcode == aco_opcode::v_cvt_f16_f32) &&
4923 ctx.program->gfx_level >= GFX11 && ctx.program->wave_size == 64 && !instr->valu().omod &&
4925 bool is_f2f16 = instr->opcode == aco_opcode::v_cvt_f16_f32;
4926 Instruction* fma = create_instruction<VALU_instruction>(
4927 is_f2f16 ? aco_opcode::v_fma_mixlo_f16 : aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1);
4928 fma->definitions[0] = instr->definitions[0];
4929 fma->operands[0] = instr->operands[0];
4930 fma->valu().opsel_hi[0] = !is_f2f16;
4931 fma->valu().opsel_lo[0] = instr->valu().opsel[0];
4932 fma->valu().clamp = instr->valu().clamp;
4933 fma->valu().abs[0] = instr->valu().abs[0];
4934 fma->valu().neg[0] = instr->valu().neg[0];
4935 fma->operands[1] = Operand::c32(fui(1.0f));
4936 fma->operands[2] = Operand::zero();
4937 /* fma_mix is only dual issued if dst and acc type match */
4938 fma->valu().opsel_hi[2] = is_f2f16;
4939 fma->valu().neg[2] = true;
4941 ctx.info[instr->definitions[0].tempId()].label = 0;
4944 if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
4945 (instr->isVOP3P() && ctx.program->gfx_level < GFX10))
4946 return; /* some encodings can't ever take literals */
4948 /* we do not apply the literals yet as we don't know if it is profitable */
4949 Operand current_literal(s1);
4951 unsigned literal_id = 0;
4952 unsigned literal_uses = UINT32_MAX;
4953 Operand literal(s1);
4954 unsigned num_operands = 1;
4955 if (instr->isSALU() || (ctx.program->gfx_level >= GFX10 &&
4956 (can_use_VOP3(ctx, instr) || instr->isVOP3P()) && !instr->isDPP()))
4957 num_operands = instr->operands.size();
4958 /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
4959 else if (instr->isVALU() && instr->operands.size() >= 3)
4962 unsigned sgpr_ids[2] = {0, 0};
4963 bool is_literal_sgpr = false;
4966 /* choose a literal to apply */
4967 for (unsigned i = 0; i < num_operands; i++) {
4968 Operand op = instr->operands[i];
4969 unsigned bits = get_operand_size(instr, i);
4971 if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
4972 op.tempId() != sgpr_ids[0])
4973 sgpr_ids[!!sgpr_ids[0]] = op.tempId();
4975 if (op.isLiteral()) {
4976 current_literal = op;
4978 } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
4982 if (!alu_can_accept_constant(instr, i))
4985 if (ctx.uses[op.tempId()] < literal_uses) {
4986 is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
4988 literal = Operand::c32(ctx.info[op.tempId()].val);
4989 literal_uses = ctx.uses[op.tempId()];
4990 literal_id = op.tempId();
4993 mask |= (op.tempId() == literal_id) << i;
4996 /* don't go over the constant bus limit */
4997 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
4998 instr->opcode == aco_opcode::v_lshrrev_b64 ||
4999 instr->opcode == aco_opcode::v_ashrrev_i64;
5000 unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
5001 if (ctx.program->gfx_level >= GFX10 && !is_shift64)
5002 const_bus_limit = 2;
5004 unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
5005 if (num_sgprs == const_bus_limit && !is_literal_sgpr)
5008 if (literal_id && literal_uses < threshold &&
5009 (current_literal.isUndefined() ||
5010 (current_literal.size() == literal.size() &&
5011 current_literal.constantValue() == literal.constantValue()))) {
5012 /* mark the literal to be applied */
5014 unsigned i = u_bit_scan(&mask);
5015 if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
5016 ctx.uses[instr->operands[i].tempId()]--;
/* Translates a 32-bit SOPC comparison opcode into the SOPK comparison opcode with the same
 * comparison and signedness (s_cmp_<op>_{i32,u32} -> s_cmpk_<op>_{i32,u32}).
 * The helper macro's header and the list of <op> instantiations are not part of this excerpt.
 * Returns aco_opcode::num_opcodes for every opcode that has no case here. */
5022 sopk_opcode_for_sopc(aco_opcode opcode)
5025 case aco_opcode::s_cmp_##op##_i32: return aco_opcode::s_cmpk_##op##_i32; \
5026 case aco_opcode::s_cmp_##op##_u32: return aco_opcode::s_cmpk_##op##_u32;
/* num_opcodes is the "no SOPK form" sentinel; try_convert_sopc_to_sopk() compares against it. */
5034 default: return aco_opcode::num_opcodes;
/* Reports the signedness of a 32-bit SOPC comparison opcode: true for the s_cmp_<op>_i32
 * variants, false for the s_cmp_<op>_u32 variants.
 * The helper macro's header and the list of <op> instantiations are not part of this excerpt. */
5040 sopc_is_signed(aco_opcode opcode)
5043 case aco_opcode::s_cmp_##op##_i32: return true; \
5044 case aco_opcode::s_cmp_##op##_u32: return false;
/* Any other opcode is treated as a caller error rather than given a return value. */
5052 default: unreachable("Not a valid SOPC instruction.");
/* Maps a 32-bit SOPC comparison opcode s_cmp_<op1>_* to s_cmp_<op2>_* while keeping the
 * _i32/_u32 suffix. try_convert_sopc_to_sopk() applies it right after exchanging the two
 * operands of the comparison. The (op1, op2) pairs themselves are not part of this excerpt.
 * Returns aco_opcode::num_opcodes for opcodes without a case. */
5058 sopc_32_swapped(aco_opcode opcode)
5060 #define SOPC(op1, op2) \
5061 case aco_opcode::s_cmp_##op1##_i32: return aco_opcode::s_cmp_##op2##_i32; \
5062 case aco_opcode::s_cmp_##op1##_u32: return aco_opcode::s_cmp_##op2##_u32;
/* Sentinel result for opcodes that are not handled above. */
5070 default: return aco_opcode::num_opcodes;
/* Tries to turn a SOPC comparison that has a literal operand into the SOPK form, where the
 * constant is stored as a 16-bit immediate in the instruction itself.
 * instr is modified in place (opcode, format, operands, imm).
 * NOTE(review): the statements guarded by the early checks below are not visible in this
 * excerpt; presumably each one is an early return that leaves the instruction as SOPC. */
5076 try_convert_sopc_to_sopk(aco_ptr<Instruction>& instr)
/* Only opcodes that sopk_opcode_for_sopc() knows how to translate are candidates. */
5078 if (sopk_opcode_for_sopc(instr->opcode) == aco_opcode::num_opcodes)
/* Normalize: if the literal is the first operand, move it to the second slot and switch to the
 * opcode sopc_32_swapped() gives for exchanged operands. */
5081 if (instr->operands[0].isLiteral()) {
5082 std::swap(instr->operands[0], instr->operands[1]);
5083 instr->opcode = sopc_32_swapped(instr->opcode);
/* From here on the constant must sit in operand 1. */
5086 if (!instr->operands[1].isLiteral())
/* NOTE(review): operand 0 fixed to a physical register >= 128 is singled out here; presumably
 * such registers cannot be used as the SOPK register operand - confirm against the ISA. */
5089 if (instr->operands[0].isFixed() && instr->operands[0].physReg() >= 128)
5092 uint32_t value = instr->operands[1].constantValue();
/* i16_mask covers bits 15..31: if they are all zero or all one, the value is a sign-extended
 * 16-bit integer. */
5094 const uint32_t i16_mask = 0xffff8000u;
5096 bool value_is_i16 = (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
/* The value is a zero-extended 16-bit integer when the upper 16 bits are clear. */
5097 bool value_is_u16 = !(value & 0xffff0000u);
/* Neither 16-bit interpretation reproduces the 32-bit constant. */
5099 if (!value_is_i16 && !value_is_u16)
/* The opcode's signedness does not match the only representable interpretation. Equality and
 * inequality compares do not depend on signedness, so lg/eq are switched to the other
 * variant; the handling of the remaining opcodes is not visible in this excerpt. */
5102 if (!value_is_i16 && sopc_is_signed(instr->opcode)) {
5103 if (instr->opcode == aco_opcode::s_cmp_lg_i32)
5104 instr->opcode = aco_opcode::s_cmp_lg_u32;
5105 else if (instr->opcode == aco_opcode::s_cmp_eq_i32)
5106 instr->opcode = aco_opcode::s_cmp_eq_u32;
5109 } else if (!value_is_u16 && !sopc_is_signed(instr->opcode)) {
5110 if (instr->opcode == aco_opcode::s_cmp_lg_u32)
5111 instr->opcode = aco_opcode::s_cmp_lg_i32;
5112 else if (instr->opcode == aco_opcode::s_cmp_eq_u32)
5113 instr->opcode = aco_opcode::s_cmp_eq_i32;
/* The instruction object is reinterpreted as SOPK in place instead of being reallocated; the
 * assertion guards that the SOPK struct does not need more storage than the SOPC one. */
5118 static_assert(sizeof(SOPK_instruction) <= sizeof(SOPC_instruction),
5119 "Invalid direct instruction cast.");
5120 instr->format = Format::SOPK;
5121 SOPK_instruction* instr_sopk = &instr->sopk();
/* Keep the low 16 bits of the constant as the immediate, translate the opcode and drop the
 * now redundant literal operand (the last operand). */
5123 instr_sopk->imm = instr_sopk->operands[1].constantValue() & 0xffff;
5124 instr_sopk->opcode = sopk_opcode_for_sopc(instr_sopk->opcode);
5125 instr_sopk->operands.pop_back();
/* Rewrites the literal operands of a v_pk_fma_f16 so that they use the swizzle
 * opsel_lo = false / opsel_hi = true, compensating by rearranging the two 16-bit halves of the
 * literal value itself. Nothing is changed when the literal operands disagree on their
 * swizzle, when there is no literal, or when the swizzle already has that form.
 * ctx is not referenced by any line visible in this excerpt. */
5129 unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5131 /* This opt is only beneficial for v_pk_fma_f16 because we can use v_pk_fmac_f16 if the
5132 * instruction doesn't use swizzles. */
5133 if (instr->opcode != aco_opcode::v_pk_fma_f16)
5136 VALU_instruction& vop3p = instr->valu();
/* ~0u means "no literal operand seen yet". Otherwise bit 0 holds opsel_lo and bit 1 holds
 * opsel_hi of the literal operands, which all have to agree. */
5138 unsigned literal_swizzle = ~0u;
5139 for (unsigned i = 0; i < instr->operands.size(); i++) {
5140 if (!instr->operands[i].isLiteral())
5142 unsigned new_swizzle = vop3p.opsel_lo[i] | (vop3p.opsel_hi[i] << 1);
5143 if (literal_swizzle != ~0u && new_swizzle != literal_swizzle)
5144 return; /* Literal swizzles conflict. */
5145 literal_swizzle = new_swizzle;
/* 0b10 is opsel_lo = 0, opsel_hi = 1, i.e. exactly the form produced below. */
5148 if (literal_swizzle == 0b10 || literal_swizzle == ~0u)
5149 return; /* already unswizzled */
5151 for (unsigned i = 0; i < instr->operands.size(); i++) {
5152 if (!instr->operands[i].isLiteral())
5154 uint32_t literal = instr->operands[i].constantValue();
/* New low half: the half picked by opsel_lo (shift by 0 or 16).
 * New high half: the half picked by opsel_hi; (literal_swizzle & 0x2) is 0 or 2, so the shift
 * is again 0 or 16, and the following << 16 moves that half into bits 16..31. */
5155 literal = (literal >> (16 * (literal_swizzle & 0x1)) & 0xffff) |
5156 (literal >> (8 * (literal_swizzle & 0x2)) << 16);
5157 instr->operands[i] = Operand::literal32(literal);
5158 vop3p.opsel_lo[i] = false;
5159 vop3p.opsel_hi[i] = true;
/* Per-instruction step of the last optimizer pass ("4. Add literals to instructions" in
 * optimize()): replaces operands whose temporaries were chosen to become literals with actual
 * literal operands and appends the instruction to ctx.instructions.
 * NOTE(review): several short statements of this function (the dead-instruction check and the
 * exits after the two MAD paths) are not visible in this excerpt. */
5164 apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5166 /* Cleanup Dead Instructions */
5170 /* apply literals on MAD */
5171 if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
/* For a MAD-labelled definition, the label's val field is used as an index into ctx.mad_infos. */
5172 mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
/* Bit 2 of literal_mask set: the literal is operand 2 (the addend), i.e. the "ak" form. */
5173 const bool madak = (info->literal_mask & 0b100);
/* A masked operand whose temporary has no remaining uses is treated as a literal that has to
 * be materialized in this instruction. */
5174 bool has_dead_literal = false;
5175 u_foreach_bit (i, info->literal_mask | info->fp16_mask)
5176 has_dead_literal |= ctx.uses[instr->operands[i].tempId()] == 0;
/* fp16 path: switch to v_fma_mix_f32 and pack the constants, converted to half precision,
 * into one shared 32-bit literal. */
5178 if (has_dead_literal && info->fp16_mask) {
5179 instr->format = Format::VOP3P;
5180 instr->opcode = aco_opcode::v_fma_mix_f32;
5182 uint32_t literal = 0;
/* `second` selects the upper 16 bits of the literal for the operand being packed.
 * NOTE(review): the statement that updates `second` inside the loop is not visible here. */
5183 bool second = false;
5184 u_foreach_bit (i, info->fp16_mask) {
5185 float value = uif(ctx.info[instr->operands[i].tempId()].val);
5186 literal |= _mesa_float_to_half(value) << (second * 16);
/* opsel_lo records which half of the literal this operand reads.
 * NOTE(review): opsel_hi = true presumably marks the source as 16-bit for v_fma_mix - confirm
 * against the ISA documentation. */
5187 instr->valu().opsel_lo[i] = second;
5188 instr->valu().opsel_hi[i] = true;
/* Every fp16-masked operand refers to the same packed literal. */
5192 for (unsigned i = 0; i < 3; i++) {
5193 if (info->fp16_mask & (1 << i))
5194 instr->operands[i] = Operand::literal32(literal);
5197 ctx.instructions.emplace_back(std::move(instr));
/* VOP2 path: pick the *ak opcode when the literal is the addend and the *mk opcode otherwise,
 * in the mad/fma and f32/f16 flavour that matches the current opcode. */
5201 if (has_dead_literal || madak) {
5202 aco_opcode new_op = madak ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
5203 if (instr->opcode == aco_opcode::v_fma_f32)
5204 new_op = madak ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
5205 else if (instr->opcode == aco_opcode::v_mad_f16 ||
5206 instr->opcode == aco_opcode::v_mad_legacy_f16)
5207 new_op = madak ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
5208 else if (instr->opcode == aco_opcode::v_fma_f16)
5209 new_op = madak ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
/* The literal value is taken from the operand at the lowest set bit of literal_mask. */
5211 uint32_t literal = ctx.info[instr->operands[ffs(info->literal_mask) - 1].tempId()].val;
5212 instr->format = Format::VOP2;
5213 instr->opcode = new_op;
5214 for (unsigned i = 0; i < 3; i++) {
5215 if (info->literal_mask & (1 << i))
5216 instr->operands[i] = Operand::literal32(literal);
5218 if (madak) { /* add literal -> madak */
/* The literal already is operand 2; make sure operand 1 is the VGPR one. */
5219 if (!instr->operands[1].isOfType(RegType::vgpr))
5220 instr->valu().swapOperands(0, 1);
5221 } else { /* mul literal -> madmk */
/* Bring the literal to slot 1 if it is in slot 0, then exchange slots 1 and 2 so the literal
 * ends up as operand 2. */
5222 if (!(info->literal_mask & 0b10))
5223 instr->valu().swapOperands(0, 1);
5224 instr->valu().swapOperands(1, 2);
5226 ctx.instructions.emplace_back(std::move(instr));
5231 /* apply literals on other SALU/VALU */
5232 if (instr->isSALU() || instr->isVALU()) {
5233 for (unsigned i = 0; i < instr->operands.size(); i++) {
5234 Operand op = instr->operands[i];
5235 unsigned bits = get_operand_size(instr, i);
/* Only temporaries labelled as a literal of the operand's bit size and left with zero uses are
 * replaced. */
5236 if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
5237 Operand literal = Operand::literal32(ctx.info[op.tempId()].val);
/* The DPP bit is removed from the format, and a VALU instruction that receives the literal in
 * an operand other than the first is promoted to VOP3 unless it is exactly VOP3P. */
5238 instr->format = withoutDPP(instr->format);
5239 if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
5240 instr->format = asVOP3(instr->format);
5241 instr->operands[i] = literal;
/* A comparison that now carries a literal may fit the SOPK encoding. */
5246 if (instr->isSOPC())
5247 try_convert_sopc_to_sopk(instr);
5249 /* allow more s_addk_i32 optimizations if carry isn't used */
5250 if (instr->opcode == aco_opcode::s_add_u32 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
5251 (instr->operands[0].isLiteral() || instr->operands[1].isLiteral()))
5252 instr->opcode = aco_opcode::s_add_i32;
5254 if (instr->isVOP3P())
5255 unswizzle_vop3p_literals(ctx, instr);
/* Ownership of the instruction moves into the rebuilt instruction list. */
5257 ctx.instructions.emplace_back(std::move(instr));
/* Entry point of the optimizer: runs the four passes over all blocks of the program, in the
 * order shown by the numbered comments below.
 * NOTE(review): the declaration of ctx and the end of this function are not visible in this
 * excerpt. */
5261 optimize(Program* program)
5264 ctx.program = program;
/* One ssa_info slot per id reported by peekAllocationId(). ctx.info is a raw pointer into this
 * function-local vector. */
5265 std::vector<ssa_info> info(program->peekAllocationId());
5266 ctx.info = info.data();
5268 /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
5269 for (Block& block : program->blocks) {
/* Each pass refreshes ctx.fp_mode from the block it is about to process. */
5270 ctx.fp_mode = block.fp_mode;
5271 for (aco_ptr<Instruction>& instr : block.instructions)
5272 label_instruction(ctx, instr);
/* Use counts are computed once, after labelling and before the combine pass. */
5275 ctx.uses = dead_code_analysis(program);
5277 /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
5278 for (Block& block : program->blocks) {
5279 ctx.fp_mode = block.fp_mode;
5280 for (aco_ptr<Instruction>& instr : block.instructions)
5281 combine_instruction(ctx, instr);
5284 /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
/* Both the blocks and the instructions inside each block are visited in reverse order. */
5285 for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
5287 Block* block = &(*block_rit);
5288 ctx.fp_mode = block->fp_mode;
5289 for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
5291 select_instruction(ctx, *instr_rit);
5294 /* 4. Add literals to instructions */
5295 for (Block& block : program->blocks) {
/* apply_literals() appends to ctx.instructions; the collected list then replaces the block's
 * instruction list. */
5296 ctx.instructions.reserve(block.instructions.size());
5297 ctx.fp_mode = block.fp_mode;
5298 for (aco_ptr<Instruction>& instr : block.instructions)
5299 apply_literals(ctx, instr);
5300 block.instructions = std::move(ctx.instructions);