src/amd/compiler/aco_optimizer.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #include "aco_builder.h"
  26 #include "aco_ir.h"
  27
  28 #include "util/half_float.h"
  29 #include "util/memstream.h"
  30
  31 #include <algorithm>
  32 #include <array>
  33 #include <vector>
  34
  35 namespace aco {
  36
  37 #ifndef NDEBUG
  38 void
  39 perfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
  40 {
  41    if (cond) {
  42       char* out;
  43       size_t outsize;
  44       struct u_memstream mem;
  45       u_memstream_open(&mem, &out, &outsize);
  46       FILE* const memf = u_memstream_get(&mem);
  47
  48       fprintf(memf, "%s: ", msg);
  49       aco_print_instr(program->gfx_level, instr, memf);
  50       u_memstream_close(&mem);
  51
  52       aco_perfwarn(program, out);
  53       free(out);
  54
  55       if (debug_flags & DEBUG_PERFWARN)
  56          exit(1);
  57    }
  58 }
  59 #endif
  60
  61 /**
  62  * The optimizer works in 4 phases:
  63  * (1) The first pass collects information for each ssa-def,
  64  *     propagates reg->reg operands of the same type, inline constants
  65  *     and neg/abs input modifiers.
  66  * (2) The second pass combines instructions like mad, omod, clamp and
  67  *     propagates sgpr's on VALU instructions.
  68  *     This pass depends on information collected in the first pass.
  69  * (3) The third pass goes backwards, and selects instructions,
  70  *     i.e. decides if a mad instruction is profitable and eliminates dead code.
  71  * (4) The fourth pass cleans up the sequence: literals get applied and dead
  72  *     instructions are removed from the sequence.
  73  */
  74
  75 struct mad_info {
  76    aco_ptr<Instruction> add_instr;
  77    uint32_t mul_temp_id;
  78    uint16_t literal_mask;
  79    uint16_t fp16_mask;
  80
  81    mad_info(aco_ptr<Instruction> instr, uint32_t id)
  82        : add_instr(std::move(instr)), mul_temp_id(id), literal_mask(0), fp16_mask(0)
  83    {}
  84 };
  85
  86 enum Label {
  87    label_vec = 1 << 0,
  88    label_constant_32bit = 1 << 1,
  89    /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are used for both 16 and
  90     * 32-bit operations but this shouldn't cause any issues because we don't
  91     * look through any conversions */
  92    label_abs = 1 << 2,
  93    label_neg = 1 << 3,
  94    label_mul = 1 << 4,
  95    label_temp = 1 << 5,
  96    label_literal = 1 << 6,
  97    label_mad = 1 << 7,
  98    label_omod2 = 1 << 8,
  99    label_omod4 = 1 << 9,
 100    label_omod5 = 1 << 10,
 101    label_clamp = 1 << 12,
 102    label_undefined = 1 << 14,
 103    label_vcc = 1 << 15,
 104    label_b2f = 1 << 16,
 105    label_add_sub = 1 << 17,
 106    label_bitwise = 1 << 18,
 107    label_minmax = 1 << 19,
 108    label_vopc = 1 << 20,
 109    label_uniform_bool = 1 << 21,
 110    label_constant_64bit = 1 << 22,
 111    label_uniform_bitwise = 1 << 23,
 112    label_scc_invert = 1 << 24,
 113    label_scc_needed = 1 << 26,
 114    label_b2i = 1 << 27,
 115    label_fcanonicalize = 1 << 28,
 116    label_constant_16bit = 1 << 29,
 117    label_usedef = 1 << 30,   /* generic label */
 118    label_vop3p = 1ull << 31, /* 1ull to prevent sign extension */
 119    label_canonicalized = 1ull << 32,
 120    label_extract = 1ull << 33,
 121    label_insert = 1ull << 34,
 122    label_dpp16 = 1ull << 35,
 123    label_dpp8 = 1ull << 36,
 124    label_f2f32 = 1ull << 37,
 125    label_f2f16 = 1ull << 38,
 126    label_split = 1ull << 39,
 127    label_subgroup_invocation = 1ull << 40,
 128 };
 129
 130 static constexpr uint64_t instr_usedef_labels =
 131    label_vec | label_mul | label_add_sub | label_vop3p | label_bitwise | label_uniform_bitwise |
 132    label_minmax | label_vopc | label_usedef | label_extract | label_dpp16 | label_dpp8 |
 133    label_f2f32 | label_subgroup_invocation;
 134 static constexpr uint64_t instr_mod_labels =
 135    label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert | label_f2f16;
 136
 137 static constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels | label_split;
 138 static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
 139                                         label_uniform_bool | label_scc_invert | label_b2i |
 140                                         label_fcanonicalize;
 141 static constexpr uint32_t val_labels =
 142    label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad;
 143
 144 static_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
 145 static_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
 146 static_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
 147
 148 struct ssa_info {
 149    uint64_t label;
 150    union {
 151       uint32_t val;
 152       Temp temp;
 153       Instruction* instr;
 154    };
 155
 156    ssa_info() : label(0) {}
 157
 158    void add_label(Label new_label)
 159    {
 160       /* Since all the instr_usedef_labels use instr for the same thing
 161        * (indicating the defining instruction), there is usually no need to
 162        * clear any other instr labels. */
 163       if (new_label & instr_usedef_labels)
 164          label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */
 165
 166       if (new_label & instr_mod_labels) {
 167          label &= ~instr_labels;
 168          label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
 169       }
 170
 171       if (new_label & temp_labels) {
 172          label &= ~temp_labels;
 173          label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
 174       }
 175
 176       uint32_t const_labels =
 177          label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
 178       if (new_label & const_labels) {
 179          label &= ~val_labels | const_labels;
 180          label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
 181       } else if (new_label & val_labels) {
 182          label &= ~val_labels;
 183          label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
 184       }
 185
 186       label |= new_label;
 187    }
 188
 189    void set_vec(Instruction* vec)
 190    {
 191       add_label(label_vec);
 192       instr = vec;
 193    }
 194
 195    bool is_vec() { return label & label_vec; }
 196
 197    void set_constant(amd_gfx_level gfx_level, uint64_t constant)
 198    {
 199       Operand op16 = Operand::c16(constant);
 200       Operand op32 = Operand::get_const(gfx_level, constant, 4);
 201       add_label(label_literal);
 202       val = constant;
 203
 204       /* check that no upper bits are lost in case of packed 16bit constants */
 205       if (gfx_level >= GFX8 && !op16.isLiteral() &&
 206           op16.constantValue16(true) == ((constant >> 16) & 0xffff))
 207          add_label(label_constant_16bit);
 208
 209       if (!op32.isLiteral())
 210          add_label(label_constant_32bit);
 211
 212       if (Operand::is_constant_representable(constant, 8))
 213          add_label(label_constant_64bit);
 214
 215       if (label & label_constant_64bit) {
 216          val = Operand::c64(constant).constantValue();
 217          if (val != constant)
 218             label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
 219       }
 220    }
 221
 222    bool is_constant(unsigned bits)
 223    {
 224       switch (bits) {
 225       case 8: return label & label_literal;
 226       case 16: return label & label_constant_16bit;
 227       case 32: return label & label_constant_32bit;
 228       case 64: return label & label_constant_64bit;
 229       }
 230       return false;
 231    }
 232
 233    bool is_literal(unsigned bits)
 234    {
 235       bool is_lit = label & label_literal;
 236       switch (bits) {
 237       case 8: return false;
 238       case 16: return is_lit && ~(label & label_constant_16bit);
 239       case 32: return is_lit && ~(label & label_constant_32bit);
 240       case 64: return false;
 241       }
 242       return false;
 243    }
 244
 245    bool is_constant_or_literal(unsigned bits)
 246    {
 247       if (bits == 64)
 248          return label & label_constant_64bit;
 249       else
 250          return label & label_literal;
 251    }
 252
 253    void set_abs(Temp abs_temp)
 254    {
 255       add_label(label_abs);
 256       temp = abs_temp;
 257    }
 258
 259    bool is_abs() { return label & label_abs; }
 260
 261    void set_neg(Temp neg_temp)
 262    {
 263       add_label(label_neg);
 264       temp = neg_temp;
 265    }
 266
 267    bool is_neg() { return label & label_neg; }
 268
 269    void set_neg_abs(Temp neg_abs_temp)
 270    {
 271       add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
 272       temp = neg_abs_temp;
 273    }
 274
 275    void set_mul(Instruction* mul)
 276    {
 277       add_label(label_mul);
 278       instr = mul;
 279    }
 280
 281    bool is_mul() { return label & label_mul; }
 282
 283    void set_temp(Temp tmp)
 284    {
 285       add_label(label_temp);
 286       temp = tmp;
 287    }
 288
 289    bool is_temp() { return label & label_temp; }
 290
 291    void set_mad(uint32_t mad_info_idx)
 292    {
 293       add_label(label_mad);
 294       val = mad_info_idx;
 295    }
 296
 297    bool is_mad() { return label & label_mad; }
 298
 299    void set_omod2(Instruction* mul)
 300    {
 301       if (label & temp_labels)
 302          return;
 303       add_label(label_omod2);
 304       instr = mul;
 305    }
 306
 307    bool is_omod2() { return label & label_omod2; }
 308
 309    void set_omod4(Instruction* mul)
 310    {
 311       if (label & temp_labels)
 312          return;
 313       add_label(label_omod4);
 314       instr = mul;
 315    }
 316
 317    bool is_omod4() { return label & label_omod4; }
 318
 319    void set_omod5(Instruction* mul)
 320    {
 321       if (label & temp_labels)
 322          return;
 323       add_label(label_omod5);
 324       instr = mul;
 325    }
 326
 327    bool is_omod5() { return label & label_omod5; }
 328
 329    void set_clamp(Instruction* med3)
 330    {
 331       if (label & temp_labels)
 332          return;
 333       add_label(label_clamp);
 334       instr = med3;
 335    }
 336
 337    bool is_clamp() { return label & label_clamp; }
 338
 339    void set_f2f16(Instruction* conv)
 340    {
 341       if (label & temp_labels)
 342          return;
 343       add_label(label_f2f16);
 344       instr = conv;
 345    }
 346
 347    bool is_f2f16() { return label & label_f2f16; }
 348
 349    void set_undefined() { add_label(label_undefined); }
 350
 351    bool is_undefined() { return label & label_undefined; }
 352
 353    void set_vcc(Temp vcc_val)
 354    {
 355       add_label(label_vcc);
 356       temp = vcc_val;
 357    }
 358
 359    bool is_vcc() { return label & label_vcc; }
 360
 361    void set_b2f(Temp b2f_val)
 362    {
 363       add_label(label_b2f);
 364       temp = b2f_val;
 365    }
 366
 367    bool is_b2f() { return label & label_b2f; }
 368
 369    void set_add_sub(Instruction* add_sub_instr)
 370    {
 371       add_label(label_add_sub);
 372       instr = add_sub_instr;
 373    }
 374
 375    bool is_add_sub() { return label & label_add_sub; }
 376
 377    void set_bitwise(Instruction* bitwise_instr)
 378    {
 379       add_label(label_bitwise);
 380       instr = bitwise_instr;
 381    }
 382
 383    bool is_bitwise() { return label & label_bitwise; }
 384
 385    void set_uniform_bitwise() { add_label(label_uniform_bitwise); }
 386
 387    bool is_uniform_bitwise() { return label & label_uniform_bitwise; }
 388
 389    void set_minmax(Instruction* minmax_instr)
 390    {
 391       add_label(label_minmax);
 392       instr = minmax_instr;
 393    }
 394
 395    bool is_minmax() { return label & label_minmax; }
 396
 397    void set_vopc(Instruction* vopc_instr)
 398    {
 399       add_label(label_vopc);
 400       instr = vopc_instr;
 401    }
 402
 403    bool is_vopc() { return label & label_vopc; }
 404
 405    void set_scc_needed() { add_label(label_scc_needed); }
 406
 407    bool is_scc_needed() { return label & label_scc_needed; }
 408
 409    void set_scc_invert(Temp scc_inv)
 410    {
 411       add_label(label_scc_invert);
 412       temp = scc_inv;
 413    }
 414
 415    bool is_scc_invert() { return label & label_scc_invert; }
 416
 417    void set_uniform_bool(Temp uniform_bool)
 418    {
 419       add_label(label_uniform_bool);
 420       temp = uniform_bool;
 421    }
 422
 423    bool is_uniform_bool() { return label & label_uniform_bool; }
 424
 425    void set_b2i(Temp b2i_val)
 426    {
 427       add_label(label_b2i);
 428       temp = b2i_val;
 429    }
 430
 431    bool is_b2i() { return label & label_b2i; }
 432
 433    void set_usedef(Instruction* label_instr)
 434    {
 435       add_label(label_usedef);
 436       instr = label_instr;
 437    }
 438
 439    bool is_usedef() { return label & label_usedef; }
 440
 441    void set_vop3p(Instruction* vop3p_instr)
 442    {
 443       add_label(label_vop3p);
 444       instr = vop3p_instr;
 445    }
 446
 447    bool is_vop3p() { return label & label_vop3p; }
 448
 449    void set_fcanonicalize(Temp tmp)
 450    {
 451       add_label(label_fcanonicalize);
 452       temp = tmp;
 453    }
 454
 455    bool is_fcanonicalize() { return label & label_fcanonicalize; }
 456
 457    void set_canonicalized() { add_label(label_canonicalized); }
 458
 459    bool is_canonicalized() { return label & label_canonicalized; }
 460
 461    void set_f2f32(Instruction* cvt)
 462    {
 463       add_label(label_f2f32);
 464       instr = cvt;
 465    }
 466
 467    bool is_f2f32() { return label & label_f2f32; }
 468
 469    void set_extract(Instruction* extract)
 470    {
 471       add_label(label_extract);
 472       instr = extract;
 473    }
 474
 475    bool is_extract() { return label & label_extract; }
 476
 477    void set_insert(Instruction* insert)
 478    {
 479       if (label & temp_labels)
 480          return;
 481       add_label(label_insert);
 482       instr = insert;
 483    }
 484
 485    bool is_insert() { return label & label_insert; }
 486
 487    void set_dpp16(Instruction* mov)
 488    {
 489       add_label(label_dpp16);
 490       instr = mov;
 491    }
 492
 493    void set_dpp8(Instruction* mov)
 494    {
 495       add_label(label_dpp8);
 496       instr = mov;
 497    }
 498
 499    bool is_dpp() { return label & (label_dpp16 | label_dpp8); }
 500    bool is_dpp16() { return label & label_dpp16; }
 501    bool is_dpp8() { return label & label_dpp8; }
 502
 503    void set_split(Instruction* split)
 504    {
 505       add_label(label_split);
 506       instr = split;
 507    }
 508
 509    bool is_split() { return label & label_split; }
 510
 511    void set_subgroup_invocation(Instruction* label_instr)
 512    {
 513       add_label(label_subgroup_invocation);
 514       instr = label_instr;
 515    }
 516
 517    bool is_subgroup_invocation() { return label & label_subgroup_invocation; }
 518 };
 519
 520 struct opt_ctx {
 521    Program* program;
 522    float_mode fp_mode;
 523    std::vector<aco_ptr<Instruction>> instructions;
 524    ssa_info* info;
 525    std::pair<uint32_t, Temp> last_literal;
 526    std::vector<mad_info> mad_infos;
 527    std::vector<uint16_t> uses;
 528 };
 529
 530 bool
 531 can_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
 532 {
 533    if (instr->isVOP3())
 534       return true;
 535
 536    if (instr->isVOP3P())
 537       return false;
 538
 539    if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->gfx_level < GFX10)
 540       return false;
 541
 542    if (instr->isSDWA())
 543       return false;
 544
 545    if (instr->isDPP() && ctx.program->gfx_level < GFX11)
 546       return false;
 547
 548    return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
 549           instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
 550           instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
 551           instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
 552           instr->opcode != aco_opcode::v_readlane_b32 &&
 553           instr->opcode != aco_opcode::v_writelane_b32 &&
 554           instr->opcode != aco_opcode::v_readfirstlane_b32;
 555 }
 556
 557 bool
 558 pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
 559 {
 560    if (instr->definitions.empty())
 561       return false;
 562
 563    const bool vgpr =
 564       instr->opcode == aco_opcode::p_as_uniform ||
 565       std::all_of(instr->definitions.begin(), instr->definitions.end(),
 566                   [](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
 567
 568    /* don't propagate VGPRs into SGPR instructions */
 569    if (temp.type() == RegType::vgpr && !vgpr)
 570       return false;
 571
 572    bool can_accept_sgpr =
 573       ctx.program->gfx_level >= GFX9 ||
 574       std::none_of(instr->definitions.begin(), instr->definitions.end(),
 575                    [](const Definition& def) { return def.regClass().is_subdword(); });
 576
 577    switch (instr->opcode) {
 578    case aco_opcode::p_phi:
 579    case aco_opcode::p_linear_phi:
 580    case aco_opcode::p_parallelcopy:
 581    case aco_opcode::p_create_vector:
 582       if (temp.bytes() != instr->operands[index].bytes())
 583          return false;
 584       break;
 585    case aco_opcode::p_extract_vector:
 586    case aco_opcode::p_extract:
 587       if (temp.type() == RegType::sgpr && !can_accept_sgpr)
 588          return false;
 589       break;
 590    case aco_opcode::p_split_vector: {
 591       if (temp.type() == RegType::sgpr && !can_accept_sgpr)
 592          return false;
 593       /* don't increase the vector size */
 594       if (temp.bytes() > instr->operands[index].bytes())
 595          return false;
 596       /* We can decrease the vector size as smaller temporaries are only
 597        * propagated by p_as_uniform instructions.
 598        * If this propagation leads to invalid IR or hits the assertion below,
 599        * it means that some undefined bytes within a dword are begin accessed
 600        * and a bug in instruction_selection is likely. */
 601       int decrease = instr->operands[index].bytes() - temp.bytes();
 602       while (decrease > 0) {
 603          decrease -= instr->definitions.back().bytes();
 604          instr->definitions.pop_back();
 605       }
 606       assert(decrease == 0);
 607       break;
 608    }
 609    case aco_opcode::p_as_uniform:
 610       if (temp.regClass() == instr->definitions[0].regClass())
 611          instr->opcode = aco_opcode::p_parallelcopy;
 612       break;
 613    default: return false;
 614    }
 615
 616    instr->operands[index].setTemp(temp);
 617    return true;
 618 }
 619
 620 /* This expects the DPP modifier to be removed. */
 621 bool
 622 can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 623 {
 624    assert(instr->isVALU());
 625    if (instr->isSDWA() && ctx.program->gfx_level < GFX9)
 626       return false;
 627    return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
 628           instr->opcode != aco_opcode::v_readlane_b32 &&
 629           instr->opcode != aco_opcode::v_readlane_b32_e64 &&
 630           instr->opcode != aco_opcode::v_writelane_b32 &&
 631           instr->opcode != aco_opcode::v_writelane_b32_e64 &&
 632           instr->opcode != aco_opcode::v_permlane16_b32 &&
 633           instr->opcode != aco_opcode::v_permlanex16_b32 &&
 634           instr->opcode != aco_opcode::v_interp_p1_f32 &&
 635           instr->opcode != aco_opcode::v_interp_p2_f32 &&
 636           instr->opcode != aco_opcode::v_interp_mov_f32 &&
 637           instr->opcode != aco_opcode::v_interp_p1ll_f16 &&
 638           instr->opcode != aco_opcode::v_interp_p1lv_f16 &&
 639           instr->opcode != aco_opcode::v_interp_p2_legacy_f16 &&
 640           instr->opcode != aco_opcode::v_interp_p2_f16 &&
 641           instr->opcode != aco_opcode::v_interp_p10_f32_inreg &&
 642           instr->opcode != aco_opcode::v_interp_p2_f32_inreg &&
 643           instr->opcode != aco_opcode::v_interp_p10_f16_f32_inreg &&
 644           instr->opcode != aco_opcode::v_interp_p2_f16_f32_inreg &&
 645           instr->opcode != aco_opcode::v_interp_p10_rtz_f16_f32_inreg &&
 646           instr->opcode != aco_opcode::v_interp_p2_rtz_f16_f32_inreg;
 647 }
 648
 649 bool
 650 is_operand_vgpr(Operand op)
 651 {
 652    return op.isTemp() && op.getTemp().type() == RegType::vgpr;
 653 }
 654
 655 /* only covers special cases */
 656 bool
 657 alu_can_accept_constant(const aco_ptr<Instruction>& instr, unsigned operand)
 658 {
 659    /* Fixed operands can't accept constants because we need them
 660     * to be in their fixed register.
 661     */
 662    assert(instr->operands.size() > operand);
 663    if (instr->operands[operand].isFixed())
 664       return false;
 665
 666    /* SOPP instructions can't use constants. */
 667    if (instr->isSOPP())
 668       return false;
 669
 670    switch (instr->opcode) {
 671    case aco_opcode::v_mac_f32:
 672    case aco_opcode::v_writelane_b32:
 673    case aco_opcode::v_writelane_b32_e64:
 674    case aco_opcode::v_cndmask_b32: return operand != 2;
 675    case aco_opcode::s_addk_i32:
 676    case aco_opcode::s_mulk_i32:
 677    case aco_opcode::p_extract_vector:
 678    case aco_opcode::p_split_vector:
 679    case aco_opcode::v_readlane_b32:
 680    case aco_opcode::v_readlane_b32_e64:
 681    case aco_opcode::v_readfirstlane_b32:
 682    case aco_opcode::p_extract:
 683    case aco_opcode::p_insert: return operand != 0;
 684    case aco_opcode::p_bpermute_readlane:
 685    case aco_opcode::p_bpermute_shared_vgpr:
 686    case aco_opcode::p_bpermute_permlane:
 687    case aco_opcode::p_interp_gfx11:
 688    case aco_opcode::p_dual_src_export_gfx11:
 689    case aco_opcode::v_interp_p1_f32:
 690    case aco_opcode::v_interp_p2_f32:
 691    case aco_opcode::v_interp_mov_f32:
 692    case aco_opcode::v_interp_p1ll_f16:
 693    case aco_opcode::v_interp_p1lv_f16:
 694    case aco_opcode::v_interp_p2_legacy_f16:
 695    case aco_opcode::v_interp_p10_f32_inreg:
 696    case aco_opcode::v_interp_p2_f32_inreg:
 697    case aco_opcode::v_interp_p10_f16_f32_inreg:
 698    case aco_opcode::v_interp_p2_f16_f32_inreg:
 699    case aco_opcode::v_interp_p10_rtz_f16_f32_inreg:
 700    case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return false;
 701    default: return true;
 702    }
 703 }
 704
 705 bool
 706 valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
 707 {
 708    if (instr->opcode == aco_opcode::v_readlane_b32 ||
 709        instr->opcode == aco_opcode::v_readlane_b32_e64 ||
 710        instr->opcode == aco_opcode::v_writelane_b32 ||
 711        instr->opcode == aco_opcode::v_writelane_b32_e64)
 712       return operand != 1;
 713    if (instr->opcode == aco_opcode::v_permlane16_b32 ||
 714        instr->opcode == aco_opcode::v_permlanex16_b32)
 715       return operand == 0;
 716    return true;
 717 }
 718
 719 /* check constant bus and literal limitations */
 720 bool
 721 check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
 722 {
 723    int limit = ctx.program->gfx_level >= GFX10 ? 2 : 1;
 724    Operand literal32(s1);
 725    Operand literal64(s2);
 726    unsigned num_sgprs = 0;
 727    unsigned sgpr[] = {0, 0};
 728
 729    for (unsigned i = 0; i < num_operands; i++) {
 730       Operand op = operands[i];
 731
 732       if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
 733          /* two reads of the same SGPR count as 1 to the limit */
 734          if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
 735             if (num_sgprs < 2)
 736                sgpr[num_sgprs++] = op.tempId();
 737             limit--;
 738             if (limit < 0)
 739                return false;
 740          }
 741       } else if (op.isLiteral()) {
 742          if (ctx.program->gfx_level < GFX10)
 743             return false;
 744
 745          if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
 746             return false;
 747          if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
 748             return false;
 749
 750          /* Any number of 32-bit literals counts as only 1 to the limit. Same
 751           * (but separately) for 64-bit literals. */
 752          if (op.size() == 1 && literal32.isUndefined()) {
 753             limit--;
 754             literal32 = op;
 755          } else if (op.size() == 2 && literal64.isUndefined()) {
 756             limit--;
 757             literal64 = op;
 758          }
 759
 760          if (limit < 0)
 761             return false;
 762       }
 763    }
 764
 765    return true;
 766 }
 767
 768 bool
 769 parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
 770                   bool prevent_overflow)
 771 {
 772    Operand op = instr->operands[op_index];
 773
 774    if (!op.isTemp())
 775       return false;
 776    Temp tmp = op.getTemp();
 777    if (!ctx.info[tmp.id()].is_add_sub())
 778       return false;
 779
 780    Instruction* add_instr = ctx.info[tmp.id()].instr;
 781
 782    unsigned mask = 0x3;
 783    bool is_sub = false;
 784    switch (add_instr->opcode) {
 785    case aco_opcode::v_add_u32:
 786    case aco_opcode::v_add_co_u32:
 787    case aco_opcode::v_add_co_u32_e64:
 788    case aco_opcode::s_add_i32:
 789    case aco_opcode::s_add_u32: break;
 790    case aco_opcode::v_sub_u32:
 791    case aco_opcode::v_sub_i32:
 792    case aco_opcode::v_sub_co_u32:
 793    case aco_opcode::v_sub_co_u32_e64:
 794    case aco_opcode::s_sub_u32:
 795    case aco_opcode::s_sub_i32:
 796       mask = 0x2;
 797       is_sub = true;
 798       break;
 799    case aco_opcode::v_subrev_u32:
 800    case aco_opcode::v_subrev_co_u32:
 801    case aco_opcode::v_subrev_co_u32_e64:
 802       mask = 0x1;
 803       is_sub = true;
 804       break;
 805    default: return false;
 806    }
 807    if (prevent_overflow && !add_instr->definitions[0].isNUW())
 808       return false;
 809
 810    if (add_instr->usesModifiers())
 811       return false;
 812
 813    u_foreach_bit (i, mask) {
 814       if (add_instr->operands[i].isConstant()) {
 815          *offset = add_instr->operands[i].constantValue() * (uint32_t)(is_sub ? -1 : 1);
 816       } else if (add_instr->operands[i].isTemp() &&
 817                  ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
 818          *offset = ctx.info[add_instr->operands[i].tempId()].val * (uint32_t)(is_sub ? -1 : 1);
 819       } else {
 820          continue;
 821       }
 822       if (!add_instr->operands[!i].isTemp())
 823          continue;
 824
 825       uint32_t offset2 = 0;
 826       if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
 827          *offset += offset2;
 828       } else {
 829          *base = add_instr->operands[!i].getTemp();
 830       }
 831       return true;
 832    }
 833
 834    return false;
 835 }
 836
 837 void
 838 skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
 839 {
 840    bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
 841    if (soe && !smem->operands[1].isConstant())
 842       return;
 843    /* We don't need to check the constant offset because the address seems to be calculated with
 844     * (offset&-4 + const_offset&-4), not (offset+const_offset)&-4.
 845     */
 846
 847    Operand& op = smem->operands[soe ? smem->operands.size() - 1 : 1];
 848    if (!op.isTemp() || !ctx.info[op.tempId()].is_bitwise())
 849       return;
 850
 851    Instruction* bitwise_instr = ctx.info[op.tempId()].instr;
 852    if (bitwise_instr->opcode != aco_opcode::s_and_b32)
 853       return;
 854
 855    if (bitwise_instr->operands[0].constantEquals(-4) &&
 856        bitwise_instr->operands[1].isOfType(op.regClass().type()))
 857       op.setTemp(bitwise_instr->operands[1].getTemp());
 858    else if (bitwise_instr->operands[1].constantEquals(-4) &&
 859             bitwise_instr->operands[0].isOfType(op.regClass().type()))
 860       op.setTemp(bitwise_instr->operands[0].getTemp());
 861 }
 862
 863 void
 864 smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 865 {
 866    /* skip &-4 before offset additions: load((a + 16) & -4, 0) */
 867    if (!instr->operands.empty())
 868       skip_smem_offset_align(ctx, &instr->smem());
 869
 870    /* propagate constants and combine additions */
 871    if (!instr->operands.empty() && instr->operands[1].isTemp()) {
 872       SMEM_instruction& smem = instr->smem();
 873       ssa_info info = ctx.info[instr->operands[1].tempId()];
 874
 875       Temp base;
 876       uint32_t offset;
 877       if (info.is_constant_or_literal(32) &&
 878           ((ctx.program->gfx_level == GFX6 && info.val <= 0x3FF) ||
 879            (ctx.program->gfx_level == GFX7 && info.val <= 0xFFFFFFFF) ||
 880            (ctx.program->gfx_level >= GFX8 && info.val <= 0xFFFFF))) {
 881          instr->operands[1] = Operand::c32(info.val);
 882       } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, true) &&
 883                  base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->gfx_level >= GFX9 &&
 884                  offset % 4u == 0) {
 885          bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
 886          if (soe) {
 887             if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&
 888                 ctx.info[smem.operands.back().tempId()].val == 0) {
 889                smem.operands[1] = Operand::c32(offset);
 890                smem.operands.back() = Operand(base);
 891             }
 892          } else {
 893             SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
 894                smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
 895             new_instr->operands[0] = smem.operands[0];
 896             new_instr->operands[1] = Operand::c32(offset);
 897             if (smem.definitions.empty())
 898                new_instr->operands[2] = smem.operands[2];
 899             new_instr->operands.back() = Operand(base);
 900             if (!smem.definitions.empty())
 901                new_instr->definitions[0] = smem.definitions[0];
 902             new_instr->sync = smem.sync;
 903             new_instr->glc = smem.glc;
 904             new_instr->dlc = smem.dlc;
 905             new_instr->nv = smem.nv;
 906             new_instr->disable_wqm = smem.disable_wqm;
 907             instr.reset(new_instr);
 908          }
 909       }
 910    }
 911
 912    /* skip &-4 after offset additions: load(a & -4, 16) */
 913    if (!instr->operands.empty())
 914       skip_smem_offset_align(ctx, &instr->smem());
 915 }
 916
 917 Operand
 918 get_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
 919 {
 920    if (bits == 64)
 921       return Operand::c32_or_c64(info.val, true);
 922    return Operand::get_const(ctx.program->gfx_level, info.val, bits / 8u);
 923 }
 924
 925 void
 926 propagate_constants_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned i)
 927 {
 928    if (!info.is_constant_or_literal(32))
 929       return;
 930
 931    assert(instr->operands[i].isTemp());
 932    unsigned bits = get_operand_size(instr, i);
 933    if (info.is_constant(bits)) {
 934       instr->operands[i] = get_constant_op(ctx, info, bits);
 935       return;
 936    }
 937
 938    /* The accumulation operand of dot product instructions ignores opsel. */
 939    bool cannot_use_opsel =
 940       (instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 ||
 941        instr->opcode == aco_opcode::v_dot4_i32_iu8 || instr->opcode == aco_opcode::v_dot4_u32_u8 ||
 942        instr->opcode == aco_opcode::v_dot2_u32_u16) &&
 943       i == 2;
 944    if (cannot_use_opsel)
 945       return;
 946
 947    /* try to fold inline constants */
 948    VALU_instruction* vop3p = &instr->valu();
 949    bool opsel_lo = vop3p->opsel_lo[i];
 950    bool opsel_hi = vop3p->opsel_hi[i];
 951
 952    Operand const_op[2];
 953    bool const_opsel[2] = {false, false};
 954    for (unsigned j = 0; j < 2; j++) {
 955       if ((unsigned)opsel_lo != j && (unsigned)opsel_hi != j)
 956          continue; /* this half is unused */
 957
 958       uint16_t val = info.val >> (j ? 16 : 0);
 959       Operand op = Operand::get_const(ctx.program->gfx_level, val, bits / 8u);
 960       if (bits == 32 && op.isLiteral()) /* try sign extension */
 961          op = Operand::get_const(ctx.program->gfx_level, val | 0xffff0000, 4);
 962       if (bits == 32 && op.isLiteral()) { /* try shifting left */
 963          op = Operand::get_const(ctx.program->gfx_level, val << 16, 4);
 964          const_opsel[j] = true;
 965       }
 966       if (op.isLiteral())
 967          return;
 968       const_op[j] = op;
 969    }
 970
 971    Operand const_lo = const_op[0];
 972    Operand const_hi = const_op[1];
 973    bool const_lo_opsel = const_opsel[0];
 974    bool const_hi_opsel = const_opsel[1];
 975
 976    if (opsel_lo == opsel_hi) {
 977       /* use the single 16bit value */
 978       instr->operands[i] = opsel_lo ? const_hi : const_lo;
 979
 980       /* opsel must point the same for both halves */
 981       opsel_lo = opsel_lo ? const_hi_opsel : const_lo_opsel;
 982       opsel_hi = opsel_lo;
 983    } else if (const_lo == const_hi) {
 984       /* both constants are the same */
 985       instr->operands[i] = const_lo;
 986
 987       /* opsel must point the same for both halves */
 988       opsel_lo = const_lo_opsel;
 989       opsel_hi = const_lo_opsel;
 990    } else if (const_lo.constantValue16(const_lo_opsel) ==
 991               const_hi.constantValue16(!const_hi_opsel)) {
 992       instr->operands[i] = const_hi;
 993
 994       /* redirect opsel selection */
 995       opsel_lo = opsel_lo ? const_hi_opsel : !const_hi_opsel;
 996       opsel_hi = opsel_hi ? const_hi_opsel : !const_hi_opsel;
 997    } else if (const_hi.constantValue16(const_hi_opsel) ==
 998               const_lo.constantValue16(!const_lo_opsel)) {
 999       instr->operands[i] = const_lo;
1000
1001       /* redirect opsel selection */
1002       opsel_lo = opsel_lo ? !const_lo_opsel : const_lo_opsel;
1003       opsel_hi = opsel_hi ? !const_lo_opsel : const_lo_opsel;
1004    } else if (bits == 16 && const_lo.constantValue() == (const_hi.constantValue() ^ (1 << 15))) {
1005       assert(const_lo_opsel == false && const_hi_opsel == false);
1006
1007       /* const_lo == -const_hi */
1008       if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
1009          return;
1010
1011       instr->operands[i] = Operand::c16(const_lo.constantValue() & 0x7FFF);
1012       bool neg_lo = const_lo.constantValue() & (1 << 15);
1013       vop3p->neg_lo[i] ^= opsel_lo ^ neg_lo;
1014       vop3p->neg_hi[i] ^= opsel_hi ^ neg_lo;
1015
1016       /* opsel must point to lo for both operands */
1017       opsel_lo = false;
1018       opsel_hi = false;
1019    }
1020
1021    vop3p->opsel_lo[i] = opsel_lo;
1022    vop3p->opsel_hi[i] = opsel_hi;
1023 }
1024
1025 bool
1026 fixed_to_exec(Operand op)
1027 {
1028    return op.isFixed() && op.physReg() == exec;
1029 }
1030
1031 SubdwordSel
1032 parse_extract(Instruction* instr)
1033 {
1034    if (instr->opcode == aco_opcode::p_extract) {
1035       unsigned size = instr->operands[2].constantValue() / 8;
1036       unsigned offset = instr->operands[1].constantValue() * size;
1037       bool sext = instr->operands[3].constantEquals(1);
1038       return SubdwordSel(size, offset, sext);
1039    } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
1040       return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1041    } else if (instr->opcode == aco_opcode::p_extract_vector) {
1042       unsigned size = instr->definitions[0].bytes();
1043       unsigned offset = instr->operands[1].constantValue() * size;
1044       if (size <= 2)
1045          return SubdwordSel(size, offset, false);
1046    } else if (instr->opcode == aco_opcode::p_split_vector) {
1047       assert(instr->operands[0].bytes() == 4 && instr->definitions[1].bytes() == 2);
1048       return SubdwordSel(2, 2, false);
1049    }
1050
1051    return SubdwordSel();
1052 }
1053
1054 SubdwordSel
1055 parse_insert(Instruction* instr)
1056 {
1057    if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
1058        instr->operands[1].constantEquals(0)) {
1059       return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
1060    } else if (instr->opcode == aco_opcode::p_insert) {
1061       unsigned size = instr->operands[2].constantValue() / 8;
1062       unsigned offset = instr->operands[1].constantValue() * size;
1063       return SubdwordSel(size, offset, false);
1064    } else {
1065       return SubdwordSel();
1066    }
1067 }
1068
1069 bool
1070 can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
1071 {
1072    Temp tmp = info.instr->operands[0].getTemp();
1073    SubdwordSel sel = parse_extract(info.instr);
1074
1075    if (!sel) {
1076       return false;
1077    } else if (sel.size() == 4) {
1078       return true;
1079    } else if ((instr->opcode == aco_opcode::v_cvt_f32_u32 ||
1080                instr->opcode == aco_opcode::v_cvt_f32_i32) &&
1081               sel.size() == 1 && !sel.sign_extend()) {
1082       return true;
1083    } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1084               sel.offset() == 0 &&
1085               ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1086                (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
1087       return true;
1088    } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1089               !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1090               (instr->operands[!idx].is16bit() ||
1091                instr->operands[!idx].constantValue() <= UINT16_MAX)) {
1092       return true;
1093    } else if (idx < 2 && can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1094               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
1095       if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
1096          return false;
1097       return true;
1098    } else if (instr->isVALU() && sel.size() == 2 && !instr->valu().opsel[idx] &&
1099               can_use_opsel(ctx.program->gfx_level, instr->opcode, idx)) {
1100       return true;
1101    } else if (instr->opcode == aco_opcode::p_extract) {
1102       SubdwordSel instrSel = parse_extract(instr.get());
1103
1104       /* the outer offset must be within extracted range */
1105       if (instrSel.offset() >= sel.size())
1106          return false;
1107
1108       /* don't remove the sign-extension when increasing the size further */
1109       if (instrSel.size() > sel.size() && !instrSel.sign_extend() && sel.sign_extend())
1110          return false;
1111
1112       return true;
1113    }
1114
1115    return false;
1116 }
1117
1118 /* Combine an p_extract (or p_insert, in some cases) instruction with instr.
1119  * instr(p_extract(...)) -> instr()
1120  */
1121 void
1122 apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
1123 {
1124    Temp tmp = info.instr->operands[0].getTemp();
1125    SubdwordSel sel = parse_extract(info.instr);
1126    assert(sel);
1127
1128    instr->operands[idx].set16bit(false);
1129    instr->operands[idx].set24bit(false);
1130
1131    ctx.info[tmp.id()].label &= ~label_insert;
1132
1133    if (sel.size() == 4) {
1134       /* full dword selection */
1135    } else if ((instr->opcode == aco_opcode::v_cvt_f32_u32 ||
1136                instr->opcode == aco_opcode::v_cvt_f32_i32) &&
1137               sel.size() == 1 && !sel.sign_extend()) {
1138       switch (sel.offset()) {
1139       case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
1140       case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
1141       case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
1142       case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
1143       }
1144    } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
1145               sel.offset() == 0 &&
1146               ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
1147                (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
1148       /* The undesirable upper bits are already shifted out. */
1149       return;
1150    } else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
1151               !instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&
1152               (instr->operands[!idx].is16bit() ||
1153                instr->operands[!idx].constantValue() <= UINT16_MAX)) {
1154       Instruction* mad =
1155          create_instruction<VALU_instruction>(aco_opcode::v_mad_u32_u16, Format::VOP3, 3, 1);
1156       mad->definitions[0] = instr->definitions[0];
1157       mad->operands[0] = instr->operands[0];
1158       mad->operands[1] = instr->operands[1];
1159       mad->operands[2] = Operand::zero();
1160       mad->valu().opsel[idx] = sel.offset();
1161       mad->pass_flags = instr->pass_flags;
1162       instr.reset(mad);
1163    } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
1164               (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
1165       convert_to_SDWA(ctx.program->gfx_level, instr);
1166       instr->sdwa().sel[idx] = sel;
1167    } else if (instr->isVALU()) {
1168       if (sel.offset()) {
1169          instr->valu().opsel[idx] = true;
1170
1171          /* VOP12C cannot use opsel with SGPRs. */
1172          if (!instr->isVOP3() && !instr->isVINTERP_INREG() &&
1173              !info.instr->operands[0].isOfType(RegType::vgpr))
1174             instr->format = asVOP3(instr->format);
1175       }
1176    } else if (instr->opcode == aco_opcode::p_extract) {
1177       SubdwordSel instrSel = parse_extract(instr.get());
1178
1179       unsigned size = std::min(sel.size(), instrSel.size());
1180       unsigned offset = sel.offset() + instrSel.offset();
1181       unsigned sign_extend =
1182          instrSel.sign_extend() && (sel.sign_extend() || instrSel.size() <= sel.size());
1183
1184       instr->operands[1] = Operand::c32(offset / size);
1185       instr->operands[2] = Operand::c32(size * 8u);
1186       instr->operands[3] = Operand::c32(sign_extend);
1187       return;
1188    }
1189
1190    /* These are the only labels worth keeping at the moment. */
1191    for (Definition& def : instr->definitions) {
1192       ctx.info[def.tempId()].label &=
1193          (label_mul | label_minmax | label_usedef | label_vopc | label_f2f32 | instr_mod_labels);
1194       if (ctx.info[def.tempId()].label & instr_usedef_labels)
1195          ctx.info[def.tempId()].instr = instr.get();
1196    }
1197 }
1198
1199 void
1200 check_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1201 {
1202    for (unsigned i = 0; i < instr->operands.size(); i++) {
1203       Operand op = instr->operands[i];
1204       if (!op.isTemp())
1205          continue;
1206       ssa_info& info = ctx.info[op.tempId()];
1207       if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
1208                                 op.getTemp().type() == RegType::sgpr)) {
1209          if (!can_apply_extract(ctx, instr, i, info))
1210             info.label &= ~label_extract;
1211       }
1212    }
1213 }
1214
1215 bool
1216 does_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
1217 {
1218    switch (op) {
1219    case aco_opcode::v_min_f32:
1220    case aco_opcode::v_max_f32:
1221    case aco_opcode::v_med3_f32:
1222    case aco_opcode::v_min3_f32:
1223    case aco_opcode::v_max3_f32:
1224    case aco_opcode::v_min_f16:
1225    case aco_opcode::v_max_f16: return ctx.program->gfx_level > GFX8;
1226    case aco_opcode::v_cndmask_b32:
1227    case aco_opcode::v_cndmask_b16:
1228    case aco_opcode::v_mov_b32:
1229    case aco_opcode::v_mov_b16: return false;
1230    default: return true;
1231    }
1232 }
1233
1234 bool
1235 can_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp, unsigned idx)
1236 {
1237    float_mode* fp = &ctx.fp_mode;
1238    if (ctx.info[tmp.id()].is_canonicalized() ||
1239        (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
1240       return true;
1241
1242    aco_opcode op = instr->opcode;
1243    return can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, idx) &&
1244           does_fp_op_flush_denorms(ctx, op);
1245 }
1246
1247 bool
1248 can_eliminate_and_exec(opt_ctx& ctx, Temp tmp, unsigned pass_flags)
1249 {
1250    if (ctx.info[tmp.id()].is_vopc()) {
1251       Instruction* vopc_instr = ctx.info[tmp.id()].instr;
1252       /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
1253        * already produces the same result */
1254       return vopc_instr->pass_flags == pass_flags;
1255    }
1256    if (ctx.info[tmp.id()].is_bitwise()) {
1257       Instruction* instr = ctx.info[tmp.id()].instr;
1258       if (instr->operands.size() != 2 || instr->pass_flags != pass_flags)
1259          return false;
1260       if (!(instr->operands[0].isTemp() && instr->operands[1].isTemp()))
1261          return false;
1262       if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
1263          return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) ||
1264                 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
1265       } else {
1266          return can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), pass_flags) &&
1267                 can_eliminate_and_exec(ctx, instr->operands[1].getTemp(), pass_flags);
1268       }
1269    }
1270    return false;
1271 }
1272
1273 bool
1274 is_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info, unsigned idx)
1275 {
1276    return info.is_temp() ||
1277           (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp, idx));
1278 }
1279
1280 bool
1281 is_op_canonicalized(opt_ctx& ctx, Operand op)
1282 {
1283    float_mode* fp = &ctx.fp_mode;
1284    if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
1285        (op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
1286       return true;
1287
1288    if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
1289       uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
1290       if (op.bytes() == 2)
1291          return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
1292       else if (op.bytes() == 4)
1293          return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
1294    }
1295    return false;
1296 }
1297
1298 bool
1299 is_scratch_offset_valid(opt_ctx& ctx, Instruction* instr, int64_t offset0, int64_t offset1)
1300 {
1301    bool negative_unaligned_scratch_offset_bug = ctx.program->gfx_level == GFX10;
1302    int32_t min = ctx.program->dev.scratch_global_offset_min;
1303    int32_t max = ctx.program->dev.scratch_global_offset_max;
1304
1305    int64_t offset = offset0 + offset1;
1306
1307    bool has_vgpr_offset = instr && !instr->operands[0].isUndefined();
1308    if (negative_unaligned_scratch_offset_bug && has_vgpr_offset && offset < 0 && offset % 4)
1309       return false;
1310
1311    return offset >= min && offset <= max;
1312 }
1313
1314 bool
1315 detect_clamp(Instruction* instr, unsigned* clamped_idx)
1316 {
1317    VALU_instruction& valu = instr->valu();
1318    if (valu.omod != 0 || valu.opsel != 0)
1319       return false;
1320
1321    unsigned idx = 0;
1322    bool found_zero = false, found_one = false;
1323    bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
1324    for (unsigned i = 0; i < 3; i++) {
1325       if (!valu.neg[i] && instr->operands[i].constantEquals(0))
1326          found_zero = true;
1327       else if (!valu.neg[i] &&
1328                instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
1329          found_one = true;
1330       else
1331          idx = i;
1332    }
1333    if (found_zero && found_one && instr->operands[idx].isTemp()) {
1334       *clamped_idx = idx;
1335       return true;
1336    } else {
1337       return false;
1338    }
1339 }
1340
1341 void
1342 label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
1343 {
1344    if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
1345       ASSERTED bool all_const = false;
1346       for (Operand& op : instr->operands)
1347          all_const =
1348             all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
1349       perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
1350
1351       ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
1352                               instr->opcode == aco_opcode::s_mov_b64 ||
1353                               instr->opcode == aco_opcode::v_mov_b32;
1354       perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
1355                instr.get());
1356    }
1357
1358    if (instr->isSMEM())
1359       smem_combine(ctx, instr);
1360
1361    for (unsigned i = 0; i < instr->operands.size(); i++) {
1362       if (!instr->operands[i].isTemp())
1363          continue;
1364
1365       ssa_info info = ctx.info[instr->operands[i].tempId()];
1366       /* propagate undef */
1367       if (info.is_undefined() && is_phi(instr))
1368          instr->operands[i] = Operand(instr->operands[i].regClass());
1369       /* propagate reg->reg of same type */
1370       while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
1371          instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
1372          info = ctx.info[info.temp.id()];
1373       }
1374
1375       /* PSEUDO: propagate temporaries */
1376       if (instr->isPseudo()) {
1377          while (info.is_temp()) {
1378             pseudo_propagate_temp(ctx, instr, info.temp, i);
1379             info = ctx.info[info.temp.id()];
1380          }
1381       }
1382
1383       /* SALU / PSEUDO: propagate inline constants */
1384       if (instr->isSALU() || instr->isPseudo()) {
1385          unsigned bits = get_operand_size(instr, i);
1386          if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
1387              alu_can_accept_constant(instr, i)) {
1388             instr->operands[i] = get_constant_op(ctx, info, bits);
1389             continue;
1390          }
1391       }
1392
1393       /* VALU: propagate neg, abs & inline constants */
1394       else if (instr->isVALU()) {
1395          if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::vgpr &&
1396              valu_can_accept_vgpr(instr, i)) {
1397             instr->operands[i].setTemp(info.temp);
1398             info = ctx.info[info.temp.id()];
1399          }
1400          /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
1401          if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
1402              instr->operands.size() == 1) {
1403             instr->format = withoutDPP(instr->format);
1404             instr->operands[i].setTemp(info.temp);
1405             info = ctx.info[info.temp.id()];
1406          }
1407
1408          /* for instructions other than v_cndmask_b32, the size of the instruction should match the
1409           * operand size */
1410          unsigned can_use_mod =
1411             instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
1412          can_use_mod =
1413             can_use_mod && can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i);
1414
1415          if (instr->isSDWA())
1416             can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4;
1417          else
1418             can_use_mod = can_use_mod && (instr->isDPP16() || can_use_VOP3(ctx, instr));
1419
1420          unsigned bits = get_operand_size(instr, i);
1421          bool mod_bitsize_compat = instr->operands[i].bytes() * 8 == bits;
1422
1423          if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32 && mod_bitsize_compat) {
1424             instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
1425             instr->operands[i].setTemp(info.temp);
1426          } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16 && mod_bitsize_compat) {
1427             instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
1428             instr->operands[i].setTemp(info.temp);
1429          } else if (info.is_neg() && can_use_mod && mod_bitsize_compat &&
1430                     can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1431             if (!instr->isDPP() && !instr->isSDWA())
1432                instr->format = asVOP3(instr->format);
1433             instr->operands[i].setTemp(info.temp);
1434             if (!instr->valu().abs[i])
1435                instr->valu().neg[i] = true;
1436          }
1437          if (info.is_abs() && can_use_mod && mod_bitsize_compat &&
1438              can_eliminate_fcanonicalize(ctx, instr, info.temp, i)) {
1439             if (!instr->isDPP() && !instr->isSDWA())
1440                instr->format = asVOP3(instr->format);
1441             instr->operands[i] = Operand(info.temp);
1442             instr->valu().abs[i] = true;
1443             continue;
1444          }
1445
1446          if (instr->isVOP3P()) {
1447             propagate_constants_vop3p(ctx, instr, info, i);
1448             continue;
1449          }
1450
1451          if (info.is_constant(bits) && alu_can_accept_constant(instr, i) &&
1452              (!instr->isSDWA() || ctx.program->gfx_level >= GFX9) && (!instr->isDPP() || i != 1)) {
1453             Operand op = get_constant_op(ctx, info, bits);
1454             perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
1455                      "v_cndmask_b32 with a constant selector", instr.get());
1456             if (i == 0 || instr->isSDWA() || instr->opcode == aco_opcode::v_readlane_b32 ||
1457                 instr->opcode == aco_opcode::v_writelane_b32) {
1458                instr->format = withoutDPP(instr->format);
1459                instr->operands[i] = op;
1460                continue;
1461             } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
1462                instr->operands[i] = op;
1463                instr->valu().swapOperands(0, i);
1464                continue;
1465             } else if (can_use_VOP3(ctx, instr)) {
1466                instr->format = asVOP3(instr->format);
1467                instr->operands[i] = op;
1468                continue;
1469             }
1470          }
1471       }
1472
1473       /* MUBUF: propagate constants and combine additions */
1474       else if (instr->isMUBUF()) {
1475          MUBUF_instruction& mubuf = instr->mubuf();
1476          Temp base;
1477          uint32_t offset;
1478          while (info.is_temp())
1479             info = ctx.info[info.temp.id()];
1480
1481          /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
1482           * overflow for scratch accesses works only on GFX9+ and saddr overflow
1483           * never works. Since swizzling is the only thing that separates
1484           * scratch accesses and other accesses and swizzling changing how
1485           * addressing works significantly, this probably applies to swizzled
1486           * MUBUF accesses. */
1487          bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->gfx_level < GFX9;
1488
1489          if (mubuf.offen && mubuf.idxen && i == 1 && info.is_vec() &&
1490              info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1491              info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1492              mubuf.offset + info.instr->operands[1].constantValue() < 4096) {
1493             instr->operands[1] = info.instr->operands[0];
1494             mubuf.offset += info.instr->operands[1].constantValue();
1495             mubuf.offen = false;
1496             continue;
1497          } else if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
1498                     mubuf.offset + info.val < 4096) {
1499             assert(!mubuf.idxen);
1500             instr->operands[1] = Operand(v1);
1501             mubuf.offset += info.val;
1502             mubuf.offen = false;
1503             continue;
1504          } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
1505             instr->operands[2] = Operand::c32(0);
1506             mubuf.offset += info.val;
1507             continue;
1508          } else if (mubuf.offen && i == 1 &&
1509                     parse_base_offset(ctx, instr.get(), i, &base, &offset,
1510                                       vaddr_prevent_overflow) &&
1511                     base.regClass() == v1 && mubuf.offset + offset < 4096) {
1512             assert(!mubuf.idxen);
1513             instr->operands[1].setTemp(base);
1514             mubuf.offset += offset;
1515             continue;
1516          } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1517                     base.regClass() == s1 && mubuf.offset + offset < 4096 && !mubuf.swizzled) {
1518             instr->operands[i].setTemp(base);
1519             mubuf.offset += offset;
1520             continue;
1521          }
1522       }
1523
1524       else if (instr->isMTBUF()) {
1525          MTBUF_instruction& mtbuf = instr->mtbuf();
1526          while (info.is_temp())
1527             info = ctx.info[info.temp.id()];
1528
1529          if (mtbuf.offen && mtbuf.idxen && i == 1 && info.is_vec() &&
1530              info.instr->operands.size() == 2 && info.instr->operands[0].isTemp() &&
1531              info.instr->operands[0].regClass() == v1 && info.instr->operands[1].isConstant() &&
1532              mtbuf.offset + info.instr->operands[1].constantValue() < 4096) {
1533             instr->operands[1] = info.instr->operands[0];
1534             mtbuf.offset += info.instr->operands[1].constantValue();
1535             mtbuf.offen = false;
1536             continue;
1537          }
1538       }
1539
1540       /* SCRATCH: propagate constants and combine additions */
1541       else if (instr->isScratch()) {
1542          FLAT_instruction& scratch = instr->scratch();
1543          Temp base;
1544          uint32_t offset;
1545          while (info.is_temp())
1546             info = ctx.info[info.temp.id()];
1547
1548          /* The hardware probably does: 'scratch_base + u2u64(saddr) + i2i64(offset)'. This means
1549           * we can't combine the addition if the unsigned addition overflows and offset is
1550           * positive. In theory, there is also issues if
1551           * 'ilt(offset, 0) && ige(saddr, 0) && ilt(saddr + offset, 0)', but that just
1552           * replaces an already out-of-bounds access with a larger one since 'saddr + offset'
1553           * would be larger than INT32_MAX.
1554           */
1555          if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) &&
1556              base.regClass() == instr->operands[i].regClass() &&
1557              is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1558             instr->operands[i].setTemp(base);
1559             scratch.offset += (int32_t)offset;
1560             continue;
1561          } else if (i <= 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1562                     base.regClass() == instr->operands[i].regClass() && (int32_t)offset < 0 &&
1563                     is_scratch_offset_valid(ctx, instr.get(), scratch.offset, (int32_t)offset)) {
1564             instr->operands[i].setTemp(base);
1565             scratch.offset += (int32_t)offset;
1566             continue;
1567          } else if (i <= 1 && info.is_constant_or_literal(32) &&
1568                     ctx.program->gfx_level >= GFX10_3 &&
1569                     is_scratch_offset_valid(ctx, NULL, scratch.offset, (int32_t)info.val)) {
1570             /* GFX10.3+ can disable both SADDR and ADDR. */
1571             instr->operands[i] = Operand(instr->operands[i].regClass());
1572             scratch.offset += (int32_t)info.val;
1573             continue;
1574          }
1575       }
1576
1577       /* DS: combine additions */
1578       else if (instr->isDS()) {
1579
1580          DS_instruction& ds = instr->ds();
1581          Temp base;
1582          uint32_t offset;
1583          bool has_usable_ds_offset = ctx.program->gfx_level >= GFX7;
1584          if (has_usable_ds_offset && i == 0 &&
1585              parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
1586              base.regClass() == instr->operands[i].regClass() &&
1587              instr->opcode != aco_opcode::ds_swizzle_b32) {
1588             if (instr->opcode == aco_opcode::ds_write2_b32 ||
1589                 instr->opcode == aco_opcode::ds_read2_b32 ||
1590                 instr->opcode == aco_opcode::ds_write2_b64 ||
1591                 instr->opcode == aco_opcode::ds_read2_b64 ||
1592                 instr->opcode == aco_opcode::ds_write2st64_b32 ||
1593                 instr->opcode == aco_opcode::ds_read2st64_b32 ||
1594                 instr->opcode == aco_opcode::ds_write2st64_b64 ||
1595                 instr->opcode == aco_opcode::ds_read2st64_b64) {
1596                bool is64bit = instr->opcode == aco_opcode::ds_write2_b64 ||
1597                               instr->opcode == aco_opcode::ds_read2_b64 ||
1598                               instr->opcode == aco_opcode::ds_write2st64_b64 ||
1599                               instr->opcode == aco_opcode::ds_read2st64_b64;
1600                bool st64 = instr->opcode == aco_opcode::ds_write2st64_b32 ||
1601                            instr->opcode == aco_opcode::ds_read2st64_b32 ||
1602                            instr->opcode == aco_opcode::ds_write2st64_b64 ||
1603                            instr->opcode == aco_opcode::ds_read2st64_b64;
1604                unsigned shifts = (is64bit ? 3 : 2) + (st64 ? 6 : 0);
1605                unsigned mask = BITFIELD_MASK(shifts);
1606
1607                if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
1608                    ds.offset1 + (offset >> shifts) <= 255) {
1609                   instr->operands[i].setTemp(base);
1610                   ds.offset0 += offset >> shifts;
1611                   ds.offset1 += offset >> shifts;
1612                }
1613             } else {
1614                if (ds.offset0 + offset <= 65535) {
1615                   instr->operands[i].setTemp(base);
1616                   ds.offset0 += offset;
1617                }
1618             }
1619          }
1620       }
1621
1622       else if (instr->isBranch()) {
1623          if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
1624             /* Flip the branch instruction to get rid of the scc_invert instruction */
1625             instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
1626                                                                      : aco_opcode::p_cbranch_z;
1627             instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
1628          }
1629       }
1630    }
1631
1632    /* if this instruction doesn't define anything, return */
1633    if (instr->definitions.empty()) {
1634       check_sdwa_extract(ctx, instr);
1635       return;
1636    }
1637
1638    if (instr->isVALU() || instr->isVINTRP()) {
1639       if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
1640           instr->opcode == aco_opcode::v_cndmask_b32) {
1641          bool canonicalized = true;
1642          if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
1643             unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
1644             for (unsigned i = 0; canonicalized && (i < ops); i++)
1645                canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
1646          }
1647          if (canonicalized)
1648             ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1649       }
1650
1651       if (instr->isVOPC()) {
1652          ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
1653          check_sdwa_extract(ctx, instr);
1654          return;
1655       }
1656       if (instr->isVOP3P()) {
1657          ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
1658          return;
1659       }
1660    }
1661
1662    switch (instr->opcode) {
1663    case aco_opcode::p_create_vector: {
1664       bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
1665                        instr->operands[0].regClass() == instr->definitions[0].regClass();
1666       if (copy_prop) {
1667          ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1668          break;
1669       }
1670
1671       /* expand vector operands */
1672       std::vector<Operand> ops;
1673       unsigned offset = 0;
1674       for (const Operand& op : instr->operands) {
1675          /* ensure that any expanded operands are properly aligned */
1676          bool aligned = offset % 4 == 0 || op.bytes() < 4;
1677          offset += op.bytes();
1678          if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
1679             Instruction* vec = ctx.info[op.tempId()].instr;
1680             for (const Operand& vec_op : vec->operands)
1681                ops.emplace_back(vec_op);
1682          } else {
1683             ops.emplace_back(op);
1684          }
1685       }
1686
1687       /* combine expanded operands to new vector */
1688       if (ops.size() != instr->operands.size()) {
1689          assert(ops.size() > instr->operands.size());
1690          Definition def = instr->definitions[0];
1691          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
1692                                                             Format::PSEUDO, ops.size(), 1));
1693          for (unsigned i = 0; i < ops.size(); i++) {
1694             if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
1695                 ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
1696                ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
1697             instr->operands[i] = ops[i];
1698          }
1699          instr->definitions[0] = def;
1700       } else {
1701          for (unsigned i = 0; i < ops.size(); i++) {
1702             assert(instr->operands[i] == ops[i]);
1703          }
1704       }
1705       ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1706
1707       if (instr->operands.size() == 2) {
1708          /* check if this is created from split_vector */
1709          if (instr->operands[1].isTemp() && ctx.info[instr->operands[1].tempId()].is_split()) {
1710             Instruction* split = ctx.info[instr->operands[1].tempId()].instr;
1711             if (instr->operands[0].isTemp() &&
1712                 instr->operands[0].getTemp() == split->definitions[0].getTemp())
1713                ctx.info[instr->definitions[0].tempId()].set_temp(split->operands[0].getTemp());
1714          }
1715       }
1716       break;
1717    }
1718    case aco_opcode::p_split_vector: {
1719       ssa_info& info = ctx.info[instr->operands[0].tempId()];
1720
1721       if (info.is_constant_or_literal(32)) {
1722          uint64_t val = info.val;
1723          for (Definition def : instr->definitions) {
1724             uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
1725             ctx.info[def.tempId()].set_constant(ctx.program->gfx_level, val & mask);
1726             val >>= def.bytes() * 8u;
1727          }
1728          break;
1729       } else if (!info.is_vec()) {
1730          if (instr->definitions.size() == 2 && instr->operands[0].isTemp() &&
1731              instr->definitions[0].bytes() == instr->definitions[1].bytes()) {
1732             ctx.info[instr->definitions[1].tempId()].set_split(instr.get());
1733             if (instr->operands[0].bytes() == 4) {
1734                /* D16 subdword split */
1735                ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1736                ctx.info[instr->definitions[1].tempId()].set_extract(instr.get());
1737             }
1738          }
1739          break;
1740       }
1741
1742       Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1743       unsigned split_offset = 0;
1744       unsigned vec_offset = 0;
1745       unsigned vec_index = 0;
1746       for (unsigned i = 0; i < instr->definitions.size();
1747            split_offset += instr->definitions[i++].bytes()) {
1748          while (vec_offset < split_offset && vec_index < vec->operands.size())
1749             vec_offset += vec->operands[vec_index++].bytes();
1750
1751          if (vec_offset != split_offset ||
1752              vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
1753             continue;
1754
1755          Operand vec_op = vec->operands[vec_index];
1756          if (vec_op.isConstant()) {
1757             ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->gfx_level,
1758                                                                   vec_op.constantValue64());
1759          } else if (vec_op.isUndefined()) {
1760             ctx.info[instr->definitions[i].tempId()].set_undefined();
1761          } else {
1762             assert(vec_op.isTemp());
1763             ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
1764          }
1765       }
1766       break;
1767    }
1768    case aco_opcode::p_extract_vector: { /* mov */
1769       ssa_info& info = ctx.info[instr->operands[0].tempId()];
1770       const unsigned index = instr->operands[1].constantValue();
1771       const unsigned dst_offset = index * instr->definitions[0].bytes();
1772
1773       if (info.is_vec()) {
1774          /* check if we index directly into a vector element */
1775          Instruction* vec = info.instr;
1776          unsigned offset = 0;
1777
1778          for (const Operand& op : vec->operands) {
1779             if (offset < dst_offset) {
1780                offset += op.bytes();
1781                continue;
1782             } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
1783                break;
1784             }
1785             instr->operands[0] = op;
1786             break;
1787          }
1788       } else if (info.is_constant_or_literal(32)) {
1789          /* propagate constants */
1790          uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
1791          uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
1792          instr->operands[0] =
1793             Operand::get_const(ctx.program->gfx_level, val, instr->definitions[0].bytes());
1794          ;
1795       }
1796
1797       if (instr->operands[0].bytes() != instr->definitions[0].bytes()) {
1798          if (instr->operands[0].size() != 1)
1799             break;
1800
1801          if (index == 0)
1802             ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1803          else
1804             ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
1805          break;
1806       }
1807
1808       /* convert this extract into a copy instruction */
1809       instr->opcode = aco_opcode::p_parallelcopy;
1810       instr->operands.pop_back();
1811       FALLTHROUGH;
1812    }
1813    case aco_opcode::p_parallelcopy: /* propagate */
1814       if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
1815           instr->operands[0].regClass() != instr->definitions[0].regClass()) {
1816          /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
1817           * duplicate the vector instead.
1818           */
1819          Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
1820          aco_ptr<Instruction> old_copy = std::move(instr);
1821
1822          instr.reset(create_instruction<Pseudo_instruction>(
1823             aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
1824          instr->definitions[0] = old_copy->definitions[0];
1825          std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
1826          for (unsigned i = 0; i < vec->operands.size(); i++) {
1827             Operand& op = instr->operands[i];
1828             if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
1829                 ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
1830                op.setTemp(ctx.info[op.tempId()].temp);
1831          }
1832          ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
1833          break;
1834       }
1835       FALLTHROUGH;
1836    case aco_opcode::p_as_uniform:
1837       if (instr->definitions[0].isFixed()) {
1838          /* don't copy-propagate copies into fixed registers */
1839       } else if (instr->operands[0].isConstant()) {
1840          ctx.info[instr->definitions[0].tempId()].set_constant(
1841             ctx.program->gfx_level, instr->operands[0].constantValue64());
1842       } else if (instr->operands[0].isTemp()) {
1843          ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1844          if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
1845             ctx.info[instr->definitions[0].tempId()].set_canonicalized();
1846       } else {
1847          assert(instr->operands[0].isFixed());
1848       }
1849       break;
1850    case aco_opcode::v_mov_b32:
1851       if (instr->isDPP16()) {
1852          /* anything else doesn't make sense in SSA */
1853          assert(instr->dpp16().row_mask == 0xf && instr->dpp16().bank_mask == 0xf);
1854          ctx.info[instr->definitions[0].tempId()].set_dpp16(instr.get());
1855       } else if (instr->isDPP8()) {
1856          ctx.info[instr->definitions[0].tempId()].set_dpp8(instr.get());
1857       }
1858       break;
1859    case aco_opcode::p_is_helper:
1860       if (!ctx.program->needs_wqm)
1861          ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1862       break;
1863    case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break;
1864    case aco_opcode::v_mul_f16:
1865    case aco_opcode::v_mul_f32:
1866    case aco_opcode::v_mul_legacy_f32: { /* omod */
1867       ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
1868
1869       /* TODO: try to move the negate/abs modifier to the consumer instead */
1870       bool uses_mods = instr->usesModifiers();
1871       bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
1872
1873       for (unsigned i = 0; i < 2; i++) {
1874          if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
1875             if (!instr->isDPP() && !instr->isSDWA() && !instr->valu().opsel &&
1876                 (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) ||   /* 1.0 */
1877                  instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
1878                bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
1879
1880                VALU_instruction* vop3 = instr->isVOP3() ? &instr->valu() : NULL;
1881                if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
1882                   continue;
1883
1884                bool abs = vop3 && vop3->abs[i];
1885                bool neg = neg1 ^ (vop3 && vop3->neg[i]);
1886
1887                Temp other = instr->operands[i].getTemp();
1888                if (abs && neg && other.type() == RegType::vgpr)
1889                   ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
1890                else if (abs && !neg && other.type() == RegType::vgpr)
1891                   ctx.info[instr->definitions[0].tempId()].set_abs(other);
1892                else if (!abs && neg && other.type() == RegType::vgpr)
1893                   ctx.info[instr->definitions[0].tempId()].set_neg(other);
1894                else if (!abs && !neg)
1895                   ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
1896             } else if (uses_mods || ((fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
1897                                            : ctx.fp_mode.preserve_signed_zero_inf_nan32) &&
1898                                      instr->opcode != aco_opcode::v_mul_legacy_f32)) {
1899                continue; /* omod uses a legacy multiplication. */
1900             } else if (instr->operands[!i].constantValue() == 0u) { /* 0.0 */
1901                ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->gfx_level, 0u);
1902             } else if ((fp16 ? ctx.fp_mode.denorm16_64 : ctx.fp_mode.denorm32) != fp_denorm_flush) {
1903                /* omod has no effect if denormals are enabled. */
1904                continue;
1905             } else if (instr->operands[!i].constantValue() ==
1906                        (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
1907                ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
1908             } else if (instr->operands[!i].constantValue() ==
1909                        (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
1910                ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
1911             } else if (instr->operands[!i].constantValue() ==
1912                        (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
1913                ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
1914             } else {
1915                continue;
1916             }
1917             break;
1918          }
1919       }
1920       break;
1921    }
1922    case aco_opcode::v_mul_lo_u16:
1923    case aco_opcode::v_mul_lo_u16_e64:
1924    case aco_opcode::v_mul_u32_u24:
1925       ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
1926       break;
1927    case aco_opcode::v_med3_f16:
1928    case aco_opcode::v_med3_f32: { /* clamp */
1929       unsigned idx;
1930       if (detect_clamp(instr.get(), &idx) && !instr->valu().abs && !instr->valu().neg)
1931          ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
1932       break;
1933    }
1934    case aco_opcode::v_cndmask_b32:
1935       if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
1936          ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
1937       else if (instr->operands[0].constantEquals(0) &&
1938                instr->operands[1].constantEquals(0x3f800000u))
1939          ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
1940       else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
1941          ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
1942
1943       break;
1944    case aco_opcode::v_cmp_lg_u32:
1945       if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
1946           instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
1947           ctx.info[instr->operands[1].tempId()].is_vcc())
1948          ctx.info[instr->definitions[0].tempId()].set_temp(
1949             ctx.info[instr->operands[1].tempId()].temp);
1950       break;
1951    case aco_opcode::p_linear_phi: {
1952       /* lower_bool_phis() can create phis like this */
1953       bool all_same_temp = instr->operands[0].isTemp();
1954       /* this check is needed when moving uniform loop counters out of a divergent loop */
1955       if (all_same_temp)
1956          all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
1957       for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
1958          if (!instr->operands[i].isTemp() ||
1959              instr->operands[i].tempId() != instr->operands[0].tempId())
1960             all_same_temp = false;
1961       }
1962       if (all_same_temp) {
1963          ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
1964       } else {
1965          bool all_undef = instr->operands[0].isUndefined();
1966          for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
1967             if (!instr->operands[i].isUndefined())
1968                all_undef = false;
1969          }
1970          if (all_undef)
1971             ctx.info[instr->definitions[0].tempId()].set_undefined();
1972       }
1973       break;
1974    }
1975    case aco_opcode::v_add_u32:
1976    case aco_opcode::v_add_co_u32:
1977    case aco_opcode::v_add_co_u32_e64:
1978    case aco_opcode::s_add_i32:
1979    case aco_opcode::s_add_u32:
1980    case aco_opcode::v_subbrev_co_u32:
1981    case aco_opcode::v_sub_u32:
1982    case aco_opcode::v_sub_i32:
1983    case aco_opcode::v_sub_co_u32:
1984    case aco_opcode::v_sub_co_u32_e64:
1985    case aco_opcode::s_sub_u32:
1986    case aco_opcode::s_sub_i32:
1987    case aco_opcode::v_subrev_u32:
1988    case aco_opcode::v_subrev_co_u32:
1989    case aco_opcode::v_subrev_co_u32_e64:
1990       ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
1991       break;
1992    case aco_opcode::s_not_b32:
1993    case aco_opcode::s_not_b64:
1994       if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
1995          ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
1996          ctx.info[instr->definitions[1].tempId()].set_scc_invert(
1997             ctx.info[instr->operands[0].tempId()].temp);
1998       } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
1999          ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
2000          ctx.info[instr->definitions[1].tempId()].set_scc_invert(
2001             ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2002       }
2003       ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2004       break;
2005    case aco_opcode::s_and_b32:
2006    case aco_opcode::s_and_b64:
2007       if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
2008          if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
2009             /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
2010              * uniform bool into divergent */
2011             ctx.info[instr->definitions[1].tempId()].set_temp(
2012                ctx.info[instr->operands[0].tempId()].temp);
2013             ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
2014                ctx.info[instr->operands[0].tempId()].temp);
2015             break;
2016          } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
2017             /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
2018              * already produces the same SCC */
2019             ctx.info[instr->definitions[1].tempId()].set_temp(
2020                ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2021             ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
2022                ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
2023             break;
2024          } else if ((ctx.program->stage.num_sw_stages() > 1 ||
2025                      ctx.program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) &&
2026                     instr->pass_flags == 1) {
2027             /* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so
2028              * s_and is unnecessary. */
2029             ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
2030             break;
2031          }
2032       }
2033       FALLTHROUGH;
2034    case aco_opcode::s_or_b32:
2035    case aco_opcode::s_or_b64:
2036    case aco_opcode::s_xor_b32:
2037    case aco_opcode::s_xor_b64:
2038       if (std::all_of(instr->operands.begin(), instr->operands.end(),
2039                       [&ctx](const Operand& op)
2040                       {
2041                          return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
2042                                                 ctx.info[op.tempId()].is_uniform_bitwise());
2043                       })) {
2044          ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
2045       }
2046       ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2047       break;
2048    case aco_opcode::s_lshl_b32:
2049    case aco_opcode::v_or_b32:
2050    case aco_opcode::v_lshlrev_b32:
2051    case aco_opcode::v_bcnt_u32_b32:
2052    case aco_opcode::v_and_b32:
2053    case aco_opcode::v_xor_b32:
2054    case aco_opcode::v_not_b32:
2055       ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2056       break;
2057    case aco_opcode::v_min_f32:
2058    case aco_opcode::v_min_f16:
2059    case aco_opcode::v_min_u32:
2060    case aco_opcode::v_min_i32:
2061    case aco_opcode::v_min_u16:
2062    case aco_opcode::v_min_i16:
2063    case aco_opcode::v_min_u16_e64:
2064    case aco_opcode::v_min_i16_e64:
2065    case aco_opcode::v_max_f32:
2066    case aco_opcode::v_max_f16:
2067    case aco_opcode::v_max_u32:
2068    case aco_opcode::v_max_i32:
2069    case aco_opcode::v_max_u16:
2070    case aco_opcode::v_max_i16:
2071    case aco_opcode::v_max_u16_e64:
2072    case aco_opcode::v_max_i16_e64:
2073       ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
2074       break;
2075    case aco_opcode::s_cselect_b64:
2076    case aco_opcode::s_cselect_b32:
2077       if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
2078          /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
2079          ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
2080       }
2081       if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
2082          /* Flip the operands to get rid of the scc_invert instruction */
2083          std::swap(instr->operands[0], instr->operands[1]);
2084          instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
2085       }
2086       break;
2087    case aco_opcode::s_mul_i32:
2088       /* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
2089        * This pattern is created from a uniform nir_op_b2f. */
2090       if (instr->operands[0].constantEquals(0x3f800000u))
2091          ctx.info[instr->definitions[0].tempId()].set_canonicalized();
2092       break;
2093    case aco_opcode::p_extract: {
2094       if (instr->definitions[0].bytes() == 4) {
2095          ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2096          if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()))
2097             ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2098       }
2099       break;
2100    }
2101    case aco_opcode::p_insert: {
2102       if (instr->operands[0].bytes() == 4) {
2103          if (instr->operands[0].regClass() == v1)
2104             ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
2105          if (parse_extract(instr.get()))
2106             ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
2107          ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
2108       }
2109       break;
2110    }
2111    case aco_opcode::ds_read_u8:
2112    case aco_opcode::ds_read_u8_d16:
2113    case aco_opcode::ds_read_u16:
2114    case aco_opcode::ds_read_u16_d16: {
2115       ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2116       break;
2117    }
2118    case aco_opcode::v_mbcnt_lo_u32_b32: {
2119       if (instr->operands[0].constantEquals(-1) && instr->operands[1].constantEquals(0)) {
2120          if (ctx.program->wave_size == 32)
2121             ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2122          else
2123             ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
2124       }
2125       break;
2126    }
2127    case aco_opcode::v_mbcnt_hi_u32_b32:
2128    case aco_opcode::v_mbcnt_hi_u32_b32_e64: {
2129       if (instr->operands[0].constantEquals(-1) && instr->operands[1].isTemp() &&
2130           ctx.info[instr->operands[1].tempId()].is_usedef()) {
2131          Instruction* usedef_instr = ctx.info[instr->operands[1].tempId()].instr;
2132          if (usedef_instr->opcode == aco_opcode::v_mbcnt_lo_u32_b32 &&
2133              usedef_instr->operands[0].constantEquals(-1) &&
2134              usedef_instr->operands[1].constantEquals(0))
2135             ctx.info[instr->definitions[0].tempId()].set_subgroup_invocation(instr.get());
2136       }
2137       break;
2138    }
2139    case aco_opcode::v_cvt_f16_f32: {
2140       if (instr->operands[0].isTemp())
2141          ctx.info[instr->operands[0].tempId()].set_f2f16(instr.get());
2142       break;
2143    }
2144    case aco_opcode::v_cvt_f32_f16: {
2145       if (instr->operands[0].isTemp())
2146          ctx.info[instr->definitions[0].tempId()].set_f2f32(instr.get());
2147       break;
2148    }
2149    default: break;
2150    }
2151
2152    /* Don't remove label_extract if we can't apply the extract to
2153     * neg/abs instructions because we'll likely combine it into another valu. */
2154    if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
2155       check_sdwa_extract(ctx, instr);
2156 }
2157
2158 unsigned
2159 original_temp_id(opt_ctx& ctx, Temp tmp)
2160 {
2161    if (ctx.info[tmp.id()].is_temp())
2162       return ctx.info[tmp.id()].temp.id();
2163    else
2164       return tmp.id();
2165 }
2166
2167 void
2168 decrease_op_uses_if_dead(opt_ctx& ctx, Instruction* instr)
2169 {
2170    if (is_dead(ctx.uses, instr)) {
2171       for (const Operand& op : instr->operands) {
2172          if (op.isTemp())
2173             ctx.uses[op.tempId()]--;
2174       }
2175    }
2176 }
2177
2178 void
2179 decrease_uses(opt_ctx& ctx, Instruction* instr)
2180 {
2181    ctx.uses[instr->definitions[0].tempId()]--;
2182    decrease_op_uses_if_dead(ctx, instr);
2183 }
2184
2185 Operand
2186 copy_operand(opt_ctx& ctx, Operand op)
2187 {
2188    if (op.isTemp())
2189       ctx.uses[op.tempId()]++;
2190    return op;
2191 }
2192
2193 Instruction*
2194 follow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
2195 {
2196    if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
2197       return nullptr;
2198    if (!ignore_uses && ctx.uses[op.tempId()] > 1)
2199       return nullptr;
2200
2201    Instruction* instr = ctx.info[op.tempId()].instr;
2202
2203    if (instr->definitions.size() == 2) {
2204       assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
2205       if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2206          return nullptr;
2207    }
2208
2209    for (Operand& operand : instr->operands) {
2210       if (fixed_to_exec(operand))
2211          return nullptr;
2212    }
2213
2214    return instr;
2215 }
2216
2217 /* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
2218  * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
2219 bool
2220 combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2221 {
2222    if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2223       return false;
2224    if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2225       return false;
2226
2227    bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2228
2229    bitarray8 opsel = 0;
2230    Instruction* op_instr[2];
2231    Temp op[2];
2232
2233    unsigned bitsize = 0;
2234    for (unsigned i = 0; i < 2; i++) {
2235       op_instr[i] = follow_operand(ctx, instr->operands[i], true);
2236       if (!op_instr[i])
2237          return false;
2238
2239       aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2240       unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
2241
2242       if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
2243          return false;
2244       if (bitsize && op_bitsize != bitsize)
2245          return false;
2246       if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
2247          return false;
2248
2249       if (op_instr[i]->isSDWA() || op_instr[i]->isDPP())
2250          return false;
2251
2252       VALU_instruction& valu = op_instr[i]->valu();
2253       if (valu.neg[0] != valu.neg[1] || valu.abs[0] != valu.abs[1] ||
2254           valu.opsel[0] != valu.opsel[1])
2255          return false;
2256       opsel[i] = valu.opsel[0];
2257
2258       Temp op0 = op_instr[i]->operands[0].getTemp();
2259       Temp op1 = op_instr[i]->operands[1].getTemp();
2260       if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
2261          return false;
2262
2263       op[i] = op1;
2264       bitsize = op_bitsize;
2265    }
2266
2267    if (op[1].type() == RegType::sgpr) {
2268       std::swap(op[0], op[1]);
2269       opsel[0].swap(opsel[1]);
2270    }
2271    unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
2272    if (num_sgprs > (ctx.program->gfx_level >= GFX10 ? 2 : 1))
2273       return false;
2274
2275    aco_opcode new_op = aco_opcode::num_opcodes;
2276    switch (bitsize) {
2277    case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
2278    case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
2279    case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
2280    }
2281    bool needs_vop3 = num_sgprs > 1 || (opsel[0] && op[0].type() != RegType::vgpr);
2282    VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2283       new_op, needs_vop3 ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2284
2285    new_instr->opsel = opsel;
2286    new_instr->operands[0] = copy_operand(ctx, Operand(op[0]));
2287    new_instr->operands[1] = copy_operand(ctx, Operand(op[1]));
2288    new_instr->definitions[0] = instr->definitions[0];
2289    new_instr->pass_flags = instr->pass_flags;
2290
2291    decrease_uses(ctx, op_instr[0]);
2292    decrease_uses(ctx, op_instr[1]);
2293
2294    ctx.info[instr->definitions[0].tempId()].label = 0;
2295    ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2296
2297    instr.reset(new_instr);
2298
2299    return true;
2300 }
2301
2302 /* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
2303  * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
2304 bool
2305 combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2306 {
2307    if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2308       return false;
2309    if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2310       return false;
2311
2312    bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2313    aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
2314
2315    Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2316    Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2317    if (!nan_test || !cmp)
2318       return false;
2319    if (nan_test->isSDWA() || cmp->isSDWA())
2320       return false;
2321
2322    if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2323       std::swap(nan_test, cmp);
2324    else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2325       return false;
2326
2327    if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
2328       return false;
2329
2330    if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2331       return false;
2332    if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
2333       return false;
2334
2335    unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
2336    unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
2337    unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2338    unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2339    VALU_instruction& cmp_valu = cmp->valu();
2340    VALU_instruction& nan_valu = nan_test->valu();
2341    if ((prop_cmp0 != prop_nan0 || cmp_valu.opsel[0] != nan_valu.opsel[0]) &&
2342        (prop_cmp0 != prop_nan1 || cmp_valu.opsel[0] != nan_valu.opsel[1]))
2343       return false;
2344    if ((prop_cmp1 != prop_nan0 || cmp_valu.opsel[1] != nan_valu.opsel[0]) &&
2345        (prop_cmp1 != prop_nan1 || cmp_valu.opsel[1] != nan_valu.opsel[1]))
2346       return false;
2347    if (prop_cmp0 == prop_cmp1 && cmp_valu.opsel[0] == cmp_valu.opsel[1])
2348       return false;
2349
2350    aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2351    VALU_instruction* new_instr = create_instruction<VALU_instruction>(
2352       new_op, cmp->isVOP3() ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
2353    new_instr->neg = cmp_valu.neg;
2354    new_instr->abs = cmp_valu.abs;
2355    new_instr->clamp = cmp_valu.clamp;
2356    new_instr->omod = cmp_valu.omod;
2357    new_instr->opsel = cmp_valu.opsel;
2358    new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2359    new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2360    new_instr->definitions[0] = instr->definitions[0];
2361    new_instr->pass_flags = instr->pass_flags;
2362
2363    decrease_uses(ctx, nan_test);
2364    decrease_uses(ctx, cmp);
2365
2366    ctx.info[instr->definitions[0].tempId()].label = 0;
2367    ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2368
2369    instr.reset(new_instr);
2370
2371    return true;
2372 }
2373
2374 /* Optimize v_cmp of constant with subgroup invocation to a constant mask.
2375  * Ideally, we can trade v_cmp for a constant (or literal).
2376  * In a less ideal case, we trade v_cmp for a SALU instruction, which is still a win.
2377  */
2378 bool
2379 optimize_cmp_subgroup_invocation(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2380 {
2381    /* This optimization only applies to VOPC with 2 operands. */
2382    if (instr->operands.size() != 2)
2383       return false;
2384
2385    /* Find the constant operand or return early if there isn't one. */
2386    const int const_op_idx = instr->operands[0].isConstant()   ? 0
2387                             : instr->operands[1].isConstant() ? 1
2388                                                               : -1;
2389    if (const_op_idx == -1)
2390       return false;
2391
2392    /* Find the operand that has the subgroup invocation. */
2393    const int mbcnt_op_idx = 1 - const_op_idx;
2394    const Operand mbcnt_op = instr->operands[mbcnt_op_idx];
2395    if (!mbcnt_op.isTemp() || !ctx.info[mbcnt_op.tempId()].is_subgroup_invocation())
2396       return false;
2397
2398    /* Adjust opcode so we don't have to care about const_op_idx below. */
2399    const aco_opcode op = const_op_idx == 0 ? get_swapped(instr->opcode) : instr->opcode;
2400    const unsigned wave_size = ctx.program->wave_size;
2401    const unsigned val = instr->operands[const_op_idx].constantValue();
2402
2403    /* Find suitable constant bitmask corresponding to the value. */
2404    unsigned first_bit = 0, num_bits = 0;
2405    switch (op) {
2406    case aco_opcode::v_cmp_eq_u32:
2407    case aco_opcode::v_cmp_eq_i32:
2408       first_bit = val;
2409       num_bits = val >= wave_size ? 0 : 1;
2410       break;
2411    case aco_opcode::v_cmp_le_u32:
2412    case aco_opcode::v_cmp_le_i32:
2413       first_bit = 0;
2414       num_bits = val >= wave_size ? wave_size : (val + 1);
2415       break;
2416    case aco_opcode::v_cmp_lt_u32:
2417    case aco_opcode::v_cmp_lt_i32:
2418       first_bit = 0;
2419       num_bits = val >= wave_size ? wave_size : val;
2420       break;
2421    case aco_opcode::v_cmp_ge_u32:
2422    case aco_opcode::v_cmp_ge_i32:
2423       first_bit = val;
2424       num_bits = val >= wave_size ? 0 : (wave_size - val);
2425       break;
2426    case aco_opcode::v_cmp_gt_u32:
2427    case aco_opcode::v_cmp_gt_i32:
2428       first_bit = val + 1;
2429       num_bits = val >= wave_size ? 0 : (wave_size - val - 1);
2430       break;
2431    default: return false;
2432    }
2433
2434    Instruction* cpy = NULL;
2435    const uint64_t mask = BITFIELD64_RANGE(first_bit, num_bits);
2436    if (wave_size == 64 && mask > 0x7fffffff && mask != -1ull) {
2437       /* Mask can't be represented as a 64-bit constant or literal, use s_bfm_b64. */
2438       cpy = create_instruction<SOP2_instruction>(aco_opcode::s_bfm_b64, Format::SOP2, 2, 1);
2439       cpy->operands[0] = Operand::c32(num_bits);
2440       cpy->operands[1] = Operand::c32(first_bit);
2441    } else {
2442       /* Copy mask as a literal constant. */
2443       cpy =
2444          create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
2445       cpy->operands[0] = wave_size == 32 ? Operand::c32((uint32_t)mask) : Operand::c64(mask);
2446    }
2447
2448    cpy->definitions[0] = instr->definitions[0];
2449    ctx.info[instr->definitions[0].tempId()].label = 0;
2450    decrease_uses(ctx, ctx.info[mbcnt_op.tempId()].instr);
2451    instr.reset(cpy);
2452
2453    return true;
2454 }
2455
2456 bool
2457 is_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
2458 {
2459    if (op.isConstant()) {
2460       *value = op.constantValue64();
2461       return true;
2462    } else if (op.isTemp()) {
2463       unsigned id = original_temp_id(ctx, op.getTemp());
2464       if (!ctx.info[id].is_constant_or_literal(bit_size))
2465          return false;
2466       *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
2467       return true;
2468    }
2469    return false;
2470 }
2471
2472 bool
2473 is_constant_nan(uint64_t value, unsigned bit_size)
2474 {
2475    if (bit_size == 16)
2476       return ((value >> 10) & 0x1f) == 0x1f && (value & 0x3ff);
2477    else if (bit_size == 32)
2478       return ((value >> 23) & 0xff) == 0xff && (value & 0x7fffff);
2479    else
2480       return ((value >> 52) & 0x7ff) == 0x7ff && (value & 0xfffffffffffff);
2481 }
2482
2483 /* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
2484  * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
2485 bool
2486 combine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2487 {
2488    if (instr->definitions[0].regClass() != ctx.program->lane_mask)
2489       return false;
2490    if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2491       return false;
2492
2493    bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
2494
2495    Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
2496    Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
2497
2498    if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA() || nan_test->isDPP() ||
2499        cmp->isDPP())
2500       return false;
2501
2502    aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
2503    if (get_f32_cmp(cmp->opcode) == expected_nan_test)
2504       std::swap(nan_test, cmp);
2505    else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
2506       return false;
2507
2508    unsigned bit_size = get_cmp_bitsize(cmp->opcode);
2509    if (!is_fp_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
2510       return false;
2511
2512    if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
2513       return false;
2514    if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
2515       return false;
2516
2517    unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
2518    unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
2519    if (prop_nan0 != prop_nan1)
2520       return false;
2521
2522    VALU_instruction& vop3 = nan_test->valu();
2523    if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel[0] != vop3.opsel[1])
2524       return false;
2525
2526    int constant_operand = -1;
2527    for (unsigned i = 0; i < 2; i++) {
2528       if (cmp->operands[i].isTemp() &&
2529           original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0 &&
2530           cmp->valu().opsel[i] == nan_test->valu().opsel[0]) {
2531          constant_operand = !i;
2532          break;
2533       }
2534    }
2535    if (constant_operand == -1)
2536       return false;
2537
2538    uint64_t constant_value;
2539    if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
2540       return false;
2541    if (is_constant_nan(constant_value >> (cmp->valu().opsel[constant_operand] * 16), bit_size))
2542       return false;
2543
2544    aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
2545    Instruction* new_instr = create_instruction<VALU_instruction>(new_op, cmp->format, 2, 1);
2546    new_instr->valu().neg = cmp->valu().neg;
2547    new_instr->valu().abs = cmp->valu().abs;
2548    new_instr->valu().clamp = cmp->valu().clamp;
2549    new_instr->valu().omod = cmp->valu().omod;
2550    new_instr->valu().opsel = cmp->valu().opsel;
2551    new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
2552    new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
2553    new_instr->definitions[0] = instr->definitions[0];
2554    new_instr->pass_flags = instr->pass_flags;
2555
2556    decrease_uses(ctx, nan_test);
2557    decrease_uses(ctx, cmp);
2558
2559    ctx.info[instr->definitions[0].tempId()].label = 0;
2560    ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
2561
2562    instr.reset(new_instr);
2563
2564    return true;
2565 }
2566
2567 /* s_not(cmp(a, b)) -> get_inverse(cmp)(a, b) */
2568 bool
2569 combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2570 {
2571    if (ctx.uses[instr->definitions[1].tempId()])
2572       return false;
2573    if (!instr->operands[0].isTemp() || ctx.uses[instr->operands[0].tempId()] != 1)
2574       return false;
2575
2576    Instruction* cmp = follow_operand(ctx, instr->operands[0]);
2577    if (!cmp)
2578       return false;
2579
2580    aco_opcode new_opcode = get_inverse(cmp->opcode);
2581    if (new_opcode == aco_opcode::num_opcodes)
2582       return false;
2583
2584    /* Invert compare instruction and assign this instruction's definition */
2585    cmp->opcode = new_opcode;
2586    ctx.info[instr->definitions[0].tempId()] = ctx.info[cmp->definitions[0].tempId()];
2587    std::swap(instr->definitions[0], cmp->definitions[0]);
2588
2589    ctx.uses[instr->operands[0].tempId()]--;
2590    return true;
2591 }
2592
2593 /* op1(op2(1, 2), 0) if swap = false
2594  * op1(0, op2(1, 2)) if swap = true */
2595 bool
2596 match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
2597                    const char* shuffle_str, Operand operands[3], bitarray8& neg, bitarray8& abs,
2598                    bitarray8& opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
2599                    bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
2600 {
2601    /* checks */
2602    if (op1_instr->opcode != op1)
2603       return false;
2604
2605    Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
2606    if (!op2_instr || op2_instr->opcode != op2)
2607       return false;
2608
2609    VALU_instruction* op1_valu = op1_instr->isVALU() ? &op1_instr->valu() : NULL;
2610    VALU_instruction* op2_valu = op2_instr->isVALU() ? &op2_instr->valu() : NULL;
2611
2612    if (op1_instr->isSDWA() || op2_instr->isSDWA())
2613       return false;
2614    if (op1_instr->isDPP() || op2_instr->isDPP())
2615       return false;
2616
2617    /* don't support inbetween clamp/omod */
2618    if (op2_valu && (op2_valu->clamp || op2_valu->omod))
2619       return false;
2620
2621    /* get operands and modifiers and check inbetween modifiers */
2622    *op1_clamp = op1_valu ? (bool)op1_valu->clamp : false;
2623    *op1_omod = op1_valu ? (unsigned)op1_valu->omod : 0u;
2624
2625    if (inbetween_neg)
2626       *inbetween_neg = op1_valu ? op1_valu->neg[swap] : false;
2627    else if (op1_valu && op1_valu->neg[swap])
2628       return false;
2629
2630    if (inbetween_abs)
2631       *inbetween_abs = op1_valu ? op1_valu->abs[swap] : false;
2632    else if (op1_valu && op1_valu->abs[swap])
2633       return false;
2634
2635    if (inbetween_opsel)
2636       *inbetween_opsel = op1_valu ? op1_valu->opsel[swap] : false;
2637    else if (op1_valu && op1_valu->opsel[swap])
2638       return false;
2639
2640    *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
2641
2642    int shuffle[3];
2643    shuffle[shuffle_str[0] - '0'] = 0;
2644    shuffle[shuffle_str[1] - '0'] = 1;
2645    shuffle[shuffle_str[2] - '0'] = 2;
2646
2647    operands[shuffle[0]] = op1_instr->operands[!swap];
2648    neg[shuffle[0]] = op1_valu ? op1_valu->neg[!swap] : false;
2649    abs[shuffle[0]] = op1_valu ? op1_valu->abs[!swap] : false;
2650    opsel[shuffle[0]] = op1_valu ? op1_valu->opsel[!swap] : false;
2651
2652    for (unsigned i = 0; i < 2; i++) {
2653       operands[shuffle[i + 1]] = op2_instr->operands[i];
2654       neg[shuffle[i + 1]] = op2_valu ? op2_valu->neg[i] : false;
2655       abs[shuffle[i + 1]] = op2_valu ? op2_valu->abs[i] : false;
2656       opsel[shuffle[i + 1]] = op2_valu ? op2_valu->opsel[i] : false;
2657    }
2658
2659    /* check operands */
2660    if (!check_vop3_operands(ctx, 3, operands))
2661       return false;
2662
2663    return true;
2664 }
2665
2666 void
2667 create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
2668                     Operand operands[3], uint8_t neg, uint8_t abs, uint8_t opsel, bool clamp,
2669                     unsigned omod)
2670 {
2671    VALU_instruction* new_instr = create_instruction<VALU_instruction>(opcode, Format::VOP3, 3, 1);
2672    new_instr->neg = neg;
2673    new_instr->abs = abs;
2674    new_instr->clamp = clamp;
2675    new_instr->omod = omod;
2676    new_instr->opsel = opsel;
2677    new_instr->operands[0] = operands[0];
2678    new_instr->operands[1] = operands[1];
2679    new_instr->operands[2] = operands[2];
2680    new_instr->definitions[0] = instr->definitions[0];
2681    new_instr->pass_flags = instr->pass_flags;
2682    ctx.info[instr->definitions[0].tempId()].label = 0;
2683
2684    instr.reset(new_instr);
2685 }
2686
2687 bool
2688 combine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
2689                       const char* shuffle, uint8_t ops)
2690 {
2691    for (unsigned swap = 0; swap < 2; swap++) {
2692       if (!((1 << swap) & ops))
2693          continue;
2694
2695       Operand operands[3];
2696       bool clamp, precise;
2697       bitarray8 neg = 0, abs = 0, opsel = 0;
2698       uint8_t omod = 0;
2699       if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
2700                              abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
2701          ctx.uses[instr->operands[swap].tempId()]--;
2702          create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
2703          return true;
2704       }
2705    }
2706    return false;
2707 }
2708
2709 /* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
2710 bool
2711 combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2712 {
2713    bool is_or = instr->opcode == aco_opcode::v_or_b32;
2714    aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
2715
2716    if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
2717                                       "120", 1 | 2))
2718       return true;
2719    if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
2720                                       "120", 1 | 2))
2721       return true;
2722    if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
2723       return true;
2724    if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
2725       return true;
2726
2727    if (instr->isSDWA() || instr->isDPP())
2728       return false;
2729
2730    /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2731     * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
2732     * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
2733     * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
2734     */
2735    for (unsigned i = 0; i < 2; i++) {
2736       Instruction* extins = follow_operand(ctx, instr->operands[i]);
2737       if (!extins)
2738          continue;
2739
2740       aco_opcode op;
2741       Operand operands[3];
2742
2743       if (extins->opcode == aco_opcode::p_insert &&
2744           (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
2745          op = new_op_lshl;
2746          operands[1] =
2747             Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
2748       } else if (is_or &&
2749                  (extins->opcode == aco_opcode::p_insert ||
2750                   (extins->opcode == aco_opcode::p_extract &&
2751                    extins->operands[3].constantEquals(0))) &&
2752                  extins->operands[1].constantEquals(0)) {
2753          op = aco_opcode::v_and_or_b32;
2754          operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
2755       } else {
2756          continue;
2757       }
2758
2759       operands[0] = extins->operands[0];
2760       operands[2] = instr->operands[!i];
2761
2762       if (!check_vop3_operands(ctx, 3, operands))
2763          continue;
2764
2765       uint8_t neg = 0, abs = 0, opsel = 0, omod = 0;
2766       bool clamp = false;
2767       if (instr->isVOP3())
2768          clamp = instr->valu().clamp;
2769
2770       ctx.uses[instr->operands[i].tempId()]--;
2771       create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
2772       return true;
2773    }
2774
2775    return false;
2776 }
2777
2778 /* v_xor(a, s_not(b)) -> v_xnor(a, b)
2779  * v_xor(a, v_not(b)) -> v_xnor(a, b)
2780  */
2781 bool
2782 combine_xor_not(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2783 {
2784    if (instr->usesModifiers())
2785       return false;
2786
2787    for (unsigned i = 0; i < 2; i++) {
2788       Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
2789       if (!op_instr ||
2790           (op_instr->opcode != aco_opcode::v_not_b32 &&
2791            op_instr->opcode != aco_opcode::s_not_b32) ||
2792           op_instr->usesModifiers() || op_instr->operands[0].isLiteral())
2793          continue;
2794
2795       instr->opcode = aco_opcode::v_xnor_b32;
2796       instr->operands[i] = copy_operand(ctx, op_instr->operands[0]);
2797       decrease_uses(ctx, op_instr);
2798       if (instr->operands[0].isOfType(RegType::vgpr))
2799          std::swap(instr->operands[0], instr->operands[1]);
2800       if (!instr->operands[1].isOfType(RegType::vgpr))
2801          instr->format = asVOP3(instr->format);
2802
2803       return true;
2804    }
2805
2806    return false;
2807 }
2808
2809 /* v_not(v_xor(a, b)) -> v_xnor(a, b) */
2810 bool
2811 combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2812 {
2813    if (instr->usesModifiers())
2814       return false;
2815
2816    Instruction* op_instr = follow_operand(ctx, instr->operands[0]);
2817    if (!op_instr || op_instr->opcode != aco_opcode::v_xor_b32 || op_instr->isSDWA())
2818       return false;
2819
2820    ctx.uses[instr->operands[0].tempId()]--;
2821    std::swap(instr->definitions[0], op_instr->definitions[0]);
2822    op_instr->opcode = aco_opcode::v_xnor_b32;
2823
2824    return true;
2825 }
2826
2827 bool
2828 combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
2829                aco_opcode minmax)
2830 {
2831    /* TODO: this can handle SDWA min/max instructions by using opsel */
2832
2833    /* min(min(a, b), c) -> min3(a, b, c)
2834     * max(max(a, b), c) -> max3(a, b, c)
2835     * gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
2836     * gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
2837     */
2838    for (unsigned swap = 0; swap < 2; swap++) {
2839       Operand operands[3];
2840       bool clamp, precise;
2841       bitarray8 opsel = 0, neg = 0, abs = 0;
2842       uint8_t omod = 0;
2843       bool inbetween_neg;
2844       if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
2845                              neg, abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
2846                              &precise) &&
2847           (!inbetween_neg ||
2848            (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
2849          ctx.uses[instr->operands[swap].tempId()]--;
2850          if (inbetween_neg) {
2851             neg[0] = !neg[0];
2852             neg[1] = !neg[1];
2853             create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
2854          } else {
2855             create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2856          }
2857          return true;
2858       }
2859    }
2860
2861    /* min(-max(a, b), c) -> min3(-a, -b, c)
2862     * max(-min(a, b), c) -> max3(-a, -b, c)
2863     * gfx11: min(max(a, b), c) -> maxmin(a, b, c)
2864     * gfx11: max(min(a, b), c) -> minmax(a, b, c)
2865     */
2866    for (unsigned swap = 0; swap < 2; swap++) {
2867       Operand operands[3];
2868       bool clamp, precise;
2869       bitarray8 opsel = 0, neg = 0, abs = 0;
2870       uint8_t omod = 0;
2871       bool inbetween_neg;
2872       if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
2873                              abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
2874           (inbetween_neg ||
2875            (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
2876          ctx.uses[instr->operands[swap].tempId()]--;
2877          if (inbetween_neg) {
2878             neg[0] = !neg[0];
2879             neg[1] = !neg[1];
2880             create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
2881          } else {
2882             create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
2883          }
2884          return true;
2885       }
2886    }
2887    return false;
2888 }
2889
2890 /* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
2891  * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
2892  * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
2893  * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
2894  * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
2895  * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
2896 bool
2897 combine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2898 {
2899    /* checks */
2900    if (!instr->operands[0].isTemp())
2901       return false;
2902    if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
2903       return false;
2904
2905    Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
2906    if (!op2_instr)
2907       return false;
2908    switch (op2_instr->opcode) {
2909    case aco_opcode::s_and_b32:
2910    case aco_opcode::s_or_b32:
2911    case aco_opcode::s_xor_b32:
2912    case aco_opcode::s_and_b64:
2913    case aco_opcode::s_or_b64:
2914    case aco_opcode::s_xor_b64: break;
2915    default: return false;
2916    }
2917
2918    /* create instruction */
2919    std::swap(instr->definitions[0], op2_instr->definitions[0]);
2920    std::swap(instr->definitions[1], op2_instr->definitions[1]);
2921    ctx.uses[instr->operands[0].tempId()]--;
2922    ctx.info[op2_instr->definitions[0].tempId()].label = 0;
2923
2924    switch (op2_instr->opcode) {
2925    case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
2926    case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
2927    case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
2928    case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
2929    case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
2930    case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
2931    default: break;
2932    }
2933
2934    return true;
2935 }
2936
2937 /* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
2938  * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
2939  * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
2940  * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
2941 bool
2942 combine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2943 {
2944    if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
2945       return false;
2946
2947    for (unsigned i = 0; i < 2; i++) {
2948       Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
2949       if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
2950                          op2_instr->opcode != aco_opcode::s_not_b64))
2951          continue;
2952       if (ctx.uses[op2_instr->definitions[1].tempId()])
2953          continue;
2954
2955       if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2956           instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2957          continue;
2958
2959       ctx.uses[instr->operands[i].tempId()]--;
2960       instr->operands[0] = instr->operands[!i];
2961       instr->operands[1] = op2_instr->operands[0];
2962       ctx.info[instr->definitions[0].tempId()].label = 0;
2963
2964       switch (instr->opcode) {
2965       case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
2966       case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
2967       case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
2968       case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
2969       default: break;
2970       }
2971
2972       return true;
2973    }
2974    return false;
2975 }
2976
2977 /* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
2978 bool
2979 combine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
2980 {
2981    if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
2982       return false;
2983
2984    for (unsigned i = 0; i < 2; i++) {
2985       Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
2986       if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
2987           ctx.uses[op2_instr->definitions[1].tempId()])
2988          continue;
2989       if (!op2_instr->operands[1].isConstant())
2990          continue;
2991
2992       uint32_t shift = op2_instr->operands[1].constantValue();
2993       if (shift < 1 || shift > 4)
2994          continue;
2995
2996       if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
2997           instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
2998          continue;
2999
3000       instr->operands[1] = instr->operands[!i];
3001       instr->operands[0] = copy_operand(ctx, op2_instr->operands[0]);
3002       decrease_uses(ctx, op2_instr);
3003       ctx.info[instr->definitions[0].tempId()].label = 0;
3004
3005       instr->opcode = std::array<aco_opcode, 4>{
3006          aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
3007          aco_opcode::s_lshl4_add_u32}[shift - 1];
3008
3009       return true;
3010    }
3011    return false;
3012 }
3013
3014 /* s_abs_i32(s_sub_[iu]32(a, b)) -> s_absdiff_i32(a, b)
3015  * s_abs_i32(s_add_[iu]32(a, #b)) -> s_absdiff_i32(a, -b)
3016  */
3017 bool
3018 combine_sabsdiff(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3019 {
3020    if (!instr->operands[0].isTemp() || !ctx.info[instr->operands[0].tempId()].is_add_sub())
3021       return false;
3022
3023    Instruction* op_instr = follow_operand(ctx, instr->operands[0], false);
3024    if (!op_instr)
3025       return false;
3026
3027    if (op_instr->opcode == aco_opcode::s_add_i32 || op_instr->opcode == aco_opcode::s_add_u32) {
3028       for (unsigned i = 0; i < 2; i++) {
3029          uint64_t constant;
3030          if (op_instr->operands[!i].isLiteral() ||
3031              !is_operand_constant(ctx, op_instr->operands[i], 32, &constant))
3032             continue;
3033
3034          if (op_instr->operands[i].isTemp())
3035             ctx.uses[op_instr->operands[i].tempId()]--;
3036          op_instr->operands[0] = op_instr->operands[!i];
3037          op_instr->operands[1] = Operand::c32(-int32_t(constant));
3038          goto use_absdiff;
3039       }
3040       return false;
3041    }
3042
3043 use_absdiff:
3044    op_instr->opcode = aco_opcode::s_absdiff_i32;
3045    std::swap(instr->definitions[0], op_instr->definitions[0]);
3046    std::swap(instr->definitions[1], op_instr->definitions[1]);
3047    ctx.uses[instr->operands[0].tempId()]--;
3048
3049    return true;
3050 }
3051
3052 bool
3053 combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
3054 {
3055    if (instr->usesModifiers())
3056       return false;
3057
3058    for (unsigned i = 0; i < 2; i++) {
3059       if (!((1 << i) & ops))
3060          continue;
3061       if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
3062           ctx.uses[instr->operands[i].tempId()] == 1) {
3063
3064          aco_ptr<Instruction> new_instr;
3065          if (instr->operands[!i].isTemp() &&
3066              instr->operands[!i].getTemp().type() == RegType::vgpr) {
3067             new_instr.reset(create_instruction<VALU_instruction>(new_op, Format::VOP2, 3, 2));
3068          } else if (ctx.program->gfx_level >= GFX10 ||
3069                     (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3070             new_instr.reset(
3071                create_instruction<VALU_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
3072          } else {
3073             return false;
3074          }
3075          ctx.uses[instr->operands[i].tempId()]--;
3076          new_instr->definitions[0] = instr->definitions[0];
3077          if (instr->definitions.size() == 2) {
3078             new_instr->definitions[1] = instr->definitions[1];
3079          } else {
3080             new_instr->definitions[1] =
3081                Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
3082             /* Make sure the uses vector is large enough and the number of
3083              * uses properly initialized to 0.
3084              */
3085             ctx.uses.push_back(0);
3086          }
3087          new_instr->operands[0] = Operand::zero();
3088          new_instr->operands[1] = instr->operands[!i];
3089          new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
3090          new_instr->pass_flags = instr->pass_flags;
3091          instr = std::move(new_instr);
3092          ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
3093          return true;
3094       }
3095    }
3096
3097    return false;
3098 }
3099
3100 bool
3101 combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3102 {
3103    if (instr->usesModifiers())
3104       return false;
3105
3106    for (unsigned i = 0; i < 2; i++) {
3107       Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
3108       if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
3109           !op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
3110           op_instr->operands[0].getTemp().type() == RegType::vgpr &&
3111           op_instr->operands[1].constantEquals(0)) {
3112          aco_ptr<Instruction> new_instr{
3113             create_instruction<VALU_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
3114          ctx.uses[instr->operands[i].tempId()]--;
3115          new_instr->operands[0] = op_instr->operands[0];
3116          new_instr->operands[1] = instr->operands[!i];
3117          new_instr->definitions[0] = instr->definitions[0];
3118          new_instr->pass_flags = instr->pass_flags;
3119          instr = std::move(new_instr);
3120          ctx.info[instr->definitions[0].tempId()].label = 0;
3121
3122          return true;
3123       }
3124    }
3125
3126    return false;
3127 }
3128
3129 bool
3130 get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
3131                 aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only)
3132 {
3133    switch (op) {
3134 #define MINMAX(type, gfx9)                                                                         \
3135    case aco_opcode::v_min_##type:                                                                  \
3136    case aco_opcode::v_max_##type:                                                                  \
3137       *min = aco_opcode::v_min_##type;                                                             \
3138       *max = aco_opcode::v_max_##type;                                                             \
3139       *med3 = aco_opcode::v_med3_##type;                                                           \
3140       *min3 = aco_opcode::v_min3_##type;                                                           \
3141       *max3 = aco_opcode::v_max3_##type;                                                           \
3142       *minmax = op == *min ? aco_opcode::v_maxmin_##type : aco_opcode::v_minmax_##type;            \
3143       *some_gfx9_only = gfx9;                                                                      \
3144       return true;
3145 #define MINMAX_INT16(type, gfx9)                                                                   \
3146    case aco_opcode::v_min_##type:                                                                  \
3147    case aco_opcode::v_max_##type:                                                                  \
3148       *min = aco_opcode::v_min_##type;                                                             \
3149       *max = aco_opcode::v_max_##type;                                                             \
3150       *med3 = aco_opcode::v_med3_##type;                                                           \
3151       *min3 = aco_opcode::v_min3_##type;                                                           \
3152       *max3 = aco_opcode::v_max3_##type;                                                           \
3153       *minmax = aco_opcode::num_opcodes;                                                           \
3154       *some_gfx9_only = gfx9;                                                                      \
3155       return true;
3156 #define MINMAX_INT16_E64(type, gfx9)                                                               \
3157    case aco_opcode::v_min_##type##_e64:                                                            \
3158    case aco_opcode::v_max_##type##_e64:                                                            \
3159       *min = aco_opcode::v_min_##type##_e64;                                                       \
3160       *max = aco_opcode::v_max_##type##_e64;                                                       \
3161       *med3 = aco_opcode::v_med3_##type;                                                           \
3162       *min3 = aco_opcode::v_min3_##type;                                                           \
3163       *max3 = aco_opcode::v_max3_##type;                                                           \
3164       *minmax = aco_opcode::num_opcodes;                                                           \
3165       *some_gfx9_only = gfx9;                                                                      \
3166       return true;
3167       MINMAX(f32, false)
3168       MINMAX(u32, false)
3169       MINMAX(i32, false)
3170       MINMAX(f16, true)
3171       MINMAX_INT16(u16, true)
3172       MINMAX_INT16(i16, true)
3173       MINMAX_INT16_E64(u16, true)
3174       MINMAX_INT16_E64(i16, true)
3175 #undef MINMAX_INT16_E64
3176 #undef MINMAX_INT16
3177 #undef MINMAX
3178    default: return false;
3179    }
3180 }
3181
3182 /* when ub > lb:
3183  * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
3184  * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
3185  */
3186 bool
3187 combine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
3188               aco_opcode med)
3189 {
3190    /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
3191     * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
3192     * minVal > maxVal, which means we can always select it to a v_med3_f32 */
3193    aco_opcode other_op;
3194    if (instr->opcode == min)
3195       other_op = max;
3196    else if (instr->opcode == max)
3197       other_op = min;
3198    else
3199       return false;
3200
3201    for (unsigned swap = 0; swap < 2; swap++) {
3202       Operand operands[3];
3203       bool clamp, precise;
3204       bitarray8 opsel = 0, neg = 0, abs = 0;
3205       uint8_t omod = 0;
3206       if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
3207                              abs, opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
3208          /* max(min(src, upper), lower) returns upper if src is NaN, but
3209           * med3(src, lower, upper) returns lower.
3210           */
3211          if (precise && instr->opcode != min &&
3212              (min == aco_opcode::v_min_f16 || min == aco_opcode::v_min_f32))
3213             continue;
3214
3215          int const0_idx = -1, const1_idx = -1;
3216          uint32_t const0 = 0, const1 = 0;
3217          for (int i = 0; i < 3; i++) {
3218             uint32_t val;
3219             bool hi16 = opsel & (1 << i);
3220             if (operands[i].isConstant()) {
3221                val = hi16 ? operands[i].constantValue16(true) : operands[i].constantValue();
3222             } else if (operands[i].isTemp() &&
3223                        ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
3224                val = ctx.info[operands[i].tempId()].val >> (hi16 ? 16 : 0);
3225             } else {
3226                continue;
3227             }
3228             if (const0_idx >= 0) {
3229                const1_idx = i;
3230                const1 = val;
3231             } else {
3232                const0_idx = i;
3233                const0 = val;
3234             }
3235          }
3236          if (const0_idx < 0 || const1_idx < 0)
3237             continue;
3238
3239          int lower_idx = const0_idx;
3240          switch (min) {
3241          case aco_opcode::v_min_f32:
3242          case aco_opcode::v_min_f16: {
3243             float const0_f, const1_f;
3244             if (min == aco_opcode::v_min_f32) {
3245                memcpy(&const0_f, &const0, 4);
3246                memcpy(&const1_f, &const1, 4);
3247             } else {
3248                const0_f = _mesa_half_to_float(const0);
3249                const1_f = _mesa_half_to_float(const1);
3250             }
3251             if (abs[const0_idx])
3252                const0_f = fabsf(const0_f);
3253             if (abs[const1_idx])
3254                const1_f = fabsf(const1_f);
3255             if (neg[const0_idx])
3256                const0_f = -const0_f;
3257             if (neg[const1_idx])
3258                const1_f = -const1_f;
3259             lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
3260             break;
3261          }
3262          case aco_opcode::v_min_u32: {
3263             lower_idx = const0 < const1 ? const0_idx : const1_idx;
3264             break;
3265          }
3266          case aco_opcode::v_min_u16:
3267          case aco_opcode::v_min_u16_e64: {
3268             lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
3269             break;
3270          }
3271          case aco_opcode::v_min_i32: {
3272             int32_t const0_i =
3273                const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
3274             int32_t const1_i =
3275                const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
3276             lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3277             break;
3278          }
3279          case aco_opcode::v_min_i16:
3280          case aco_opcode::v_min_i16_e64: {
3281             int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
3282             int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
3283             lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
3284             break;
3285          }
3286          default: break;
3287          }
3288          int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
3289
3290          if (instr->opcode == min) {
3291             if (upper_idx != 0 || lower_idx == 0)
3292                return false;
3293          } else {
3294             if (upper_idx == 0 || lower_idx != 0)
3295                return false;
3296          }
3297
3298          ctx.uses[instr->operands[swap].tempId()]--;
3299          create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
3300
3301          return true;
3302       }
3303    }
3304
3305    return false;
3306 }
3307
3308 void
3309 apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3310 {
3311    bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
3312                      instr->opcode == aco_opcode::v_lshrrev_b64 ||
3313                      instr->opcode == aco_opcode::v_ashrrev_i64;
3314
3315    /* find candidates and create the set of sgprs already read */
3316    unsigned sgpr_ids[2] = {0, 0};
3317    uint32_t operand_mask = 0;
3318    bool has_literal = false;
3319    for (unsigned i = 0; i < instr->operands.size(); i++) {
3320       if (instr->operands[i].isLiteral())
3321          has_literal = true;
3322       if (!instr->operands[i].isTemp())
3323          continue;
3324       if (instr->operands[i].getTemp().type() == RegType::sgpr) {
3325          if (instr->operands[i].tempId() != sgpr_ids[0])
3326             sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
3327       }
3328       ssa_info& info = ctx.info[instr->operands[i].tempId()];
3329       if (is_copy_label(ctx, instr, info, i) && info.temp.type() == RegType::sgpr)
3330          operand_mask |= 1u << i;
3331       if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
3332          operand_mask |= 1u << i;
3333    }
3334    unsigned max_sgprs = 1;
3335    if (ctx.program->gfx_level >= GFX10 && !is_shift64)
3336       max_sgprs = 2;
3337    if (has_literal)
3338       max_sgprs--;
3339
3340    unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
3341
3342    /* keep on applying sgprs until there is nothing left to be done */
3343    while (operand_mask) {
3344       uint32_t sgpr_idx = 0;
3345       uint32_t sgpr_info_id = 0;
3346       uint32_t mask = operand_mask;
3347       /* choose a sgpr */
3348       while (mask) {
3349          unsigned i = u_bit_scan(&mask);
3350          uint16_t uses = ctx.uses[instr->operands[i].tempId()];
3351          if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
3352             sgpr_idx = i;
3353             sgpr_info_id = instr->operands[i].tempId();
3354          }
3355       }
3356       operand_mask &= ~(1u << sgpr_idx);
3357
3358       ssa_info& info = ctx.info[sgpr_info_id];
3359
3360       /* Applying two sgprs require making it VOP3, so don't do it unless it's
3361        * definitively beneficial.
3362        * TODO: this is too conservative because later the use count could be reduced to 1 */
3363       if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
3364           !instr->isSDWA() && instr->format != Format::VOP3P)
3365          break;
3366
3367       Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
3368       bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
3369       if (new_sgpr && num_sgprs >= max_sgprs)
3370          continue;
3371
3372       if (sgpr_idx == 0)
3373          instr->format = withoutDPP(instr->format);
3374
3375       if (sgpr_idx == 1 && instr->isDPP())
3376          continue;
3377
3378       if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
3379           info.is_extract()) {
3380          /* can_apply_extract() checks SGPR encoding restrictions */
3381          if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
3382             apply_extract(ctx, instr, sgpr_idx, info);
3383          else if (info.is_extract())
3384             continue;
3385          instr->operands[sgpr_idx] = Operand(sgpr);
3386       } else if (can_swap_operands(instr, &instr->opcode) && !instr->valu().opsel[sgpr_idx]) {
3387          instr->operands[sgpr_idx] = instr->operands[0];
3388          instr->operands[0] = Operand(sgpr);
3389          instr->valu().opsel[0].swap(instr->valu().opsel[sgpr_idx]);
3390          /* swap bits using a 4-entry LUT */
3391          uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
3392          operand_mask = (operand_mask & ~0x3) | swapped;
3393       } else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
3394          instr->format = asVOP3(instr->format);
3395          instr->operands[sgpr_idx] = Operand(sgpr);
3396       } else {
3397          continue;
3398       }
3399
3400       if (new_sgpr)
3401          sgpr_ids[num_sgprs++] = sgpr.id();
3402       ctx.uses[sgpr_info_id]--;
3403       ctx.uses[sgpr.id()]++;
3404
3405       /* TODO: handle when it's a VGPR */
3406       if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
3407           ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
3408          operand_mask |= 1u << sgpr_idx;
3409    }
3410 }
3411
3412 /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
3413 bool
3414 apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3415 {
3416    if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
3417        !instr_info.can_use_output_modifiers[(int)instr->opcode])
3418       return false;
3419
3420    bool can_vop3 = can_use_VOP3(ctx, instr);
3421    bool is_mad_mix =
3422       instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
3423    if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
3424       return false;
3425
3426    /* SDWA omod is GFX9+. */
3427    bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
3428
3429    ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3430
3431    uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
3432    if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
3433       return false;
3434    /* if the omod/clamp instruction is dead, then the single user of this
3435     * instruction is a different instruction */
3436    if (!ctx.uses[def_info.instr->definitions[0].tempId()])
3437       return false;
3438
3439    if (def_info.instr->definitions[0].bytes() != instr->definitions[0].bytes())
3440       return false;
3441
3442    /* MADs/FMAs are created later, so we don't have to update the original add */
3443    assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
3444
3445    if (!instr->isSDWA() && !instr->isVOP3P())
3446       instr->format = asVOP3(instr->format);
3447
3448    if (!def_info.is_clamp() && (instr->valu().clamp || instr->valu().omod))
3449       return false;
3450
3451    if (def_info.is_omod2())
3452       instr->valu().omod = 1;
3453    else if (def_info.is_omod4())
3454       instr->valu().omod = 2;
3455    else if (def_info.is_omod5())
3456       instr->valu().omod = 3;
3457    else if (def_info.is_clamp())
3458       instr->valu().clamp = true;
3459
3460    instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3461    ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert | label_f2f16;
3462    ctx.uses[def_info.instr->definitions[0].tempId()]--;
3463
3464    return true;
3465 }
3466
3467 /* Combine an p_insert (or p_extract, in some cases) instruction with instr.
3468  * p_insert(instr(...)) -> instr_insert().
3469  */
3470 bool
3471 apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3472 {
3473    if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
3474       return false;
3475
3476    ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3477    if (!def_info.is_insert())
3478       return false;
3479    /* if the insert instruction is dead, then the single user of this
3480     * instruction is a different instruction */
3481    if (!ctx.uses[def_info.instr->definitions[0].tempId()])
3482       return false;
3483
3484    /* MADs/FMAs are created later, so we don't have to update the original add */
3485    assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
3486
3487    SubdwordSel sel = parse_insert(def_info.instr);
3488    assert(sel);
3489
3490    if (!can_use_SDWA(ctx.program->gfx_level, instr, true))
3491       return false;
3492
3493    convert_to_SDWA(ctx.program->gfx_level, instr);
3494    if (instr->sdwa().dst_sel.size() != 4)
3495       return false;
3496    instr->sdwa().dst_sel = sel;
3497
3498    instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
3499    ctx.info[instr->definitions[0].tempId()].label = 0;
3500    ctx.uses[def_info.instr->definitions[0].tempId()]--;
3501
3502    return true;
3503 }
3504
3505 /* Remove superfluous extract after ds_read like so:
3506  * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
3507  */
3508 bool
3509 apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
3510 {
3511    /* Check if p_extract has a usedef operand and is the only user. */
3512    if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
3513        ctx.uses[extract->operands[0].tempId()] > 1)
3514       return false;
3515
3516    /* Check if the usedef is a DS instruction. */
3517    Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
3518    if (ds->format != Format::DS)
3519       return false;
3520
3521    unsigned extract_idx = extract->operands[1].constantValue();
3522    unsigned bits_extracted = extract->operands[2].constantValue();
3523    unsigned sign_ext = extract->operands[3].constantValue();
3524    unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
3525
3526    /* TODO: These are doable, but probably don't occur too often. */
3527    if (extract_idx || sign_ext || dst_bitsize != 32)
3528       return false;
3529
3530    unsigned bits_loaded = 0;
3531    if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
3532       bits_loaded = 8;
3533    else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
3534       bits_loaded = 16;
3535    else
3536       return false;
3537
3538    /* Shrink the DS load if the extracted bit size is smaller. */
3539    bits_loaded = MIN2(bits_loaded, bits_extracted);
3540
3541    /* Change the DS opcode so it writes the full register. */
3542    if (bits_loaded == 8)
3543       ds->opcode = aco_opcode::ds_read_u8;
3544    else if (bits_loaded == 16)
3545       ds->opcode = aco_opcode::ds_read_u16;
3546    else
3547       unreachable("Forgot to add DS opcode above.");
3548
3549    /* The DS now produces the exact same thing as the extract, remove the extract. */
3550    std::swap(ds->definitions[0], extract->definitions[0]);
3551    ctx.uses[extract->definitions[0].tempId()] = 0;
3552    ctx.info[ds->definitions[0].tempId()].label = 0;
3553    return true;
3554 }
3555
3556 /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
3557 bool
3558 combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3559 {
3560    if (instr->usesModifiers())
3561       return false;
3562
3563    for (unsigned i = 0; i < 2; i++) {
3564       Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
3565       if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
3566           op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
3567           !op_instr->usesModifiers()) {
3568
3569          aco_ptr<Instruction> new_instr;
3570          if (instr->operands[!i].isTemp() &&
3571              instr->operands[!i].getTemp().type() == RegType::vgpr) {
3572             new_instr.reset(
3573                create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
3574          } else if (ctx.program->gfx_level >= GFX10 ||
3575                     (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
3576             new_instr.reset(create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32,
3577                                                                  asVOP3(Format::VOP2), 3, 1));
3578          } else {
3579             return false;
3580          }
3581
3582          new_instr->operands[0] = Operand::zero();
3583          new_instr->operands[1] = instr->operands[!i];
3584          new_instr->operands[2] = copy_operand(ctx, op_instr->operands[2]);
3585          new_instr->definitions[0] = instr->definitions[0];
3586          new_instr->pass_flags = instr->pass_flags;
3587          instr = std::move(new_instr);
3588          decrease_uses(ctx, op_instr);
3589          ctx.info[instr->definitions[0].tempId()].label = 0;
3590          return true;
3591       }
3592    }
3593
3594    return false;
3595 }
3596
3597 /* v_and(a, not(b)) -> v_bfi_b32(b, 0, a)
3598  * v_or(a, not(b)) -> v_bfi_b32(b, a, -1)
3599  */
3600 bool
3601 combine_v_andor_not(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3602 {
3603    if (instr->usesModifiers())
3604       return false;
3605
3606    for (unsigned i = 0; i < 2; i++) {
3607       Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
3608       if (op_instr && !op_instr->usesModifiers() &&
3609           (op_instr->opcode == aco_opcode::v_not_b32 ||
3610            op_instr->opcode == aco_opcode::s_not_b32)) {
3611
3612          Operand ops[3] = {
3613             op_instr->operands[0],
3614             Operand::zero(),
3615             instr->operands[!i],
3616          };
3617          if (instr->opcode == aco_opcode::v_or_b32) {
3618             ops[1] = instr->operands[!i];
3619             ops[2] = Operand::c32(-1);
3620          }
3621          if (!check_vop3_operands(ctx, 3, ops))
3622             continue;
3623
3624          Instruction* new_instr =
3625             create_instruction<VALU_instruction>(aco_opcode::v_bfi_b32, Format::VOP3, 3, 1);
3626
3627          if (op_instr->operands[0].isTemp())
3628             ctx.uses[op_instr->operands[0].tempId()]++;
3629          for (unsigned j = 0; j < 3; j++)
3630             new_instr->operands[j] = ops[j];
3631          new_instr->definitions[0] = instr->definitions[0];
3632          new_instr->pass_flags = instr->pass_flags;
3633          instr.reset(new_instr);
3634          decrease_uses(ctx, op_instr);
3635          ctx.info[instr->definitions[0].tempId()].label = 0;
3636          return true;
3637       }
3638    }
3639
3640    return false;
3641 }
3642
3643 /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
3644  * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
3645  * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
3646  * v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
3647  */
3648 bool
3649 combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
3650 {
3651    if (instr->usesModifiers())
3652       return false;
3653
3654    /* Substractions: start at operand 1 to avoid mixup such as
3655     * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
3656     */
3657    unsigned start_op_idx = is_sub ? 1 : 0;
3658
3659    /* Don't allow 24-bit operands on subtraction because
3660     * v_mad_i32_i24 applies a sign extension.
3661     */
3662    bool allow_24bit = !is_sub;
3663
3664    for (unsigned i = start_op_idx; i < 2; i++) {
3665       Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
3666       if (!op_instr)
3667          continue;
3668
3669       if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
3670           op_instr->opcode != aco_opcode::v_lshlrev_b32)
3671          continue;
3672
3673       int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
3674
3675       if (op_instr->operands[shift_op_idx].isConstant() &&
3676           ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
3677            op_instr->operands[!shift_op_idx].is16bit())) {
3678          uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
3679          if (is_sub)
3680             multiplier = -multiplier;
3681          if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
3682             continue;
3683
3684          Operand ops[3] = {
3685             op_instr->operands[!shift_op_idx],
3686             Operand::c32(multiplier),
3687             instr->operands[!i],
3688          };
3689          if (!check_vop3_operands(ctx, 3, ops))
3690             return false;
3691
3692          ctx.uses[instr->operands[i].tempId()]--;
3693
3694          aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
3695          aco_ptr<VALU_instruction> new_instr{
3696             create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1)};
3697          for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
3698             new_instr->operands[op_idx] = ops[op_idx];
3699          new_instr->definitions[0] = instr->definitions[0];
3700          new_instr->pass_flags = instr->pass_flags;
3701          instr = std::move(new_instr);
3702          ctx.info[instr->definitions[0].tempId()].label = 0;
3703          return true;
3704       }
3705    }
3706
3707    return false;
3708 }
3709
3710 void
3711 propagate_swizzles(VALU_instruction* instr, bool opsel_lo, bool opsel_hi)
3712 {
3713    /* propagate swizzles which apply to a result down to the instruction's operands:
3714     * result = a.xy + b.xx -> result.yx = a.yx + b.xx */
3715    uint8_t tmp_lo = instr->opsel_lo;
3716    uint8_t tmp_hi = instr->opsel_hi;
3717    uint8_t neg_lo = instr->neg_lo;
3718    uint8_t neg_hi = instr->neg_hi;
3719    if (opsel_lo == 1) {
3720       instr->opsel_lo = tmp_hi;
3721       instr->neg_lo = neg_hi;
3722    }
3723    if (opsel_hi == 0) {
3724       instr->opsel_hi = tmp_lo;
3725       instr->neg_hi = neg_lo;
3726    }
3727 }
3728
3729 void
3730 combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3731 {
3732    VALU_instruction* vop3p = &instr->valu();
3733
3734    /* apply clamp */
3735    if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
3736        vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 &&
3737        !vop3p->opsel_lo[1] && !vop3p->opsel_hi[1]) {
3738
3739       ssa_info& info = ctx.info[instr->operands[0].tempId()];
3740       if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
3741          VALU_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->valu();
3742          candidate->clamp = true;
3743          propagate_swizzles(candidate, vop3p->opsel_lo[0], vop3p->opsel_hi[0]);
3744          instr->definitions[0].swapTemp(candidate->definitions[0]);
3745          ctx.info[candidate->definitions[0].tempId()].instr = candidate;
3746          ctx.uses[instr->definitions[0].tempId()]--;
3747          return;
3748       }
3749    }
3750
3751    /* check for fneg modifiers */
3752    for (unsigned i = 0; i < instr->operands.size(); i++) {
3753       if (!can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, i))
3754          continue;
3755       Operand& op = instr->operands[i];
3756       if (!op.isTemp())
3757          continue;
3758
3759       ssa_info& info = ctx.info[op.tempId()];
3760       if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
3761           info.instr->operands[1].constantEquals(0x3C00)) {
3762
3763          VALU_instruction* fneg = &info.instr->valu();
3764
3765          if (fneg->opsel_lo[1] || fneg->opsel_hi[1])
3766             continue;
3767
3768          Operand ops[3];
3769          for (unsigned j = 0; j < instr->operands.size(); j++)
3770             ops[j] = instr->operands[j];
3771          ops[i] = info.instr->operands[0];
3772          if (!check_vop3_operands(ctx, instr->operands.size(), ops))
3773             continue;
3774
3775          if (fneg->clamp)
3776             continue;
3777          instr->operands[i] = fneg->operands[0];
3778
3779          /* opsel_lo/hi is either 0 or 1:
3780           * if 0 - pick selection from fneg->lo
3781           * if 1 - pick selection from fneg->hi
3782           */
3783          bool opsel_lo = vop3p->opsel_lo[i];
3784          bool opsel_hi = vop3p->opsel_hi[i];
3785          bool neg_lo = fneg->neg_lo[0] ^ fneg->neg_lo[1];
3786          bool neg_hi = fneg->neg_hi[0] ^ fneg->neg_hi[1];
3787          vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
3788          vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
3789          vop3p->opsel_lo[i] ^= opsel_lo ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
3790          vop3p->opsel_hi[i] ^= opsel_hi ? !fneg->opsel_hi[0] : fneg->opsel_lo[0];
3791
3792          if (--ctx.uses[fneg->definitions[0].tempId()])
3793             ctx.uses[fneg->operands[0].tempId()]++;
3794       }
3795    }
3796
3797    if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
3798       bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
3799       if (fadd && instr->definitions[0].isPrecise())
3800          return;
3801
3802       Instruction* mul_instr = nullptr;
3803       unsigned add_op_idx = 0;
3804       bitarray8 mul_neg_lo = 0, mul_neg_hi = 0, mul_opsel_lo = 0, mul_opsel_hi = 0;
3805       uint32_t uses = UINT32_MAX;
3806
3807       /* find the 'best' mul instruction to combine with the add */
3808       for (unsigned i = 0; i < 2; i++) {
3809          Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
3810          if (!op_instr)
3811             continue;
3812
3813          if (ctx.info[instr->operands[i].tempId()].is_vop3p()) {
3814             if (fadd) {
3815                if (op_instr->opcode != aco_opcode::v_pk_mul_f16 ||
3816                    op_instr->definitions[0].isPrecise())
3817                   continue;
3818             } else {
3819                if (op_instr->opcode != aco_opcode::v_pk_mul_lo_u16)
3820                   continue;
3821             }
3822
3823             Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
3824             if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3825                continue;
3826
3827             /* no clamp allowed between mul and add */
3828             if (op_instr->valu().clamp)
3829                continue;
3830
3831             mul_instr = op_instr;
3832             add_op_idx = 1 - i;
3833             uses = ctx.uses[instr->operands[i].tempId()];
3834             mul_neg_lo = mul_instr->valu().neg_lo;
3835             mul_neg_hi = mul_instr->valu().neg_hi;
3836             mul_opsel_lo = mul_instr->valu().opsel_lo;
3837             mul_opsel_hi = mul_instr->valu().opsel_hi;
3838          } else if (instr->operands[i].bytes() == 2) {
3839             if ((fadd && (op_instr->opcode != aco_opcode::v_mul_f16 ||
3840                           op_instr->definitions[0].isPrecise())) ||
3841                 (!fadd && op_instr->opcode != aco_opcode::v_mul_lo_u16 &&
3842                  op_instr->opcode != aco_opcode::v_mul_lo_u16_e64))
3843                continue;
3844
3845             if (op_instr->valu().clamp || op_instr->valu().omod || op_instr->valu().abs)
3846                continue;
3847
3848             if (op_instr->isDPP() || (op_instr->isSDWA() && (op_instr->sdwa().sel[0].size() < 2 ||
3849                                                              op_instr->sdwa().sel[1].size() < 2)))
3850                continue;
3851
3852             Operand op[3] = {op_instr->operands[0], op_instr->operands[1], instr->operands[1 - i]};
3853             if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
3854                continue;
3855
3856             mul_instr = op_instr;
3857             add_op_idx = 1 - i;
3858             uses = ctx.uses[instr->operands[i].tempId()];
3859             mul_neg_lo = mul_instr->valu().neg;
3860             mul_neg_hi = mul_instr->valu().neg;
3861             if (mul_instr->isSDWA()) {
3862                for (unsigned j = 0; j < 2; j++)
3863                   mul_opsel_lo[j] = mul_instr->sdwa().sel[j].offset();
3864             } else {
3865                mul_opsel_lo = mul_instr->valu().opsel;
3866             }
3867             mul_opsel_hi = mul_opsel_lo;
3868          }
3869       }
3870
3871       if (!mul_instr)
3872          return;
3873
3874       /* turn mul + packed add into v_pk_fma_f16 */
3875       aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
3876       aco_ptr<VALU_instruction> fma{create_instruction<VALU_instruction>(mad, Format::VOP3P, 3, 1)};
3877       fma->operands[0] = copy_operand(ctx, mul_instr->operands[0]);
3878       fma->operands[1] = copy_operand(ctx, mul_instr->operands[1]);
3879       fma->operands[2] = instr->operands[add_op_idx];
3880       fma->clamp = vop3p->clamp;
3881       fma->neg_lo = mul_neg_lo;
3882       fma->neg_hi = mul_neg_hi;
3883       fma->opsel_lo = mul_opsel_lo;
3884       fma->opsel_hi = mul_opsel_hi;
3885       propagate_swizzles(fma.get(), vop3p->opsel_lo[1 - add_op_idx],
3886                          vop3p->opsel_hi[1 - add_op_idx]);
3887       fma->opsel_lo[2] = vop3p->opsel_lo[add_op_idx];
3888       fma->opsel_hi[2] = vop3p->opsel_hi[add_op_idx];
3889       fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
3890       fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
3891       fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
3892       fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
3893       fma->definitions[0] = instr->definitions[0];
3894       fma->pass_flags = instr->pass_flags;
3895       instr = std::move(fma);
3896       ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
3897       decrease_uses(ctx, mul_instr);
3898       return;
3899    }
3900 }
3901
3902 bool
3903 can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3904 {
3905    if (ctx.program->gfx_level < GFX9)
3906       return false;
3907
3908    /* v_mad_mix* on GFX9 always flushes denormals for 16-bit inputs/outputs */
3909    if (ctx.program->gfx_level == GFX9 && ctx.fp_mode.denorm16_64)
3910       return false;
3911
3912    switch (instr->opcode) {
3913    case aco_opcode::v_add_f32:
3914    case aco_opcode::v_sub_f32:
3915    case aco_opcode::v_subrev_f32:
3916    case aco_opcode::v_mul_f32:
3917    case aco_opcode::v_fma_f32: break;
3918    case aco_opcode::v_fma_mix_f32:
3919    case aco_opcode::v_fma_mixlo_f16: return true;
3920    default: return false;
3921    }
3922
3923    if (instr->opcode == aco_opcode::v_fma_f32 && !ctx.program->dev.fused_mad_mix &&
3924        instr->definitions[0].isPrecise())
3925       return false;
3926
3927    return !instr->valu().omod && !instr->isSDWA() && !instr->isDPP();
3928 }
3929
3930 void
3931 to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3932 {
3933    bool is_add = instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
3934
3935    aco_ptr<VALU_instruction> vop3p{
3936       create_instruction<VALU_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};
3937
3938    for (unsigned i = 0; i < instr->operands.size(); i++) {
3939       vop3p->operands[is_add + i] = instr->operands[i];
3940       vop3p->neg_lo[is_add + i] = instr->valu().neg[i];
3941       vop3p->neg_hi[is_add + i] = instr->valu().abs[i];
3942    }
3943    if (instr->opcode == aco_opcode::v_mul_f32) {
3944       vop3p->operands[2] = Operand::zero();
3945       vop3p->neg_lo[2] = true;
3946    } else if (is_add) {
3947       vop3p->operands[0] = Operand::c32(0x3f800000);
3948       if (instr->opcode == aco_opcode::v_sub_f32)
3949          vop3p->neg_lo[2] ^= true;
3950       else if (instr->opcode == aco_opcode::v_subrev_f32)
3951          vop3p->neg_lo[1] ^= true;
3952    }
3953    vop3p->definitions[0] = instr->definitions[0];
3954    vop3p->clamp = instr->valu().clamp;
3955    vop3p->pass_flags = instr->pass_flags;
3956    instr = std::move(vop3p);
3957
3958    ctx.info[instr->definitions[0].tempId()].label &= label_f2f16 | label_clamp | label_mul;
3959    if (ctx.info[instr->definitions[0].tempId()].label & label_mul)
3960       ctx.info[instr->definitions[0].tempId()].instr = instr.get();
3961 }
3962
3963 bool
3964 combine_output_conversion(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3965 {
3966    ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
3967    if (!def_info.is_f2f16())
3968       return false;
3969    Instruction* conv = def_info.instr;
3970
3971    if (!can_use_mad_mix(ctx, instr) || ctx.uses[instr->definitions[0].tempId()] != 1)
3972       return false;
3973
3974    if (!ctx.uses[conv->definitions[0].tempId()])
3975       return false;
3976
3977    if (conv->usesModifiers())
3978       return false;
3979
3980    if (!instr->isVOP3P())
3981       to_mad_mix(ctx, instr);
3982
3983    instr->opcode = aco_opcode::v_fma_mixlo_f16;
3984    instr->definitions[0].swapTemp(conv->definitions[0]);
3985    if (conv->definitions[0].isPrecise())
3986       instr->definitions[0].setPrecise(true);
3987    ctx.info[instr->definitions[0].tempId()].label &= label_clamp;
3988    ctx.uses[conv->definitions[0].tempId()]--;
3989
3990    return true;
3991 }
3992
3993 void
3994 combine_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)
3995 {
3996    if (!can_use_mad_mix(ctx, instr))
3997       return;
3998
3999    for (unsigned i = 0; i < instr->operands.size(); i++) {
4000       if (!instr->operands[i].isTemp())
4001          continue;
4002       Temp tmp = instr->operands[i].getTemp();
4003       if (!ctx.info[tmp.id()].is_f2f32())
4004          continue;
4005
4006       Instruction* conv = ctx.info[tmp.id()].instr;
4007       if (conv->valu().clamp || conv->valu().omod) {
4008          continue;
4009       } else if (conv->isSDWA() &&
4010                  (conv->sdwa().dst_sel.size() != 4 || conv->sdwa().sel[0].size() != 2)) {
4011          continue;
4012       } else if (conv->isDPP()) {
4013          continue;
4014       }
4015
4016       if (get_operand_size(instr, i) != 32)
4017          continue;
4018
4019       /* Conversion to VOP3P will add inline constant operands, but that shouldn't affect
4020        * check_vop3_operands(). */
4021       Operand op[3];
4022       for (unsigned j = 0; j < instr->operands.size(); j++)
4023          op[j] = instr->operands[j];
4024       op[i] = conv->operands[0];
4025       if (!check_vop3_operands(ctx, instr->operands.size(), op))
4026          continue;
4027
4028       if (!instr->isVOP3P()) {
4029          bool is_add =
4030             instr->opcode != aco_opcode::v_mul_f32 && instr->opcode != aco_opcode::v_fma_f32;
4031          to_mad_mix(ctx, instr);
4032          i += is_add;
4033       }
4034
4035       if (--ctx.uses[tmp.id()])
4036          ctx.uses[conv->operands[0].tempId()]++;
4037       instr->operands[i].setTemp(conv->operands[0].getTemp());
4038       if (conv->definitions[0].isPrecise())
4039          instr->definitions[0].setPrecise(true);
4040       instr->valu().opsel_hi[i] = true;
4041       if (conv->isSDWA() && conv->sdwa().sel[0].offset() == 2)
4042          instr->valu().opsel_lo[i] = true;
4043       else
4044          instr->valu().opsel_lo[i] = conv->valu().opsel[0];
4045       bool neg = conv->valu().neg[0];
4046       bool abs = conv->valu().abs[0];
4047       if (!instr->valu().abs[i]) {
4048          instr->valu().neg[i] ^= neg;
4049          instr->valu().abs[i] = abs;
4050       }
4051    }
4052 }
4053
4054 // TODO: we could possibly move the whole label_instruction pass to combine_instruction:
4055 // this would mean that we'd have to fix the instruction uses while value propagation
4056
4057 /* also returns true for inf */
4058 bool
4059 is_pow_of_two(opt_ctx& ctx, Operand op)
4060 {
4061    if (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(op.bytes() * 8))
4062       return is_pow_of_two(ctx, get_constant_op(ctx, ctx.info[op.tempId()], op.bytes() * 8));
4063    else if (!op.isConstant())
4064       return false;
4065
4066    uint64_t val = op.constantValue64();
4067
4068    if (op.bytes() == 4) {
4069       uint32_t exponent = (val & 0x7f800000) >> 23;
4070       uint32_t fraction = val & 0x007fffff;
4071       return (exponent >= 127) && (fraction == 0);
4072    } else if (op.bytes() == 2) {
4073       uint32_t exponent = (val & 0x7c00) >> 10;
4074       uint32_t fraction = val & 0x03ff;
4075       return (exponent >= 15) && (fraction == 0);
4076    } else {
4077       assert(op.bytes() == 8);
4078       uint64_t exponent = (val & UINT64_C(0x7ff0000000000000)) >> 52;
4079       uint64_t fraction = val & UINT64_C(0x000fffffffffffff);
4080       return (exponent >= 1023) && (fraction == 0);
4081    }
4082 }
4083
4084 void
4085 combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4086 {
4087    if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
4088       return;
4089
4090    if (instr->isVALU()) {
4091       /* Apply SDWA. Do this after label_instruction() so it can remove
4092        * label_extract if not all instructions can take SDWA. */
4093       for (unsigned i = 0; i < instr->operands.size(); i++) {
4094          Operand& op = instr->operands[i];
4095          if (!op.isTemp())
4096             continue;
4097          ssa_info& info = ctx.info[op.tempId()];
4098          if (!info.is_extract())
4099             continue;
4100          /* if there are that many uses, there are likely better combinations */
4101          // TODO: delay applying extract to a point where we know better
4102          if (ctx.uses[op.tempId()] > 4) {
4103             info.label &= ~label_extract;
4104             continue;
4105          }
4106          if (info.is_extract() &&
4107              (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
4108               instr->operands[i].getTemp().type() == RegType::sgpr) &&
4109              can_apply_extract(ctx, instr, i, info)) {
4110             /* Increase use count of the extract's operand if the extract still has uses. */
4111             apply_extract(ctx, instr, i, info);
4112             if (--ctx.uses[instr->operands[i].tempId()])
4113                ctx.uses[info.instr->operands[0].tempId()]++;
4114             instr->operands[i].setTemp(info.instr->operands[0].getTemp());
4115          }
4116       }
4117
4118       if (can_apply_sgprs(ctx, instr))
4119          apply_sgprs(ctx, instr);
4120       combine_mad_mix(ctx, instr);
4121       while (apply_omod_clamp(ctx, instr) || combine_output_conversion(ctx, instr))
4122          ;
4123       apply_insert(ctx, instr);
4124    }
4125
4126    if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 &&
4127        instr->opcode != aco_opcode::v_fma_mixlo_f16)
4128       return combine_vop3p(ctx, instr);
4129
4130    if (instr->isSDWA() || instr->isDPP())
4131       return;
4132
4133    if (instr->opcode == aco_opcode::p_extract) {
4134       ssa_info& info = ctx.info[instr->operands[0].tempId()];
4135       if (info.is_extract() && can_apply_extract(ctx, instr, 0, info)) {
4136          apply_extract(ctx, instr, 0, info);
4137          if (--ctx.uses[instr->operands[0].tempId()])
4138             ctx.uses[info.instr->operands[0].tempId()]++;
4139          instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4140       }
4141
4142       apply_ds_extract(ctx, instr);
4143    }
4144
4145    if (instr->isVOPC()) {
4146       if (optimize_cmp_subgroup_invocation(ctx, instr))
4147          return;
4148    }
4149
4150    /* TODO: There are still some peephole optimizations that could be done:
4151     * - abs(a - b) -> s_absdiff_i32
4152     * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
4153     * - patterns for v_alignbit_b32 and v_alignbyte_b32
4154     * These aren't probably too interesting though.
4155     * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
4156     * probably more useful than the previously mentioned optimizations.
4157     * The various comparison optimizations also currently only work with 32-bit
4158     * floats. */
4159
4160    /* neg(mul(a, b)) -> mul(neg(a), b), abs(mul(a, b)) -> mul(abs(a), abs(b)) */
4161    if ((ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)) &&
4162        ctx.uses[instr->operands[1].tempId()] == 1) {
4163       Temp val = ctx.info[instr->definitions[0].tempId()].temp;
4164
4165       if (!ctx.info[val.id()].is_mul())
4166          return;
4167
4168       Instruction* mul_instr = ctx.info[val.id()].instr;
4169
4170       if (mul_instr->operands[0].isLiteral())
4171          return;
4172       if (mul_instr->valu().clamp)
4173          return;
4174       if (mul_instr->isSDWA() || mul_instr->isDPP())
4175          return;
4176       if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32 &&
4177           ctx.fp_mode.preserve_signed_zero_inf_nan32)
4178          return;
4179       if (mul_instr->definitions[0].bytes() != instr->definitions[0].bytes())
4180          return;
4181
4182       /* convert to mul(neg(a), b), mul(abs(a), abs(b)) or mul(neg(abs(a)), abs(b)) */
4183       ctx.uses[mul_instr->definitions[0].tempId()]--;
4184       Definition def = instr->definitions[0];
4185       bool is_neg = ctx.info[instr->definitions[0].tempId()].is_neg();
4186       bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
4187       uint32_t pass_flags = instr->pass_flags;
4188       Format format = mul_instr->format == Format::VOP2 ? asVOP3(Format::VOP2) : mul_instr->format;
4189       instr.reset(create_instruction<VALU_instruction>(mul_instr->opcode, format,
4190                                                        mul_instr->operands.size(), 1));
4191       std::copy(mul_instr->operands.cbegin(), mul_instr->operands.cend(), instr->operands.begin());
4192       instr->pass_flags = pass_flags;
4193       instr->definitions[0] = def;
4194       VALU_instruction& new_mul = instr->valu();
4195       VALU_instruction& mul = mul_instr->valu();
4196       new_mul.neg = mul.neg;
4197       new_mul.abs = mul.abs;
4198       new_mul.omod = mul.omod;
4199       new_mul.opsel = mul.opsel;
4200       new_mul.opsel_lo = mul.opsel_lo;
4201       new_mul.opsel_hi = mul.opsel_hi;
4202       if (is_abs) {
4203          new_mul.neg[0] = new_mul.neg[1] = false;
4204          new_mul.abs[0] = new_mul.abs[1] = true;
4205       }
4206       new_mul.neg[0] ^= is_neg;
4207       new_mul.clamp = false;
4208
4209       ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
4210       return;
4211    }
4212
4213    /* combine mul+add -> mad */
4214    bool is_add_mix =
4215       (instr->opcode == aco_opcode::v_fma_mix_f32 ||
4216        instr->opcode == aco_opcode::v_fma_mixlo_f16) &&
4217       !instr->valu().neg_lo[0] &&
4218       ((instr->operands[0].constantEquals(0x3f800000) && !instr->valu().opsel_hi[0]) ||
4219        (instr->operands[0].constantEquals(0x3C00) && instr->valu().opsel_hi[0] &&
4220         !instr->valu().opsel_lo[0]));
4221    bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
4222                 instr->opcode == aco_opcode::v_subrev_f32;
4223    bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
4224                 instr->opcode == aco_opcode::v_subrev_f16;
4225    bool mad64 = instr->opcode == aco_opcode::v_add_f64;
4226    if (is_add_mix || mad16 || mad32 || mad64) {
4227       Instruction* mul_instr = nullptr;
4228       unsigned add_op_idx = 0;
4229       uint32_t uses = UINT32_MAX;
4230       bool emit_fma = false;
4231       /* find the 'best' mul instruction to combine with the add */
4232       for (unsigned i = is_add_mix ? 1 : 0; i < instr->operands.size(); i++) {
4233          if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
4234             continue;
4235          ssa_info& info = ctx.info[instr->operands[i].tempId()];
4236
4237          /* no clamp/omod allowed between mul and add */
4238          if (info.instr->isVOP3() && (info.instr->valu().clamp || info.instr->valu().omod))
4239             continue;
4240          if (info.instr->isVOP3P() && info.instr->valu().clamp)
4241             continue;
4242          /* v_fma_mix_f32/etc can't do omod */
4243          if (info.instr->isVOP3P() && instr->isVOP3() && instr->valu().omod)
4244             continue;
4245          /* don't promote fp16 to fp32 or remove fp32->fp16->fp32 conversions */
4246          if (is_add_mix && info.instr->definitions[0].bytes() == 2)
4247             continue;
4248
4249          if (get_operand_size(instr, i) != info.instr->definitions[0].bytes() * 8)
4250             continue;
4251
4252          bool legacy = info.instr->opcode == aco_opcode::v_mul_legacy_f32;
4253          bool mad_mix = is_add_mix || info.instr->isVOP3P();
4254
4255          /* Multiplication by power-of-two should never need rounding. 1/power-of-two also works,
4256           * but using fma removes denormal flushing (0xfffffe * 0.5 + 0x810001a2).
4257           */
4258          bool is_fma_precise = is_pow_of_two(ctx, info.instr->operands[0]) ||
4259                                is_pow_of_two(ctx, info.instr->operands[1]);
4260
4261          bool has_fma = mad16 || mad64 || (legacy && ctx.program->gfx_level >= GFX10_3) ||
4262                         (mad32 && !legacy && !mad_mix && ctx.program->dev.has_fast_fma32) ||
4263                         (mad_mix && ctx.program->dev.fused_mad_mix);
4264          bool has_mad = mad_mix ? !ctx.program->dev.fused_mad_mix
4265                                 : ((mad32 && ctx.program->gfx_level < GFX10_3) ||
4266                                    (mad16 && ctx.program->gfx_level <= GFX9));
4267          bool can_use_fma =
4268             has_fma &&
4269             (!(info.instr->definitions[0].isPrecise() || instr->definitions[0].isPrecise()) ||
4270              is_fma_precise);
4271          bool can_use_mad =
4272             has_mad && (mad_mix || mad32 ? ctx.fp_mode.denorm32 : ctx.fp_mode.denorm16_64) == 0;
4273          if (mad_mix && legacy)
4274             continue;
4275          if (!can_use_fma && !can_use_mad)
4276             continue;
4277
4278          unsigned candidate_add_op_idx = is_add_mix ? (3 - i) : (1 - i);
4279          Operand op[3] = {info.instr->operands[0], info.instr->operands[1],
4280                           instr->operands[candidate_add_op_idx]};
4281          if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
4282              ctx.uses[instr->operands[i].tempId()] > uses)
4283             continue;
4284
4285          if (ctx.uses[instr->operands[i].tempId()] == uses) {
4286             unsigned cur_idx = mul_instr->definitions[0].tempId();
4287             unsigned new_idx = info.instr->definitions[0].tempId();
4288             if (cur_idx > new_idx)
4289                continue;
4290          }
4291
4292          mul_instr = info.instr;
4293          add_op_idx = candidate_add_op_idx;
4294          uses = ctx.uses[instr->operands[i].tempId()];
4295          emit_fma = !can_use_mad;
4296       }
4297
4298       if (mul_instr) {
4299          /* turn mul+add into v_mad/v_fma */
4300          Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
4301                           instr->operands[add_op_idx]};
4302          ctx.uses[mul_instr->definitions[0].tempId()]--;
4303          if (ctx.uses[mul_instr->definitions[0].tempId()]) {
4304             if (op[0].isTemp())
4305                ctx.uses[op[0].tempId()]++;
4306             if (op[1].isTemp())
4307                ctx.uses[op[1].tempId()]++;
4308          }
4309
4310          bool neg[3] = {false, false, false};
4311          bool abs[3] = {false, false, false};
4312          unsigned omod = 0;
4313          bool clamp = false;
4314          bitarray8 opsel_lo = 0;
4315          bitarray8 opsel_hi = 0;
4316          bitarray8 opsel = 0;
4317          unsigned mul_op_idx = (instr->isVOP3P() ? 3 : 1) - add_op_idx;
4318
4319          VALU_instruction& valu_mul = mul_instr->valu();
4320          neg[0] = valu_mul.neg[0];
4321          neg[1] = valu_mul.neg[1];
4322          abs[0] = valu_mul.abs[0];
4323          abs[1] = valu_mul.abs[1];
4324          opsel_lo = valu_mul.opsel_lo & 0x3;
4325          opsel_hi = valu_mul.opsel_hi & 0x3;
4326          opsel = valu_mul.opsel & 0x3;
4327
4328          VALU_instruction& valu = instr->valu();
4329          neg[2] = valu.neg[add_op_idx];
4330          abs[2] = valu.abs[add_op_idx];
4331          opsel_lo[2] = valu.opsel_lo[add_op_idx];
4332          opsel_hi[2] = valu.opsel_hi[add_op_idx];
4333          opsel[2] = valu.opsel[add_op_idx];
4334          opsel[3] = valu.opsel[3];
4335          omod = valu.omod;
4336          clamp = valu.clamp;
4337          /* abs of the multiplication result */
4338          if (valu.abs[mul_op_idx]) {
4339             neg[0] = false;
4340             neg[1] = false;
4341             abs[0] = true;
4342             abs[1] = true;
4343          }
4344          /* neg of the multiplication result */
4345          neg[1] ^= valu.neg[mul_op_idx];
4346
4347          if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
4348             neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
4349          else if (instr->opcode == aco_opcode::v_subrev_f32 ||
4350                   instr->opcode == aco_opcode::v_subrev_f16)
4351             neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
4352
4353          aco_ptr<Instruction> add_instr = std::move(instr);
4354          aco_ptr<VALU_instruction> mad;
4355          if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
4356             assert(!omod);
4357             assert(!opsel);
4358
4359             aco_opcode mad_op = add_instr->definitions[0].bytes() == 2 ? aco_opcode::v_fma_mixlo_f16
4360                                                                        : aco_opcode::v_fma_mix_f32;
4361             mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3P, 3, 1));
4362          } else {
4363             assert(!opsel_lo);
4364             assert(!opsel_hi);
4365
4366             aco_opcode mad_op = emit_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
4367             if (mul_instr->opcode == aco_opcode::v_mul_legacy_f32) {
4368                assert(emit_fma == (ctx.program->gfx_level >= GFX10_3));
4369                mad_op = emit_fma ? aco_opcode::v_fma_legacy_f32 : aco_opcode::v_mad_legacy_f32;
4370             } else if (mad16) {
4371                mad_op = emit_fma ? (ctx.program->gfx_level == GFX8 ? aco_opcode::v_fma_legacy_f16
4372                                                                    : aco_opcode::v_fma_f16)
4373                                  : (ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_f16
4374                                                                    : aco_opcode::v_mad_f16);
4375             } else if (mad64) {
4376                mad_op = aco_opcode::v_fma_f64;
4377             }
4378
4379             mad.reset(create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1));
4380          }
4381
4382          for (unsigned i = 0; i < 3; i++) {
4383             mad->operands[i] = op[i];
4384             mad->neg[i] = neg[i];
4385             mad->abs[i] = abs[i];
4386          }
4387          mad->omod = omod;
4388          mad->clamp = clamp;
4389          mad->opsel_lo = opsel_lo;
4390          mad->opsel_hi = opsel_hi;
4391          mad->opsel = opsel;
4392          mad->definitions[0] = add_instr->definitions[0];
4393          mad->definitions[0].setPrecise(add_instr->definitions[0].isPrecise() ||
4394                                         mul_instr->definitions[0].isPrecise());
4395          mad->pass_flags = add_instr->pass_flags;
4396
4397          instr = std::move(mad);
4398
4399          /* mark this ssa_def to be re-checked for profitability and literals */
4400          ctx.mad_infos.emplace_back(std::move(add_instr), mul_instr->definitions[0].tempId());
4401          ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
4402          return;
4403       }
4404    }
4405    /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
4406    else if (((instr->opcode == aco_opcode::v_mul_f32 &&
4407               !ctx.fp_mode.preserve_signed_zero_inf_nan32) ||
4408              instr->opcode == aco_opcode::v_mul_legacy_f32) &&
4409             !instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) {
4410       for (unsigned i = 0; i < 2; i++) {
4411          if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
4412              ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
4413              instr->operands[!i].getTemp().type() == RegType::vgpr) {
4414             ctx.uses[instr->operands[i].tempId()]--;
4415             ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
4416
4417             aco_ptr<VALU_instruction> new_instr{
4418                create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
4419             new_instr->operands[0] = Operand::zero();
4420             new_instr->operands[1] = instr->operands[!i];
4421             new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
4422             new_instr->definitions[0] = instr->definitions[0];
4423             new_instr->pass_flags = instr->pass_flags;
4424             instr = std::move(new_instr);
4425             ctx.info[instr->definitions[0].tempId()].label = 0;
4426             return;
4427          }
4428       }
4429    } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->gfx_level >= GFX9) {
4430       if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
4431                                 1 | 2)) {
4432       } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
4433                                        "012", 1 | 2)) {
4434       } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4435       } else if (combine_v_andor_not(ctx, instr)) {
4436       }
4437    } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) {
4438       if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
4439                                 1 | 2)) {
4440       } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
4441                                        "012", 1 | 2)) {
4442       } else if (combine_xor_not(ctx, instr)) {
4443       }
4444    } else if (instr->opcode == aco_opcode::v_not_b32 && ctx.program->gfx_level >= GFX10) {
4445       combine_not_xor(ctx, instr);
4446    } else if (instr->opcode == aco_opcode::v_add_u16) {
4447       combine_three_valu_op(
4448          ctx, instr, aco_opcode::v_mul_lo_u16,
4449          ctx.program->gfx_level == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16,
4450          "120", 1 | 2);
4451    } else if (instr->opcode == aco_opcode::v_add_u16_e64) {
4452       combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120",
4453                             1 | 2);
4454    } else if (instr->opcode == aco_opcode::v_add_u32) {
4455       if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4456       } else if (combine_add_bcnt(ctx, instr)) {
4457       } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4458                                        aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4459       } else if (ctx.program->gfx_level >= GFX9 && !instr->usesModifiers()) {
4460          if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
4461                                    1 | 2)) {
4462          } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
4463                                           "120", 1 | 2)) {
4464          } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
4465                                           "012", 1 | 2)) {
4466          } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
4467                                           "012", 1 | 2)) {
4468          } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
4469                                           "012", 1 | 2)) {
4470          } else if (combine_add_or_then_and_lshl(ctx, instr)) {
4471          }
4472       }
4473    } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
4474               instr->opcode == aco_opcode::v_add_co_u32_e64) {
4475       bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
4476       if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
4477       } else if (!carry_out && combine_add_bcnt(ctx, instr)) {
4478       } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
4479                                                      aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
4480       } else if (!carry_out && combine_add_lshl(ctx, instr, false)) {
4481       }
4482    } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
4483               instr->opcode == aco_opcode::v_sub_co_u32_e64) {
4484       bool carry_out =
4485          instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0;
4486       if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) {
4487       } else if (!carry_out && combine_add_lshl(ctx, instr, true)) {
4488       }
4489    } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
4490               instr->opcode == aco_opcode::v_subrev_co_u32 ||
4491               instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
4492       combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
4493    } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->gfx_level >= GFX9) {
4494       combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
4495                             2);
4496    } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
4497               ctx.program->gfx_level >= GFX9) {
4498       combine_salu_lshl_add(ctx, instr);
4499    } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
4500       if (!combine_salu_not_bitwise(ctx, instr))
4501          combine_inverse_comparison(ctx, instr);
4502    } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
4503               instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
4504       if (combine_ordering_test(ctx, instr)) {
4505       } else if (combine_comparison_ordering(ctx, instr)) {
4506       } else if (combine_constant_comparison_ordering(ctx, instr)) {
4507       } else if (combine_salu_n2(ctx, instr)) {
4508       }
4509    } else if (instr->opcode == aco_opcode::s_abs_i32) {
4510       combine_sabsdiff(ctx, instr);
4511    } else if (instr->opcode == aco_opcode::v_and_b32) {
4512       if (combine_and_subbrev(ctx, instr)) {
4513       } else if (combine_v_andor_not(ctx, instr)) {
4514       }
4515    } else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) {
4516       /* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32.
4517        * since ctx.uses[mad_info::mul_temp_id] is always 0, we don't have to worry about
4518        * select_instruction() using mad_info::add_instr.
4519        */
4520       ctx.mad_infos.emplace_back(nullptr, 0);
4521       ctx.info[instr->definitions[0].tempId()].set_mad(ctx.mad_infos.size() - 1);
4522    } else if (instr->opcode == aco_opcode::v_med3_f32 || instr->opcode == aco_opcode::v_med3_f16) {
4523       unsigned idx;
4524       if (detect_clamp(instr.get(), &idx)) {
4525          instr->format = asVOP3(Format::VOP2);
4526          instr->operands[0] = instr->operands[idx];
4527          instr->operands[1] = Operand::zero();
4528          instr->opcode =
4529             instr->opcode == aco_opcode::v_med3_f32 ? aco_opcode::v_add_f32 : aco_opcode::v_add_f16;
4530          instr->valu().clamp = true;
4531          instr->valu().abs = (uint8_t)instr->valu().abs[idx];
4532          instr->valu().neg = (uint8_t)instr->valu().neg[idx];
4533          instr->operands.pop_back();
4534       }
4535    } else {
4536       aco_opcode min, max, min3, max3, med3, minmax;
4537       bool some_gfx9_only;
4538       if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
4539                           &some_gfx9_only) &&
4540           (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
4541          if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
4542                             instr->opcode == min ? min3 : max3, minmax)) {
4543          } else {
4544             combine_clamp(ctx, instr, min, max, med3);
4545          }
4546       }
4547    }
4548 }
4549
4550 bool
4551 to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4552 {
4553    /* Check every operand to make sure they are suitable. */
4554    for (Operand& op : instr->operands) {
4555       if (!op.isTemp())
4556          return false;
4557       if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
4558          return false;
4559    }
4560
4561    switch (instr->opcode) {
4562    case aco_opcode::s_and_b32:
4563    case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
4564    case aco_opcode::s_or_b32:
4565    case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
4566    case aco_opcode::s_xor_b32:
4567    case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
4568    default:
4569       /* Don't transform other instructions. They are very unlikely to appear here. */
4570       return false;
4571    }
4572
4573    for (Operand& op : instr->operands) {
4574       ctx.uses[op.tempId()]--;
4575
4576       if (ctx.info[op.tempId()].is_uniform_bool()) {
4577          /* Just use the uniform boolean temp. */
4578          op.setTemp(ctx.info[op.tempId()].temp);
4579       } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
4580          /* Use the SCC definition of the predecessor instruction.
4581           * This allows the predecessor to get picked up by the same optimization (if it has no
4582           * divergent users), and it also makes sure that the current instruction will keep working
4583           * even if the predecessor won't be transformed.
4584           */
4585          Instruction* pred_instr = ctx.info[op.tempId()].instr;
4586          assert(pred_instr->definitions.size() >= 2);
4587          assert(pred_instr->definitions[1].isFixed() &&
4588                 pred_instr->definitions[1].physReg() == scc);
4589          op.setTemp(pred_instr->definitions[1].getTemp());
4590       } else {
4591          unreachable("Invalid operand on uniform bitwise instruction.");
4592       }
4593
4594       ctx.uses[op.tempId()]++;
4595    }
4596
4597    instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
4598    assert(instr->operands[0].regClass() == s1);
4599    assert(instr->operands[1].regClass() == s1);
4600    return true;
4601 }
4602
4603 void
4604 select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
4605 {
4606    const uint32_t threshold = 4;
4607
4608    if (is_dead(ctx.uses, instr.get())) {
4609       instr.reset();
4610       return;
4611    }
4612
4613    /* convert split_vector into a copy or extract_vector if only one definition is ever used */
4614    if (instr->opcode == aco_opcode::p_split_vector) {
4615       unsigned num_used = 0;
4616       unsigned idx = 0;
4617       unsigned split_offset = 0;
4618       for (unsigned i = 0, offset = 0; i < instr->definitions.size();
4619            offset += instr->definitions[i++].bytes()) {
4620          if (ctx.uses[instr->definitions[i].tempId()]) {
4621             num_used++;
4622             idx = i;
4623             split_offset = offset;
4624          }
4625       }
4626       bool done = false;
4627       if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
4628           ctx.uses[instr->operands[0].tempId()] == 1) {
4629          Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
4630
4631          unsigned off = 0;
4632          Operand op;
4633          for (Operand& vec_op : vec->operands) {
4634             if (off == split_offset) {
4635                op = vec_op;
4636                break;
4637             }
4638             off += vec_op.bytes();
4639          }
4640          if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
4641             ctx.uses[instr->operands[0].tempId()]--;
4642             for (Operand& vec_op : vec->operands) {
4643                if (vec_op.isTemp())
4644                   ctx.uses[vec_op.tempId()]--;
4645             }
4646             if (op.isTemp())
4647                ctx.uses[op.tempId()]++;
4648
4649             aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4650                aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
4651             extract->operands[0] = op;
4652             extract->definitions[0] = instr->definitions[idx];
4653             instr = std::move(extract);
4654
4655             done = true;
4656          }
4657       }
4658
4659       if (!done && num_used == 1 &&
4660           instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
4661           split_offset % instr->definitions[idx].bytes() == 0) {
4662          aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
4663             aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
4664          extract->operands[0] = instr->operands[0];
4665          extract->operands[1] =
4666             Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
4667          extract->definitions[0] = instr->definitions[idx];
4668          instr = std::move(extract);
4669       }
4670    }
4671
4672    mad_info* mad_info = NULL;
4673    if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
4674       mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
4675       /* re-check mad instructions */
4676       if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
4677          ctx.uses[mad_info->mul_temp_id]++;
4678          if (instr->operands[0].isTemp())
4679             ctx.uses[instr->operands[0].tempId()]--;
4680          if (instr->operands[1].isTemp())
4681             ctx.uses[instr->operands[1].tempId()]--;
4682          instr.swap(mad_info->add_instr);
4683          mad_info = NULL;
4684       }
4685       /* check literals */
4686       else if (!instr->isDPP() && !instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_f64 &&
4687                instr->opcode != aco_opcode::v_mad_legacy_f32 &&
4688                instr->opcode != aco_opcode::v_fma_legacy_f32) {
4689          /* FMA can only take literals on GFX10+ */
4690          if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
4691              ctx.program->gfx_level < GFX10)
4692             return;
4693          /* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
4694           * literals (GFX10+), these instructions don't exist.
4695           */
4696          if (instr->opcode == aco_opcode::v_fma_legacy_f16)
4697             return;
4698
4699          uint32_t literal_mask = 0;
4700          uint32_t fp16_mask = 0;
4701          uint32_t sgpr_mask = 0;
4702          uint32_t vgpr_mask = 0;
4703          uint32_t literal_uses = UINT32_MAX;
4704          uint32_t literal_value = 0;
4705
4706          /* Iterate in reverse to prefer v_madak/v_fmaak. */
4707          for (int i = 2; i >= 0; i--) {
4708             Operand& op = instr->operands[i];
4709             if (!op.isTemp())
4710                continue;
4711             if (ctx.info[op.tempId()].is_literal(get_operand_size(instr, i))) {
4712                uint32_t new_literal = ctx.info[op.tempId()].val;
4713                float value = uif(new_literal);
4714                uint16_t fp16_val = _mesa_float_to_half(value);
4715                bool is_denorm = (fp16_val & 0x7fff) != 0 && (fp16_val & 0x7fff) <= 0x3ff;
4716                if (_mesa_half_to_float(fp16_val) == value &&
4717                    (!is_denorm || (ctx.fp_mode.denorm16_64 & fp_denorm_keep_in)))
4718                   fp16_mask |= 1 << i;
4719
4720                if (!literal_mask || literal_value == new_literal) {
4721                   literal_value = new_literal;
4722                   literal_uses = MIN2(literal_uses, ctx.uses[op.tempId()]);
4723                   literal_mask |= 1 << i;
4724                   continue;
4725                }
4726             }
4727             sgpr_mask |= op.isOfType(RegType::sgpr) << i;
4728             vgpr_mask |= op.isOfType(RegType::vgpr) << i;
4729          }
4730
4731          /* The constant bus limitations before GFX10 disallows SGPRs. */
4732          if (sgpr_mask && ctx.program->gfx_level < GFX10)
4733             literal_mask = 0;
4734
4735          /* Encoding needs a vgpr. */
4736          if (!vgpr_mask)
4737             literal_mask = 0;
4738
4739          /* v_madmk/v_fmamk needs a vgpr in the third source. */
4740          if (!(literal_mask & 0b100) && !(vgpr_mask & 0b100))
4741             literal_mask = 0;
4742
4743          /* opsel with GFX11+ is the only modifier supported by fmamk/fmaak*/
4744          if (instr->valu().abs || instr->valu().neg || instr->valu().omod || instr->valu().clamp ||
4745              (instr->valu().opsel && ctx.program->gfx_level < GFX11))
4746             literal_mask = 0;
4747
4748          if (instr->valu().opsel & ~vgpr_mask)
4749             literal_mask = 0;
4750
4751          /* We can't use three unique fp16 literals */
4752          if (fp16_mask == 0b111)
4753             fp16_mask = 0b11;
4754
4755          if ((instr->opcode == aco_opcode::v_fma_f32 ||
4756               (instr->opcode == aco_opcode::v_mad_f32 && !instr->definitions[0].isPrecise())) &&
4757              !instr->valu().omod && ctx.program->gfx_level >= GFX10 &&
4758              util_bitcount(fp16_mask) > std::max<uint32_t>(util_bitcount(literal_mask), 1)) {
4759             assert(ctx.program->dev.fused_mad_mix);
4760             u_foreach_bit (i, fp16_mask)
4761                ctx.uses[instr->operands[i].tempId()]--;
4762             mad_info->fp16_mask = fp16_mask;
4763             return;
4764          }
4765
4766          /* Limit the number of literals to apply to not increase the code
4767           * size too much, but always apply literals for v_mad->v_madak
4768           * because both instructions are 64-bit and this doesn't increase
4769           * code size.
4770           * TODO: try to apply the literals earlier to lower the number of
4771           * uses below threshold
4772           */
4773          if (literal_mask && (literal_uses < threshold || (literal_mask & 0b100))) {
4774             u_foreach_bit (i, literal_mask)
4775                ctx.uses[instr->operands[i].tempId()]--;
4776             mad_info->literal_mask = literal_mask;
4777             return;
4778          }
4779       }
4780    }
4781
4782    /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
4783     * when it isn't beneficial */
4784    if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
4785        instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
4786       ctx.info[instr->operands[0].tempId()].set_scc_needed();
4787       return;
4788    } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
4789                instr->opcode == aco_opcode::s_cselect_b32) &&
4790               instr->operands[2].isTemp()) {
4791       ctx.info[instr->operands[2].tempId()].set_scc_needed();
4792    }
4793
4794    /* check for literals */
4795    if (!instr->isSALU() && !instr->isVALU())
4796       return;
4797
4798    /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
4799    if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
4800        ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
4801       bool transform_done = to_uniform_bool_instr(ctx, instr);
4802
4803       if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
4804          /* Swap the two definition IDs in order to avoid overusing the SCC.
4805           * This reduces extra moves generated by RA. */
4806          uint32_t def0_id = instr->definitions[0].getTemp().id();
4807          uint32_t def1_id = instr->definitions[1].getTemp().id();
4808          instr->definitions[0].setTemp(Temp(def1_id, s1));
4809          instr->definitions[1].setTemp(Temp(def0_id, s1));
4810       }
4811
4812       return;
4813    }
4814
4815    /* This optimization is done late in order to be able to apply otherwise
4816     * unsafe optimizations such as the inverse comparison optimization.
4817     */
4818    if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_and_b64) {
4819       if (instr->operands[0].isTemp() && fixed_to_exec(instr->operands[1]) &&
4820           ctx.uses[instr->operands[0].tempId()] == 1 &&
4821           ctx.uses[instr->definitions[1].tempId()] == 0 &&
4822           can_eliminate_and_exec(ctx, instr->operands[0].getTemp(), instr->pass_flags)) {
4823          ctx.uses[instr->operands[0].tempId()]--;
4824          ctx.info[instr->operands[0].tempId()].instr->definitions[0].setTemp(
4825             instr->definitions[0].getTemp());
4826          instr.reset();
4827          return;
4828       }
4829    }
4830
4831    /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
4832    if (instr->isVALU() && !instr->isDPP()) {
4833       for (unsigned i = 0; i < instr->operands.size(); i++) {
4834          if (!instr->operands[i].isTemp())
4835             continue;
4836          ssa_info info = ctx.info[instr->operands[i].tempId()];
4837
4838          if (!info.is_dpp() || info.instr->pass_flags != instr->pass_flags)
4839             continue;
4840
4841          /* We won't eliminate the DPP mov if the operand is used twice */
4842          bool op_used_twice = false;
4843          for (unsigned j = 0; j < instr->operands.size(); j++)
4844             op_used_twice |= i != j && instr->operands[i] == instr->operands[j];
4845          if (op_used_twice)
4846             continue;
4847
4848          if (i != 0) {
4849             if (!can_swap_operands(instr, &instr->opcode, 0, i))
4850                continue;
4851             instr->valu().swapOperands(0, i);
4852          }
4853
4854          if (!can_use_DPP(ctx.program->gfx_level, instr, info.is_dpp8()))
4855             continue;
4856
4857          bool dpp8 = info.is_dpp8();
4858          bool input_mods = can_use_input_modifiers(ctx.program->gfx_level, instr->opcode, 0) &&
4859                            get_operand_size(instr, 0) == 32;
4860          bool mov_uses_mods = info.instr->valu().neg[0] || info.instr->valu().abs[0];
4861          if (((dpp8 && ctx.program->gfx_level < GFX11) || !input_mods) && mov_uses_mods)
4862             continue;
4863
4864          convert_to_DPP(ctx.program->gfx_level, instr, dpp8);
4865
4866          if (dpp8) {
4867             DPP8_instruction* dpp = &instr->dpp8();
4868             for (unsigned j = 0; j < 8; ++j)
4869                dpp->lane_sel[j] = info.instr->dpp8().lane_sel[j];
4870             if (mov_uses_mods)
4871                instr->format = asVOP3(instr->format);
4872          } else {
4873             DPP16_instruction* dpp = &instr->dpp16();
4874             dpp->dpp_ctrl = info.instr->dpp16().dpp_ctrl;
4875             dpp->bound_ctrl = info.instr->dpp16().bound_ctrl;
4876          }
4877
4878          instr->valu().neg[0] ^= info.instr->valu().neg[0] && !instr->valu().abs[0];
4879          instr->valu().abs[0] |= info.instr->valu().abs[0];
4880
4881          if (--ctx.uses[info.instr->definitions[0].tempId()])
4882             ctx.uses[info.instr->operands[0].tempId()]++;
4883          instr->operands[0].setTemp(info.instr->operands[0].getTemp());
4884          break;
4885       }
4886    }
4887
4888    /* Use v_fma_mix for f2f32/f2f16 if it has higher throughput.
4889     * Do this late to not disturb other optimizations.
4890     */
4891    if ((instr->opcode == aco_opcode::v_cvt_f32_f16 || instr->opcode == aco_opcode::v_cvt_f16_f32) &&
4892        ctx.program->gfx_level >= GFX11 && ctx.program->wave_size == 64 && !instr->valu().omod &&
4893        !instr->isDPP()) {
4894       bool is_f2f16 = instr->opcode == aco_opcode::v_cvt_f16_f32;
4895       Instruction* fma = create_instruction<VALU_instruction>(
4896          is_f2f16 ? aco_opcode::v_fma_mixlo_f16 : aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1);
4897       fma->definitions[0] = instr->definitions[0];
4898       fma->operands[0] = instr->operands[0];
4899       fma->valu().opsel_hi[0] = !is_f2f16;
4900       fma->valu().opsel_lo[0] = instr->valu().opsel[0];
4901       fma->valu().clamp = instr->valu().clamp;
4902       fma->valu().abs[0] = instr->valu().abs[0];
4903       fma->valu().neg[0] = instr->valu().neg[0];
4904       fma->operands[1] = Operand::c32(fui(1.0f));
4905       fma->operands[2] = Operand::zero();
4906       /* fma_mix is only dual issued if dst and acc type match */
4907       fma->valu().opsel_hi[2] = is_f2f16;
4908       fma->valu().neg[2] = true;
4909       instr.reset(fma);
4910       ctx.info[instr->definitions[0].tempId()].label = 0;
4911    }
4912
4913    if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
4914        (instr->isVOP3P() && ctx.program->gfx_level < GFX10))
4915       return; /* some encodings can't ever take literals */
4916
4917    /* we do not apply the literals yet as we don't know if it is profitable */
4918    Operand current_literal(s1);
4919
4920    unsigned literal_id = 0;
4921    unsigned literal_uses = UINT32_MAX;
4922    Operand literal(s1);
4923    unsigned num_operands = 1;
4924    if (instr->isSALU() || (ctx.program->gfx_level >= GFX10 &&
4925                            (can_use_VOP3(ctx, instr) || instr->isVOP3P()) && !instr->isDPP()))
4926       num_operands = instr->operands.size();
4927    /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
4928    else if (instr->isVALU() && instr->operands.size() >= 3)
4929       return;
4930
4931    unsigned sgpr_ids[2] = {0, 0};
4932    bool is_literal_sgpr = false;
4933    uint32_t mask = 0;
4934
4935    /* choose a literal to apply */
4936    for (unsigned i = 0; i < num_operands; i++) {
4937       Operand op = instr->operands[i];
4938       unsigned bits = get_operand_size(instr, i);
4939
4940       if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
4941           op.tempId() != sgpr_ids[0])
4942          sgpr_ids[!!sgpr_ids[0]] = op.tempId();
4943
4944       if (op.isLiteral()) {
4945          current_literal = op;
4946          continue;
4947       } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
4948          continue;
4949       }
4950
4951       if (!alu_can_accept_constant(instr, i))
4952          continue;
4953
4954       if (ctx.uses[op.tempId()] < literal_uses) {
4955          is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
4956          mask = 0;
4957          literal = Operand::c32(ctx.info[op.tempId()].val);
4958          literal_uses = ctx.uses[op.tempId()];
4959          literal_id = op.tempId();
4960       }
4961
4962       mask |= (op.tempId() == literal_id) << i;
4963    }
4964
4965    /* don't go over the constant bus limit */
4966    bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
4967                      instr->opcode == aco_opcode::v_lshrrev_b64 ||
4968                      instr->opcode == aco_opcode::v_ashrrev_i64;
4969    unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
4970    if (ctx.program->gfx_level >= GFX10 && !is_shift64)
4971       const_bus_limit = 2;
4972
4973    unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
4974    if (num_sgprs == const_bus_limit && !is_literal_sgpr)
4975       return;
4976
4977    if (literal_id && literal_uses < threshold &&
4978        (current_literal.isUndefined() ||
4979         (current_literal.size() == literal.size() &&
4980          current_literal.constantValue() == literal.constantValue()))) {
4981       /* mark the literal to be applied */
4982       while (mask) {
4983          unsigned i = u_bit_scan(&mask);
4984          if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
4985             ctx.uses[instr->operands[i].tempId()]--;
4986       }
4987    }
4988 }
4989
4990 static aco_opcode
4991 sopk_opcode_for_sopc(aco_opcode opcode)
4992 {
4993 #define CTOK(op)                                                                                   \
4994    case aco_opcode::s_cmp_##op##_i32: return aco_opcode::s_cmpk_##op##_i32;                        \
4995    case aco_opcode::s_cmp_##op##_u32: return aco_opcode::s_cmpk_##op##_u32;
4996    switch (opcode) {
4997       CTOK(eq)
4998       CTOK(lg)
4999       CTOK(gt)
5000       CTOK(ge)
5001       CTOK(lt)
5002       CTOK(le)
5003    default: return aco_opcode::num_opcodes;
5004    }
5005 #undef CTOK
5006 }
5007
5008 static bool
5009 sopc_is_signed(aco_opcode opcode)
5010 {
5011 #define SOPC(op)                                                                                   \
5012    case aco_opcode::s_cmp_##op##_i32: return true;                                                 \
5013    case aco_opcode::s_cmp_##op##_u32: return false;
5014    switch (opcode) {
5015       SOPC(eq)
5016       SOPC(lg)
5017       SOPC(gt)
5018       SOPC(ge)
5019       SOPC(lt)
5020       SOPC(le)
5021    default: unreachable("Not a valid SOPC instruction.");
5022    }
5023 #undef SOPC
5024 }
5025
5026 static aco_opcode
5027 sopc_32_swapped(aco_opcode opcode)
5028 {
5029 #define SOPC(op1, op2)                                                                             \
5030    case aco_opcode::s_cmp_##op1##_i32: return aco_opcode::s_cmp_##op2##_i32;                       \
5031    case aco_opcode::s_cmp_##op1##_u32: return aco_opcode::s_cmp_##op2##_u32;
5032    switch (opcode) {
5033       SOPC(eq, eq)
5034       SOPC(lg, lg)
5035       SOPC(gt, lt)
5036       SOPC(ge, le)
5037       SOPC(lt, gt)
5038       SOPC(le, ge)
5039    default: return aco_opcode::num_opcodes;
5040    }
5041 #undef SOPC
5042 }
5043
5044 static void
5045 try_convert_sopc_to_sopk(aco_ptr<Instruction>& instr)
5046 {
5047    if (sopk_opcode_for_sopc(instr->opcode) == aco_opcode::num_opcodes)
5048       return;
5049
5050    if (instr->operands[0].isLiteral()) {
5051       std::swap(instr->operands[0], instr->operands[1]);
5052       instr->opcode = sopc_32_swapped(instr->opcode);
5053    }
5054
5055    if (!instr->operands[1].isLiteral())
5056       return;
5057
5058    if (instr->operands[0].isFixed() && instr->operands[0].physReg() >= 128)
5059       return;
5060
5061    uint32_t value = instr->operands[1].constantValue();
5062
5063    const uint32_t i16_mask = 0xffff8000u;
5064
5065    bool value_is_i16 = (value & i16_mask) == 0 || (value & i16_mask) == i16_mask;
5066    bool value_is_u16 = !(value & 0xffff0000u);
5067
5068    if (!value_is_i16 && !value_is_u16)
5069       return;
5070
5071    if (!value_is_i16 && sopc_is_signed(instr->opcode)) {
5072       if (instr->opcode == aco_opcode::s_cmp_lg_i32)
5073          instr->opcode = aco_opcode::s_cmp_lg_u32;
5074       else if (instr->opcode == aco_opcode::s_cmp_eq_i32)
5075          instr->opcode = aco_opcode::s_cmp_eq_u32;
5076       else
5077          return;
5078    } else if (!value_is_u16 && !sopc_is_signed(instr->opcode)) {
5079       if (instr->opcode == aco_opcode::s_cmp_lg_u32)
5080          instr->opcode = aco_opcode::s_cmp_lg_i32;
5081       else if (instr->opcode == aco_opcode::s_cmp_eq_u32)
5082          instr->opcode = aco_opcode::s_cmp_eq_i32;
5083       else
5084          return;
5085    }
5086
5087    static_assert(sizeof(SOPK_instruction) <= sizeof(SOPC_instruction),
5088                  "Invalid direct instruction cast.");
5089    instr->format = Format::SOPK;
5090    SOPK_instruction* instr_sopk = &instr->sopk();
5091
5092    instr_sopk->imm = instr_sopk->operands[1].constantValue() & 0xffff;
5093    instr_sopk->opcode = sopk_opcode_for_sopc(instr_sopk->opcode);
5094    instr_sopk->operands.pop_back();
5095 }
5096
5097 static void
5098 unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5099 {
5100    /* This opt is only beneficial for v_pk_fma_f16 because we can use v_pk_fmac_f16 if the
5101     * instruction doesn't use swizzles. */
5102    if (instr->opcode != aco_opcode::v_pk_fma_f16)
5103       return;
5104
5105    VALU_instruction& vop3p = instr->valu();
5106
5107    unsigned literal_swizzle = ~0u;
5108    for (unsigned i = 0; i < instr->operands.size(); i++) {
5109       if (!instr->operands[i].isLiteral())
5110          continue;
5111       unsigned new_swizzle = vop3p.opsel_lo[i] | (vop3p.opsel_hi[i] << 1);
5112       if (literal_swizzle != ~0u && new_swizzle != literal_swizzle)
5113          return; /* Literal swizzles conflict. */
5114       literal_swizzle = new_swizzle;
5115    }
5116
5117    if (literal_swizzle == 0b10 || literal_swizzle == ~0u)
5118       return; /* already unswizzled */
5119
5120    for (unsigned i = 0; i < instr->operands.size(); i++) {
5121       if (!instr->operands[i].isLiteral())
5122          continue;
5123       uint32_t literal = instr->operands[i].constantValue();
5124       literal = (literal >> (16 * (literal_swizzle & 0x1)) & 0xffff) |
5125                 (literal >> (8 * (literal_swizzle & 0x2)) << 16);
5126       instr->operands[i] = Operand::literal32(literal);
5127       vop3p.opsel_lo[i] = false;
5128       vop3p.opsel_hi[i] = true;
5129    }
5130 }
5131
5132 void
5133 apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5134 {
5135    /* Cleanup Dead Instructions */
5136    if (!instr)
5137       return;
5138
5139    /* apply literals on MAD */
5140    if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
5141       mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val];
5142       const bool madak = (info->literal_mask & 0b100);
5143       bool has_dead_literal = false;
5144       u_foreach_bit (i, info->literal_mask | info->fp16_mask)
5145          has_dead_literal |= ctx.uses[instr->operands[i].tempId()] == 0;
5146
5147       if (has_dead_literal && info->fp16_mask) {
5148          instr->format = Format::VOP3P;
5149          instr->opcode = aco_opcode::v_fma_mix_f32;
5150
5151          uint32_t literal = 0;
5152          bool second = false;
5153          u_foreach_bit (i, info->fp16_mask) {
5154             float value = uif(ctx.info[instr->operands[i].tempId()].val);
5155             literal |= _mesa_float_to_half(value) << (second * 16);
5156             instr->valu().opsel_lo[i] = second;
5157             instr->valu().opsel_hi[i] = true;
5158             second = true;
5159          }
5160
5161          for (unsigned i = 0; i < 3; i++) {
5162             if (info->fp16_mask & (1 << i))
5163                instr->operands[i] = Operand::literal32(literal);
5164          }
5165
5166          ctx.instructions.emplace_back(std::move(instr));
5167          return;
5168       }
5169
5170       if (has_dead_literal || madak) {
5171          aco_opcode new_op = madak ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
5172          if (instr->opcode == aco_opcode::v_fma_f32)
5173             new_op = madak ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
5174          else if (instr->opcode == aco_opcode::v_mad_f16 ||
5175                   instr->opcode == aco_opcode::v_mad_legacy_f16)
5176             new_op = madak ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
5177          else if (instr->opcode == aco_opcode::v_fma_f16)
5178             new_op = madak ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
5179
5180          uint32_t literal = ctx.info[instr->operands[ffs(info->literal_mask) - 1].tempId()].val;
5181          instr->format = Format::VOP2;
5182          instr->opcode = new_op;
5183          for (unsigned i = 0; i < 3; i++) {
5184             if (info->literal_mask & (1 << i))
5185                instr->operands[i] = Operand::literal32(literal);
5186          }
5187          if (madak) { /* add literal -> madak */
5188             if (!instr->operands[1].isOfType(RegType::vgpr))
5189                instr->valu().swapOperands(0, 1);
5190          } else { /* mul literal -> madmk */
5191             if (!(info->literal_mask & 0b10))
5192                instr->valu().swapOperands(0, 1);
5193             instr->valu().swapOperands(1, 2);
5194          }
5195          ctx.instructions.emplace_back(std::move(instr));
5196          return;
5197       }
5198    }
5199
5200    /* apply literals on other SALU/VALU */
5201    if (instr->isSALU() || instr->isVALU()) {
5202       for (unsigned i = 0; i < instr->operands.size(); i++) {
5203          Operand op = instr->operands[i];
5204          unsigned bits = get_operand_size(instr, i);
5205          if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
5206             Operand literal = Operand::literal32(ctx.info[op.tempId()].val);
5207             instr->format = withoutDPP(instr->format);
5208             if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
5209                instr->format = asVOP3(instr->format);
5210             instr->operands[i] = literal;
5211          }
5212       }
5213    }
5214
5215    if (instr->isSOPC())
5216       try_convert_sopc_to_sopk(instr);
5217
5218    /* allow more s_addk_i32 optimizations if carry isn't used */
5219    if (instr->opcode == aco_opcode::s_add_u32 && ctx.uses[instr->definitions[1].tempId()] == 0 &&
5220        (instr->operands[0].isLiteral() || instr->operands[1].isLiteral()))
5221       instr->opcode = aco_opcode::s_add_i32;
5222
5223    if (instr->isVOP3P())
5224       unswizzle_vop3p_literals(ctx, instr);
5225
5226    ctx.instructions.emplace_back(std::move(instr));
5227 }
5228
5229 void
5230 optimize(Program* program)
5231 {
5232    opt_ctx ctx;
5233    ctx.program = program;
5234    std::vector<ssa_info> info(program->peekAllocationId());
5235    ctx.info = info.data();
5236
5237    /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
5238    for (Block& block : program->blocks) {
5239       ctx.fp_mode = block.fp_mode;
5240       for (aco_ptr<Instruction>& instr : block.instructions)
5241          label_instruction(ctx, instr);
5242    }
5243
5244    ctx.uses = dead_code_analysis(program);
5245
5246    /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
5247    for (Block& block : program->blocks) {
5248       ctx.fp_mode = block.fp_mode;
5249       for (aco_ptr<Instruction>& instr : block.instructions)
5250          combine_instruction(ctx, instr);
5251    }
5252
5253    /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
5254    for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
5255         ++block_rit) {
5256       Block* block = &(*block_rit);
5257       ctx.fp_mode = block->fp_mode;
5258       for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
5259            ++instr_rit)
5260          select_instruction(ctx, *instr_rit);
5261    }
5262
5263    /* 4. Add literals to instructions */
5264    for (Block& block : program->blocks) {
5265       ctx.instructions.reserve(block.instructions.size());
5266       ctx.fp_mode = block.fp_mode;
5267       for (aco_ptr<Instruction>& instr : block.instructions)
5268          apply_literals(ctx, instr);
5269       block.instructions = std::move(ctx.instructions);
5270    }
5271 }
5272
5273 } // namespace aco