src/amd/compiler/aco_lower_to_hw_instr.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24
  25 #include "aco_builder.h"
  26 #include "aco_ir.h"
  27
  28 #include "common/sid.h"
  29
  30 #include <map>
  31 #include <vector>
  32
  33 namespace aco {
  34
  35 struct lower_context {
  36    Program* program;
  37    Block* block;
  38    std::vector<aco_ptr<Instruction>> instructions;
  39 };
  40
  41 /* Class for obtaining where s_sendmsg(MSG_ORDERED_PS_DONE) must be done in a Primitive Ordered
  42  * Pixel Shader on GFX9-10.3.
  43  *
  44  * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution paths
  45  * from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay to send
  46  * it if the packer ID is not going to be set at all by the wave, so some conservativeness is fine.
  47  *
  48  * For simplicity, sending the message from top-level blocks as dominance and post-dominance
  49  * checking for any location in the shader is trivial in them. Also, for simplicity, sending it
  50  * regardless of whether the POPS packer ID hardware register has already potentially been set up.
  51  *
  52  * Note that there can be multiple interlock end instructions in the shader.
  53  * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
  54  * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
  55  * one will be executed may depend on divergent control flow (some lanes may execute one ordered
  56  * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
  57  * instruction, so it must be ensured that the message is sent after the last ordered section in the
  58  * entire wave.
  59  */
  60 class gfx9_pops_done_msg_bounds {
  61 public:
  62    explicit gfx9_pops_done_msg_bounds() = default;
  63
  64    explicit gfx9_pops_done_msg_bounds(const Program* const program)
  65    {
  66       /* Find the top-level location after the last ordered section end pseudo-instruction in the
  67        * program.
  68        * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - make sure the message
  69        * isn't sent if any wait hasn't been fully completed yet (if a begin-end-begin situation
  70        * occurs somehow, as the location of `p_pops_gfx9_ordered_section_done` is controlled by the
  71        * application) for safety, assuming that waits are the only thing that need the packer
  72        * hardware register to be set at some point during or before them, and it won't be set
  73        * anymore after the last wait.
  74        */
  75       int last_top_level_block_idx = -1;
  76       for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
  77          const Block& block = program->blocks[block_idx];
  78          if (block.kind & block_kind_top_level) {
  79             last_top_level_block_idx = block_idx;
  80          }
  81          for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
  82               instr_idx--) {
  83             const aco_opcode opcode = block.instructions[instr_idx]->opcode;
  84             if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
  85                 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
  86                end_block_idx_ = last_top_level_block_idx;
  87                /* The same block if it's already a top-level block, or the beginning of the next
  88                 * top-level block.
  89                 */
  90                instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
  91                break;
  92             }
  93          }
  94          if (end_block_idx_ != -1) {
  95             break;
  96          }
  97       }
  98    }
  99
 100    /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
 101     * must be sent in this block.
 102     */
 103    int end_block_idx() const { return end_block_idx_; }
 104
 105    /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
 106     * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
 107     * If this is out of the bounds of the instructions in the end block, it must be sent in the end
 108     * of that block.
 109     */
 110    size_t instr_after_end_idx() const { return instr_after_end_idx_; }
 111
 112    /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
 113     * before actually ending the program.
 114     */
 115    bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
 116    {
 117       return block_idx <= end_block_idx_ &&
 118              (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
 119    }
 120
 121 private:
 122    /* Initialize to an empty range for which "is inside" comparisons will be failing for any
 123     * block.
 124     */
 125    int end_block_idx_ = -1;
 126    size_t instr_after_end_idx_ = 0;
 127 };
 128
 129 /* used by handle_operands() indirectly through Builder::copy */
 130 uint8_t int8_mul_table[512] = {
 131    0, 20,  1,  1,   1,  2,   1,  3,   1,  4,   1, 5,   1,  6,   1,  7,   1,  8,   1,  9,
 132    1, 10,  1,  11,  1,  12,  1,  13,  1,  14,  1, 15,  1,  16,  1,  17,  1,  18,  1,  19,
 133    1, 20,  1,  21,  1,  22,  1,  23,  1,  24,  1, 25,  1,  26,  1,  27,  1,  28,  1,  29,
 134    1, 30,  1,  31,  1,  32,  1,  33,  1,  34,  1, 35,  1,  36,  1,  37,  1,  38,  1,  39,
 135    1, 40,  1,  41,  1,  42,  1,  43,  1,  44,  1, 45,  1,  46,  1,  47,  1,  48,  1,  49,
 136    1, 50,  1,  51,  1,  52,  1,  53,  1,  54,  1, 55,  1,  56,  1,  57,  1,  58,  1,  59,
 137    1, 60,  1,  61,  1,  62,  1,  63,  1,  64,  5, 13,  2,  33,  17, 19,  2,  34,  3,  23,
 138    2, 35,  11, 53,  2,  36,  7,  47,  2,  37,  3, 25,  2,  38,  7,  11,  2,  39,  53, 243,
 139    2, 40,  3,  27,  2,  41,  17, 35,  2,  42,  5, 17,  2,  43,  3,  29,  2,  44,  15, 23,
 140    2, 45,  7,  13,  2,  46,  3,  31,  2,  47,  5, 19,  2,  48,  19, 59,  2,  49,  3,  33,
 141    2, 50,  7,  51,  2,  51,  15, 41,  2,  52,  3, 35,  2,  53,  11, 33,  2,  54,  23, 27,
 142    2, 55,  3,  37,  2,  56,  9,  41,  2,  57,  5, 23,  2,  58,  3,  39,  2,  59,  7,  17,
 143    2, 60,  9,  241, 2,  61,  3,  41,  2,  62,  5, 25,  2,  63,  35, 245, 2,  64,  3,  43,
 144    5, 26,  9,  43,  3,  44,  7,  19,  10, 39,  3, 45,  4,  34,  11, 59,  3,  46,  9,  243,
 145    4, 35,  3,  47,  22, 53,  7,  57,  3,  48,  5, 29,  10, 245, 3,  49,  4,  37,  9,  45,
 146    3, 50,  7,  241, 4,  38,  3,  51,  7,  22,  5, 31,  3,  52,  7,  59,  7,  242, 3,  53,
 147    4, 40,  7,  23,  3,  54,  15, 45,  4,  41,  3, 55,  6,  241, 9,  47,  3,  56,  13, 13,
 148    5, 34,  3,  57,  4,  43,  11, 39,  3,  58,  5, 35,  4,  44,  3,  59,  6,  243, 7,  245,
 149    3, 60,  5,  241, 7,  26,  3,  61,  4,  46,  5, 37,  3,  62,  11, 17,  4,  47,  3,  63,
 150    5, 38,  5,  243, 3,  64,  7,  247, 9,  50,  5, 39,  4,  241, 33, 37,  6,  33,  13, 35,
 151    4, 242, 5,  245, 6,  247, 7,  29,  4,  51,  5, 41,  5,  246, 7,  249, 3,  240, 11, 19,
 152    5, 42,  3,  241, 4,  245, 25, 29,  3,  242, 5, 43,  4,  246, 3,  243, 17, 58,  17, 43,
 153    3, 244, 5,  249, 6,  37,  3,  245, 2,  240, 5, 45,  2,  241, 21, 23,  2,  242, 3,  247,
 154    2, 243, 5,  251, 2,  244, 29, 61,  2,  245, 3, 249, 2,  246, 17, 29,  2,  247, 9,  55,
 155    1, 240, 1,  241, 1,  242, 1,  243, 1,  244, 1, 245, 1,  246, 1,  247, 1,  248, 1,  249,
 156    1, 250, 1,  251, 1,  252, 1,  253, 1,  254, 1, 255};
 157
 158 aco_opcode
 159 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
 160 {
 161    /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
 162     * 32-bit opcodes (VOP2) which allows to remove the temporary VGPR and to use
 163     * DPP with the arithmetic instructions. This requires to sign-extend.
 164     */
 165    switch (op) {
 166    case iadd8:
 167    case iadd16:
 168       if (gfx_level >= GFX10) {
 169          return aco_opcode::v_add_u32;
 170       } else if (gfx_level >= GFX8) {
 171          return aco_opcode::v_add_u16;
 172       } else {
 173          return aco_opcode::v_add_co_u32;
 174       }
 175       break;
 176    case imul8:
 177    case imul16:
 178       if (gfx_level >= GFX10) {
 179          return aco_opcode::v_mul_lo_u16_e64;
 180       } else if (gfx_level >= GFX8) {
 181          return aco_opcode::v_mul_lo_u16;
 182       } else {
 183          return aco_opcode::v_mul_u32_u24;
 184       }
 185       break;
 186    case fadd16: return aco_opcode::v_add_f16;
 187    case fmul16: return aco_opcode::v_mul_f16;
 188    case imax8:
 189    case imax16:
 190       if (gfx_level >= GFX10) {
 191          return aco_opcode::v_max_i32;
 192       } else if (gfx_level >= GFX8) {
 193          return aco_opcode::v_max_i16;
 194       } else {
 195          return aco_opcode::v_max_i32;
 196       }
 197       break;
 198    case imin8:
 199    case imin16:
 200       if (gfx_level >= GFX10) {
 201          return aco_opcode::v_min_i32;
 202       } else if (gfx_level >= GFX8) {
 203          return aco_opcode::v_min_i16;
 204       } else {
 205          return aco_opcode::v_min_i32;
 206       }
 207       break;
 208    case umin8:
 209    case umin16:
 210       if (gfx_level >= GFX10) {
 211          return aco_opcode::v_min_u32;
 212       } else if (gfx_level >= GFX8) {
 213          return aco_opcode::v_min_u16;
 214       } else {
 215          return aco_opcode::v_min_u32;
 216       }
 217       break;
 218    case umax8:
 219    case umax16:
 220       if (gfx_level >= GFX10) {
 221          return aco_opcode::v_max_u32;
 222       } else if (gfx_level >= GFX8) {
 223          return aco_opcode::v_max_u16;
 224       } else {
 225          return aco_opcode::v_max_u32;
 226       }
 227       break;
 228    case fmin16: return aco_opcode::v_min_f16;
 229    case fmax16: return aco_opcode::v_max_f16;
 230    case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
 231    case imul32: return aco_opcode::v_mul_lo_u32;
 232    case fadd32: return aco_opcode::v_add_f32;
 233    case fmul32: return aco_opcode::v_mul_f32;
 234    case imax32: return aco_opcode::v_max_i32;
 235    case imin32: return aco_opcode::v_min_i32;
 236    case umin32: return aco_opcode::v_min_u32;
 237    case umax32: return aco_opcode::v_max_u32;
 238    case fmin32: return aco_opcode::v_min_f32;
 239    case fmax32: return aco_opcode::v_max_f32;
 240    case iand8:
 241    case iand16:
 242    case iand32: return aco_opcode::v_and_b32;
 243    case ixor8:
 244    case ixor16:
 245    case ixor32: return aco_opcode::v_xor_b32;
 246    case ior8:
 247    case ior16:
 248    case ior32: return aco_opcode::v_or_b32;
 249    case iadd64: return aco_opcode::num_opcodes;
 250    case imul64: return aco_opcode::num_opcodes;
 251    case fadd64: return aco_opcode::v_add_f64;
 252    case fmul64: return aco_opcode::v_mul_f64;
 253    case imin64: return aco_opcode::num_opcodes;
 254    case imax64: return aco_opcode::num_opcodes;
 255    case umin64: return aco_opcode::num_opcodes;
 256    case umax64: return aco_opcode::num_opcodes;
 257    case fmin64: return aco_opcode::v_min_f64;
 258    case fmax64: return aco_opcode::v_max_f64;
 259    case iand64: return aco_opcode::num_opcodes;
 260    case ior64: return aco_opcode::num_opcodes;
 261    case ixor64: return aco_opcode::num_opcodes;
 262    default: return aco_opcode::num_opcodes;
 263    }
 264 }
 265
 266 bool
 267 is_vop3_reduce_opcode(aco_opcode opcode)
 268 {
 269    /* 64-bit reductions are VOP3. */
 270    if (opcode == aco_opcode::num_opcodes)
 271       return true;
 272
 273    return instr_info.format[(int)opcode] == Format::VOP3;
 274 }
 275
 276 void
 277 emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
 278 {
 279    Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
 280    if (instr->definitions.size() >= 2) {
 281       assert(instr->definitions[1].regClass() == bld.lm);
 282       instr->definitions[1].setFixed(vcc);
 283    }
 284 }
 285
 286 void
 287 emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
 288                   PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
 289                   unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
 290 {
 291    Builder bld(ctx->program, &ctx->instructions);
 292    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
 293    Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
 294    Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
 295    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
 296    Operand src1_64 = Operand(src1_reg, v2);
 297    Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
 298    Operand vtmp_op64 = Operand(vtmp_reg, v2);
 299    if (op == iadd64) {
 300       if (ctx->program->gfx_level >= GFX10) {
 301          if (identity)
 302             bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
 303          bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
 304                       bound_ctrl);
 305          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
 306       } else {
 307          bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
 308                       dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 309       }
 310       bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
 311                    Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 312    } else if (op == iand64) {
 313       bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
 314                    bound_ctrl);
 315       bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
 316                    bound_ctrl);
 317    } else if (op == ior64) {
 318       bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
 319                    bound_ctrl);
 320       bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
 321                    bound_ctrl);
 322    } else if (op == ixor64) {
 323       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
 324                    bound_ctrl);
 325       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
 326                    bound_ctrl);
 327    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
 328       aco_opcode cmp = aco_opcode::num_opcodes;
 329       switch (op) {
 330       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
 331       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
 332       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
 333       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
 334       default: break;
 335       }
 336
 337       if (identity) {
 338          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
 339          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
 340       }
 341       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
 342                    bound_ctrl);
 343       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
 344                    bound_ctrl);
 345
 346       bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
 347       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
 348       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
 349    } else if (op == imul64) {
 350       /* t4 = dpp(x_hi)
 351        * t1 = umul_lo(t4, y_lo)
 352        * t3 = dpp(x_lo)
 353        * t0 = umul_lo(t3, y_hi)
 354        * t2 = iadd(t0, t1)
 355        * t5 = umul_hi(t3, y_lo)
 356        * res_hi = iadd(t2, t5)
 357        * res_lo = umul_lo(t3, y_lo)
 358        * Requires that res_hi != src0[0] and res_hi != src1[0]
 359        * and that vtmp[0] != res_hi.
 360        */
 361       if (identity)
 362          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
 363       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
 364                    bound_ctrl);
 365       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
 366       if (identity)
 367          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
 368       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
 369                    bound_ctrl);
 370       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
 371       emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
 372       if (identity)
 373          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
 374       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
 375                    bound_ctrl);
 376       bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
 377       emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
 378       if (identity)
 379          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
 380       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
 381                    bound_ctrl);
 382       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
 383    }
 384 }
 385
 386 void
 387 emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
 388               ReduceOp op)
 389 {
 390    Builder bld(ctx->program, &ctx->instructions);
 391    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
 392    RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
 393    Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
 394    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
 395    Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
 396    Operand src1_64 = Operand(src1_reg, v2);
 397
 398    if (src0_rc == s1 &&
 399        (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
 400       assert(vtmp.reg() != 0);
 401       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
 402       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
 403       src0_reg = vtmp;
 404       src0[0] = Operand(vtmp, v1);
 405       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
 406       src0_64 = Operand(vtmp, v2);
 407    } else if (src0_rc == s1 && op == iadd64) {
 408       assert(vtmp.reg() != 0);
 409       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
 410       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
 411    }
 412
 413    if (op == iadd64) {
 414       if (ctx->program->gfx_level >= GFX10) {
 415          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
 416       } else {
 417          bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
 418       }
 419       bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
 420                Operand(vcc, bld.lm));
 421    } else if (op == iand64) {
 422       bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
 423       bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
 424    } else if (op == ior64) {
 425       bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
 426       bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
 427    } else if (op == ixor64) {
 428       bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
 429       bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
 430    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
 431       aco_opcode cmp = aco_opcode::num_opcodes;
 432       switch (op) {
 433       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
 434       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
 435       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
 436       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
 437       default: break;
 438       }
 439
 440       bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
 441       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
 442       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
 443    } else if (op == imul64) {
 444       if (src1_reg == dst_reg) {
 445          /* it's fine if src0==dst but not if src1==dst */
 446          std::swap(src0_reg, src1_reg);
 447          std::swap(src0[0], src1[0]);
 448          std::swap(src0[1], src1[1]);
 449          std::swap(src0_64, src1_64);
 450       }
 451       assert(!(src0_reg == src1_reg));
 452       /* t1 = umul_lo(x_hi, y_lo)
 453        * t0 = umul_lo(x_lo, y_hi)
 454        * t2 = iadd(t0, t1)
 455        * t5 = umul_hi(x_lo, y_lo)
 456        * res_hi = iadd(t2, t5)
 457        * res_lo = umul_lo(x_lo, y_lo)
 458        * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
 459        */
 460       Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
 461       Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
 462       Operand tmp0_op = src0[1];
 463       Operand tmp1_op = src1[1];
 464       bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
 465       bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
 466       emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
 467       bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
 468       emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
 469       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
 470    }
 471 }
 472
 473 void
 474 emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
 475             ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
 476             bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
 477 {
 478    Builder bld(ctx->program, &ctx->instructions);
 479    RegClass rc = RegClass(RegType::vgpr, size);
 480    Definition dst(dst_reg, rc);
 481    Operand src0(src0_reg, rc);
 482    Operand src1(src1_reg, rc);
 483
 484    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
 485    bool vop3 = is_vop3_reduce_opcode(opcode);
 486
 487    if (!vop3) {
 488       if (opcode == aco_opcode::v_add_co_u32)
 489          bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
 490                       bound_ctrl);
 491       else
 492          bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 493       return;
 494    }
 495
 496    if (opcode == aco_opcode::num_opcodes) {
 497       emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
 498                         bound_ctrl, identity);
 499       return;
 500    }
 501
 502    if (identity)
 503       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
 504    if (identity && size >= 2)
 505       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
 506
 507    for (unsigned i = 0; i < size; i++)
 508       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
 509                    Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 510
 511    bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
 512 }
 513
 514 void
 515 emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
 516         ReduceOp op, unsigned size)
 517 {
 518    Builder bld(ctx->program, &ctx->instructions);
 519    RegClass rc = RegClass(RegType::vgpr, size);
 520    Definition dst(dst_reg, rc);
 521    Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
 522    Operand src1(src1_reg, rc);
 523
 524    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
 525    bool vop3 = is_vop3_reduce_opcode(opcode);
 526
 527    if (opcode == aco_opcode::num_opcodes) {
 528       emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
 529       return;
 530    }
 531
 532    if (vop3) {
 533       bld.vop3(opcode, dst, src0, src1);
 534    } else if (opcode == aco_opcode::v_add_co_u32) {
 535       bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
 536    } else {
 537       bld.vop2(opcode, dst, src0, src1);
 538    }
 539 }
 540
 541 void
 542 emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
 543              unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
 544 {
 545    Builder bld(ctx->program, &ctx->instructions);
 546    for (unsigned i = 0; i < size; i++) {
 547       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
 548                    Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 549    }
 550 }
 551
 552 void
 553 emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
 554 {
 555    for (unsigned i = 0; i < size; i++) {
 556       bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
 557              Operand(PhysReg{src + i}, v1), ds_pattern);
 558    }
 559 }
 560
 561 void
 562 emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
 563                PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
 564 {
 565    assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
 566    assert(cluster_size <= ctx->program->wave_size);
 567
 568    Builder bld(ctx->program, &ctx->instructions);
 569
 570    Operand identity[2];
 571    identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
 572    identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
 573    Operand vcndmask_identity[2] = {identity[0], identity[1]};
 574
 575    /* First, copy the source to tmp and set inactive lanes to the identity */
 576    bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
 577             Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
 578
 579    /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
 580    if (ctx->program->gfx_level < GFX10) {
 581       for (unsigned i = 0; i < src.size(); i++) {
 582          /* p_exclusive_scan uses identity for v_writelane_b32 */
 583          if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
 584             bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
 585             identity[i] = Operand(PhysReg{sitmp + i}, s1);
 586
 587             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
 588             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
 589          } else if (identity[i].isLiteral()) {
 590             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
 591             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
 592          }
 593       }
 594    }
 595
 596    for (unsigned i = 0; i < src.size(); i++) {
 597       bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
 598                    vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
 599                    Operand(stmp, bld.lm));
 600    }
 601
 602    if (src.regClass() == v1b) {
 603       if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
 604          aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
 605             aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
 606          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
 607          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
 608          bool sext = reduce_op == imin8 || reduce_op == imax8;
 609          sdwa->sel[0] = SubdwordSel(1, 0, sext);
 610          sdwa->dst_sel = SubdwordSel::dword;
 611          bld.insert(std::move(sdwa));
 612       } else {
 613          aco_opcode opcode;
 614
 615          if (reduce_op == imin8 || reduce_op == imax8)
 616             opcode = aco_opcode::v_bfe_i32;
 617          else
 618             opcode = aco_opcode::v_bfe_u32;
 619
 620          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
 621                   Operand::c32(8u));
 622       }
 623    } else if (src.regClass() == v2b) {
 624       bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
 625                         reduce_op == umin16 || reduce_op == umax16;
 626       if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
 627          aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
 628             aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
 629          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
 630          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
 631          bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
 632          sdwa->sel[0] = SubdwordSel(2, 0, sext);
 633          sdwa->dst_sel = SubdwordSel::dword;
 634          bld.insert(std::move(sdwa));
 635       } else if (ctx->program->gfx_level <= GFX7 ||
 636                  (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
 637          aco_opcode opcode;
 638
 639          if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
 640             opcode = aco_opcode::v_bfe_i32;
 641          else
 642             opcode = aco_opcode::v_bfe_u32;
 643
 644          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
 645                   Operand::c32(16u));
 646       }
 647    }
 648
 649    bool reduction_needs_last_op = false;
 650    switch (op) {
 651    case aco_opcode::p_reduce:
 652       if (cluster_size == 1)
 653          break;
 654
 655       if (ctx->program->gfx_level <= GFX7) {
 656          reduction_needs_last_op = true;
 657          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
 658          if (cluster_size == 2)
 659             break;
 660          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 661          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
 662          if (cluster_size == 4)
 663             break;
 664          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 665          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
 666          if (cluster_size == 8)
 667             break;
 668          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 669          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
 670          if (cluster_size == 16)
 671             break;
 672          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 673          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
 674          if (cluster_size == 32)
 675             break;
 676          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 677          for (unsigned i = 0; i < src.size(); i++)
 678             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
 679                          Operand::zero());
 680          // TODO: it would be more effective to do the last reduction step on SALU
 681          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
 682          reduction_needs_last_op = false;
 683          break;
 684       }
 685
 686       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
 687                   0xf, false);
 688       if (cluster_size == 2)
 689          break;
 690       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
 691                   0xf, false);
 692       if (cluster_size == 4)
 693          break;
 694       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
 695                   false);
 696       if (cluster_size == 8)
 697          break;
 698       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
 699       if (cluster_size == 16)
 700          break;
 701
 702       if (ctx->program->gfx_level >= GFX10) {
 703          /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
 704          for (unsigned i = 0; i < src.size(); i++)
 705             bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
 706                      Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
 707
 708          if (cluster_size == 32) {
 709             reduction_needs_last_op = true;
 710             break;
 711          }
 712
 713          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 714          for (unsigned i = 0; i < src.size(); i++)
 715             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
 716                          Operand::zero());
 717          // TODO: it would be more effective to do the last reduction step on SALU
 718          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
 719          break;
 720       }
 721
 722       if (cluster_size == 32) {
 723          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
 724          reduction_needs_last_op = true;
 725          break;
 726       }
 727       assert(cluster_size == 64);
 728       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
 729                   false);
 730       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
 731                   false);
 732       break;
 733    case aco_opcode::p_exclusive_scan:
 734       if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
 735          /* shift rows right */
 736          emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
 737
 738          /* fill in the gaps in rows 1 and 3 */
 739          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
 740          if (ctx->program->wave_size == 64)
 741             bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
 742          for (unsigned i = 0; i < src.size(); i++) {
 743             Instruction* perm =
 744                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
 745                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
 746                         Operand::c32(0xffffffffu))
 747                   .instr;
 748             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
 749          }
 750          bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));
 751
 752          if (ctx->program->wave_size == 64) {
 753             /* fill in the gap in row 2 */
 754             for (unsigned i = 0; i < src.size(); i++) {
 755                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
 756                             Operand::c32(31u));
 757                bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
 758                              Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
 759             }
 760          }
 761          std::swap(tmp, vtmp);
 762       } else if (ctx->program->gfx_level >= GFX8) {
 763          emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
 764       } else {
 765          // TODO: use LDS on CS with a single write and shifted read
 766          /* wavefront shift_right by 1 on SI/CI */
 767          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
 768          emit_ds_swizzle(bld, tmp, tmp, src.size(),
 769                          ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
 770          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
 771          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 772          for (unsigned i = 0; i < src.size(); i++)
 773             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
 774                      Operand(PhysReg{tmp + i}, v1));
 775
 776          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 777          emit_ds_swizzle(bld, tmp, tmp, src.size(),
 778                          ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
 779          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
 780          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 781          for (unsigned i = 0; i < src.size(); i++)
 782             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
 783                      Operand(PhysReg{tmp + i}, v1));
 784
 785          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 786          emit_ds_swizzle(bld, tmp, tmp, src.size(),
 787                          ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
 788          bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
 789                   Operand::c32(16u));
 790          bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
 791                   Operand::c32(16u));
 792          for (unsigned i = 0; i < src.size(); i++)
 793             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
 794                      Operand(PhysReg{tmp + i}, v1));
 795
 796          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 797          for (unsigned i = 0; i < src.size(); i++) {
 798             bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
 799                           Operand(PhysReg{vtmp + i}, v1));
 800             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
 801                          Operand::zero());
 802             bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
 803                           Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
 804             identity[i] = Operand::zero(); /* prevent further uses of identity */
 805          }
 806          std::swap(tmp, vtmp);
 807       }
 808
 809       for (unsigned i = 0; i < src.size(); i++) {
 810          if (!identity[i].isConstant() ||
 811              identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
 812             if (ctx->program->gfx_level < GFX10)
 813                assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
 814                       identity[i].physReg() == PhysReg{sitmp + i});
 815             bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
 816                           Operand(PhysReg{tmp + i}, v1));
 817          }
 818       }
 819       FALLTHROUGH;
 820    case aco_opcode::p_inclusive_scan:
 821       assert(cluster_size == ctx->program->wave_size);
 822       if (ctx->program->gfx_level <= GFX7) {
 823          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
 824          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
 825          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 826          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 827
 828          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 829          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
 830          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
 831          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 832          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 833
 834          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 835          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
 836          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
 837          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 838          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 839
 840          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 841          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
 842          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
 843          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
 844          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 845
 846          bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
 847          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
 848          bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
 849                   Operand::c32(16u));
 850          bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
 851                   Operand::c32(16u));
 852          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 853
 854          for (unsigned i = 0; i < src.size(); i++)
 855             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
 856                          Operand::c32(31u));
 857          bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
 858                   Operand::c32(32u));
 859          emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
 860          break;
 861       }
 862
 863       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
 864                   identity);
 865       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
 866                   identity);
 867       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
 868                   identity);
 869       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
 870                   identity);
 871       if (ctx->program->gfx_level >= GFX10) {
 872          if (ctx->program->wave_size == 64) {
 873             bld.sop1(aco_opcode::s_bitreplicate_b64_b32, Definition(exec, s2),
 874                      Operand::c32(0xff00ff00u));
 875          } else {
 876             bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
 877                      Operand::c32(16u));
 878          }
 879          for (unsigned i = 0; i < src.size(); i++) {
 880             Instruction* perm =
 881                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
 882                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
 883                         Operand::c32(0xffffffffu))
 884                   .instr;
 885             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
 886          }
 887          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 888
 889          if (ctx->program->wave_size == 64) {
 890             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
 891                      Operand::c32(32u));
 892             for (unsigned i = 0; i < src.size(); i++)
 893                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
 894                             Operand::c32(31u));
 895             emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
 896          }
 897       } else {
 898          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
 899                      false, identity);
 900          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
 901                      false, identity);
 902       }
 903       break;
 904    default: unreachable("Invalid reduction mode");
 905    }
 906
 907    if (op == aco_opcode::p_reduce) {
 908       if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
 909          bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
 910          emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
 911          return;
 912       }
 913
 914       if (reduction_needs_last_op)
 915          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
 916    }
 917
 918    /* restore exec */
 919    bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
 920
 921    if (dst.regClass().type() == RegType::sgpr) {
 922       for (unsigned k = 0; k < src.size(); k++) {
 923          bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
 924                       Operand::c32(ctx->program->wave_size - 1));
 925       }
 926    } else if (dst.physReg() != tmp) {
 927       for (unsigned k = 0; k < src.size(); k++) {
 928          bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
 929                   Operand(PhysReg{tmp + k}, v1));
 930       }
 931    }
 932 }
 933
 934 void
 935 adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
 936 {
 937    /* RA assumes that the result is always in the low part of the register, so we have to shift,
 938     * if it's not there already.
 939     */
 940    if (input_data.physReg().byte()) {
 941       unsigned right_shift = input_data.physReg().byte() * 8;
 942       bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
 943                Operand(dst.physReg(), dst.regClass()));
 944    }
 945 }
 946
 947 void
 948 emit_bpermute_permlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
 949 {
 950    /* Emulates proper bpermute on GFX11 in wave64 mode.
 951     *
 952     * Similar to emit_gfx10_wave64_bpermute, but uses the new
 953     * v_permlane64_b32 instruction to swap data between lo and hi halves.
 954     */
 955
 956    assert(program->gfx_level >= GFX11);
 957    assert(program->wave_size == 64);
 958
 959    Definition dst = instr->definitions[0];
 960    Definition tmp_exec = instr->definitions[1];
 961    Definition clobber_scc = instr->definitions[2];
 962    Operand tmp_op = instr->operands[0];
 963    Operand index_x4 = instr->operands[1];
 964    Operand input_data = instr->operands[2];
 965    Operand same_half = instr->operands[3];
 966
 967    assert(dst.regClass() == v1);
 968    assert(tmp_exec.regClass() == bld.lm);
 969    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
 970    assert(same_half.regClass() == bld.lm);
 971    assert(tmp_op.regClass() == v1.as_linear());
 972    assert(index_x4.regClass() == v1);
 973    assert(input_data.regClass().type() == RegType::vgpr);
 974    assert(input_data.bytes() <= 4);
 975
 976    Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
 977
 978    /* Permute the input within the same half-wave. */
 979    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
 980
 981    /* Save EXEC and enable all lanes. */
 982    bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
 983             Operand::c32(-1u), Operand(exec, s2));
 984
 985    /* Copy input data from other half to current half's linear VGPR. */
 986    bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
 987
 988    /* Permute the input from the other half-wave, write to linear VGPR. */
 989    bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
 990
 991    /* Restore saved EXEC. */
 992    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
 993
 994    /* Select correct permute result. */
 995    bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
 996                 same_half);
 997
 998    adjust_bpermute_dst(bld, dst, input_data);
 999 }
1000
1001 void
1002 emit_bpermute_shared_vgpr(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
1003 {
1004    /* Emulates proper bpermute on GFX10 in wave64 mode.
1005     *
1006     * This is necessary because on GFX10 the bpermute instruction only works
1007     * on half waves (you can think of it as having a cluster size of 32), so we
1008     * manually swap the data between the two halves using two shared VGPRs.
1009     */
1010
1011    assert(program->gfx_level >= GFX10 && program->gfx_level <= GFX10_3);
1012    assert(program->wave_size == 64);
1013
1014    unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
1015    Definition dst = instr->definitions[0];
1016    Definition tmp_exec = instr->definitions[1];
1017    Definition clobber_scc = instr->definitions[2];
1018    Operand index_x4 = instr->operands[0];
1019    Operand input_data = instr->operands[1];
1020    Operand same_half = instr->operands[2];
1021
1022    assert(dst.regClass() == v1);
1023    assert(tmp_exec.regClass() == bld.lm);
1024    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1025    assert(same_half.regClass() == bld.lm);
1026    assert(index_x4.regClass() == v1);
1027    assert(input_data.regClass().type() == RegType::vgpr);
1028    assert(input_data.bytes() <= 4);
1029    assert(dst.physReg() != index_x4.physReg());
1030    assert(dst.physReg() != input_data.physReg());
1031    assert(tmp_exec.physReg() != same_half.physReg());
1032
1033    PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1034    PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1035
1036    /* Permute the input within the same half-wave */
1037    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1038
1039    /* HI: Copy data from high lanes 32-63 to shared vgpr */
1040    bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1041                 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1042    /* Save EXEC */
1043    bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1044    /* Set EXEC to enable LO lanes only */
1045    bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
1046    /* LO: Copy data from low lanes 0-31 to shared vgpr */
1047    bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1048    /* LO: bpermute shared vgpr (high lanes' data) */
1049    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1050           Operand(shared_vgpr_hi, v1));
1051    /* Set EXEC to enable HI lanes only */
1052    bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
1053    /* HI: bpermute shared vgpr (low lanes' data) */
1054    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1055           Operand(shared_vgpr_lo, v1));
1056
1057    /* Only enable lanes which use the other half's data */
1058    bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1059             Operand(tmp_exec.physReg(), s2), same_half);
1060    /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1061    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1062                 0x3, 0xf, false);
1063    /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1064    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1065                 0xc, 0xf, false);
1066
1067    /* Restore saved EXEC */
1068    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1069
1070    adjust_bpermute_dst(bld, dst, input_data);
1071 }
1072
1073 void
1074 emit_bpermute_readlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
1075 {
1076    /* Emulates bpermute using readlane instructions */
1077
1078    Operand index = instr->operands[0];
1079    Operand input = instr->operands[1];
1080    Definition dst = instr->definitions[0];
1081    Definition temp_exec = instr->definitions[1];
1082    Definition clobber_vcc = instr->definitions[2];
1083
1084    assert(dst.regClass() == v1);
1085    assert(temp_exec.regClass() == bld.lm);
1086    assert(clobber_vcc.regClass() == bld.lm);
1087    assert(clobber_vcc.physReg() == vcc);
1088    assert(index.regClass() == v1);
1089    assert(index.physReg() != dst.physReg());
1090    assert(input.regClass().type() == RegType::vgpr);
1091    assert(input.bytes() <= 4);
1092    assert(input.physReg() != dst.physReg());
1093
1094    /* Save original EXEC */
1095    bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1096
1097    /* An "unrolled loop" that is executed per each lane.
1098     * This takes only a few instructions per lane, as opposed to a "real" loop
1099     * with branching, where the branch instruction alone would take 16+ cycles.
1100     */
1101    for (unsigned n = 0; n < program->wave_size; ++n) {
1102       /* Activate the lane which has N for its source index */
1103       bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand::c32(n),
1104                index);
1105       /* Read the data from lane N */
1106       bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1107       /* On the active lane, move the data we read from lane N to the destination VGPR */
1108       bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1109       /* Restore original EXEC */
1110       bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1111    }
1112
1113    adjust_bpermute_dst(bld, dst, input);
1114 }
1115
1116 struct copy_operation {
1117    Operand op;
1118    Definition def;
1119    unsigned bytes;
1120    union {
1121       uint8_t uses[8];
1122       uint64_t is_used = 0;
1123    };
1124 };
1125
1126 void
1127 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1128            const copy_operation& src, bool ignore_uses, unsigned max_size)
1129 {
1130    PhysReg def_reg = src.def.physReg();
1131    PhysReg op_reg = src.op.physReg();
1132    def_reg.reg_b += offset;
1133    op_reg.reg_b += offset;
1134
1135    /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1136     * v_lshrrev_b64 doesn't get dual issued. */
1137    if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1138        src.def.regClass().type() == RegType::vgpr)
1139       max_size = MIN2(max_size, 4);
1140    unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1141
1142    /* make sure the size is a power of two and reg % bytes == 0 */
1143    unsigned bytes = 1;
1144    for (; bytes <= max_size; bytes *= 2) {
1145       unsigned next = bytes * 2u;
1146       bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1147                           offset + next <= src.bytes && next <= max_size;
1148       if (!src.op.isConstant() && can_increase)
1149          can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1150       for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1151          can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1152       if (!can_increase)
1153          break;
1154    }
1155
1156    *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
1157    if (src.op.isConstant()) {
1158       assert(bytes >= 1 && bytes <= 8);
1159       uint64_t val = src.op.constantValue64() >> (offset * 8u);
1160       *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1161    } else {
1162       RegClass op_cls = src.op.regClass().resize(bytes);
1163       *op = Operand(op_reg, op_cls);
1164       op->setTemp(Temp(src.op.tempId(), op_cls));
1165    }
1166 }
1167
1168 uint32_t
1169 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1170 {
1171    int intersection_start = MAX2(b_start - a_start, 0);
1172    int intersection_end = MAX2(b_start + b_size - a_start, 0);
1173    if (intersection_start >= a_size || intersection_end == 0)
1174       return 0;
1175
1176    uint32_t mask = u_bit_consecutive(0, a_size);
1177    return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1178 }
1179
1180 /* src1 are bytes 0-3. dst/src0 are bytes 4-7. */
1181 void
1182 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1183              Operand src0 = Operand(v1))
1184 {
1185    uint32_t swiz_packed =
1186       swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1187
1188    dst = Definition(PhysReg(dst.physReg().reg()), v1);
1189    if (!src1.isConstant())
1190       src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1191    if (src0.isUndefined())
1192       src0 = Operand(dst.physReg(), v1);
1193    else if (!src0.isConstant())
1194       src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1195    bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1196 }
1197
1198 void
1199 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1200 {
1201    /* v_mov_b16 uses 32bit inline constants. */
1202    if (op.isConstant()) {
1203       if (!op.isLiteral() && op.physReg() >= 240) {
1204          /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1205          Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1206          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1207          return;
1208       }
1209       op = Operand::c32((int32_t)(int16_t)op.constantValue());
1210    }
1211
1212    Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1213    instr->valu().opsel[0] = op.physReg().byte() == 2;
1214    instr->valu().opsel[3] = dst.physReg().byte() == 2;
1215 }
1216
1217 void
1218 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1219 {
1220    assert(op.bytes() == dst.bytes());
1221
1222    if (dst.bytes() == 4 && op.isLiteral()) {
1223       uint32_t imm = op.constantValue();
1224       if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
1225          bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
1226          return;
1227       } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
1228          uint32_t rev = util_bitreverse(imm);
1229          if (dst.regClass() == s1)
1230             bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
1231          else
1232             bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
1233          return;
1234       } else if (dst.regClass() == s1) {
1235          unsigned start = (ffs(imm) - 1) & 0x1f;
1236          unsigned size = util_bitcount(imm) & 0x1f;
1237          if (BITFIELD_RANGE(start, size) == imm) {
1238             bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
1239             return;
1240          }
1241          if (ctx->program->gfx_level >= GFX9) {
1242             Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
1243             Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
1244             if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
1245                bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
1246                return;
1247             }
1248          }
1249       }
1250    }
1251
1252    if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1253       op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1254
1255    if (dst.regClass() == s1) {
1256       bld.sop1(aco_opcode::s_mov_b32, dst, op);
1257    } else if (dst.regClass() == s2) {
1258       /* s_ashr_i64 writes SCC, so we can't use it */
1259       assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
1260       uint64_t imm = op.constantValue64();
1261       if (op.isLiteral()) {
1262          unsigned start = (ffsll(imm) - 1) & 0x3f;
1263          unsigned size = util_bitcount64(imm) & 0x3f;
1264          if (BITFIELD64_RANGE(start, size) == imm) {
1265             bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
1266             return;
1267          }
1268       }
1269       bld.sop1(aco_opcode::s_mov_b64, dst, op);
1270    } else if (dst.regClass() == v2) {
1271       if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1272          bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1273       } else {
1274          assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1275          bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1276       }
1277    } else if (dst.regClass() == v1) {
1278       bld.vop1(aco_opcode::v_mov_b32, dst, op);
1279    } else {
1280       assert(dst.regClass() == v1b || dst.regClass() == v2b);
1281
1282       bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1283       /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */
1284       bool can_use_perm = ctx->program->gfx_level >= GFX10 &&
1285                           (op.constantEquals(0) || op.constantEquals(0xff) ||
1286                            op.constantEquals(0xffff) || op.constantEquals(0xff00));
1287       if (dst.regClass() == v1b && use_sdwa) {
1288          uint8_t val = op.constantValue();
1289          Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1290          if (op32.isLiteral()) {
1291             uint32_t a = (uint32_t)int8_mul_table[val * 2];
1292             uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1293             bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1294                           Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1295                           Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1296          } else {
1297             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1298          }
1299       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1300          emit_v_mov_b16(bld, dst, op);
1301       } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1302          if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1303             /* use v_mov_b32 to avoid possible issues with denormal flushing or
1304              * NaN. v_add_f16 is still needed for float constants. */
1305             uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1306             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1307          } else {
1308             bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1309          }
1310       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10 &&
1311                  (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
1312          if (dst.physReg().byte() == 2) {
1313             Operand def_lo(dst.physReg().advance(-2), v2b);
1314             Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
1315             instr->valu().opsel = 0;
1316          } else {
1317             assert(dst.physReg().byte() == 0);
1318             Operand def_hi(dst.physReg().advance(2), v2b);
1319             Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
1320             instr->valu().opsel = 2;
1321          }
1322       } else if (can_use_perm) {
1323          uint8_t swiz[] = {4, 5, 6, 7};
1324          swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0;
1325          if (dst.bytes() == 2)
1326             swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0;
1327          create_bperm(bld, swiz, dst, Operand::zero());
1328       } else {
1329          uint32_t offset = dst.physReg().byte() * 8u;
1330          uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1331          uint32_t val = (op.constantValue() << offset) & mask;
1332          dst = Definition(PhysReg(dst.physReg().reg()), v1);
1333          Operand def_op(dst.physReg(), v1);
1334          if (val != mask)
1335             bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1336          if (val != 0)
1337             bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1338       }
1339    }
1340 }
1341
1342 void
1343 copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1344 {
1345    if (preserve_scc)
1346       bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1347
1348    for (unsigned i = 0; i < 2; i++) {
1349       if (def.size() == 2)
1350          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1351       else
1352          bld.vop1(aco_opcode::v_mov_b32, def, op);
1353
1354       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1355                Operand(exec, bld.lm));
1356    }
1357
1358    if (preserve_scc)
1359       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1360                Operand::zero());
1361 }
1362
1363 void
1364 swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1365 {
1366    if (preserve_scc)
1367       bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1368
1369    Operand def_as_op = Operand(def.physReg(), def.regClass());
1370    Definition op_as_def = Definition(op.physReg(), op.regClass());
1371
1372    for (unsigned i = 0; i < 2; i++) {
1373       if (bld.program->gfx_level >= GFX9) {
1374          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1375       } else {
1376          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1377          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1378          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1379       }
1380
1381       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1382                Operand(exec, bld.lm));
1383    }
1384
1385    if (preserve_scc)
1386       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1387                Operand::zero());
1388 }
1389
1390 void
1391 addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
1392 {
1393    Instruction* instr =
1394       bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
1395    if (src0.physReg().byte() == 2)
1396       instr->valu().opsel |= 0x1;
1397    if (src1.physReg().byte() == 2)
1398       instr->valu().opsel |= 0x2;
1399    if (dst.physReg().byte() == 2)
1400       instr->valu().opsel |= 0x8;
1401 }
1402
1403 bool
1404 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1405         PhysReg scratch_sgpr)
1406 {
1407    bool did_copy = false;
1408    for (unsigned offset = 0; offset < copy.bytes;) {
1409       if (copy.uses[offset]) {
1410          offset++;
1411          continue;
1412       }
1413
1414       Definition def;
1415       Operand op;
1416       split_copy(ctx, offset, &def, &op, copy, false, 8);
1417
1418       if (def.physReg() == scc) {
1419          bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1420          *preserve_scc = true;
1421       } else if (op.isConstant()) {
1422          copy_constant(ctx, bld, def, op);
1423       } else if (def.regClass().is_linear_vgpr()) {
1424          copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
1425       } else if (def.regClass() == v1) {
1426          bld.vop1(aco_opcode::v_mov_b32, def, op);
1427       } else if (def.regClass() == v2) {
1428          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1429       } else if (def.regClass() == s1) {
1430          bld.sop1(aco_opcode::s_mov_b32, def, op);
1431       } else if (def.regClass() == s2) {
1432          bld.sop1(aco_opcode::s_mov_b64, def, op);
1433       } else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
1434          if (op.physReg().byte()) {
1435             assert(def.physReg().byte() == 0);
1436             bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
1437          } else if (def.physReg().byte()) {
1438             assert(op.physReg().byte() == 0);
1439             /* preserve the target's lower half */
1440             uint32_t bits = def.physReg().byte() * 8;
1441             PhysReg lo_reg = PhysReg(def.physReg().reg());
1442             Definition lo_half =
1443                Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
1444             Definition dst =
1445                Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
1446
1447             if (def.physReg().reg() == op.physReg().reg()) {
1448                bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
1449                         Operand(lo_reg, lo_half.regClass()));
1450                if (def.physReg().byte() == 1) {
1451                   bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
1452                } else if (def.physReg().byte() == 2) {
1453                   bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
1454                } else if (def.physReg().byte() == 3) {
1455                   bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
1456                            Operand::c32((1 << bits) + 1u));
1457                   bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
1458                }
1459             } else {
1460                lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
1461                bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
1462                         Operand(lo_reg, lo_half.regClass()));
1463                bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
1464                         Operand(lo_half.physReg(), lo_half.regClass()),
1465                         Operand::c32(4 - def.physReg().byte()));
1466             }
1467          } else {
1468             bld.vop1(aco_opcode::v_mov_b32, def, op);
1469          }
1470       } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1471          uint8_t swiz[] = {4, 5, 6, 7};
1472          swiz[def.physReg().byte()] = op.physReg().byte();
1473          create_bperm(bld, swiz, def, op);
1474       } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1475          emit_v_mov_b16(bld, def, op);
1476       } else if (def.regClass().is_subdword()) {
1477          bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1478       } else {
1479          unreachable("unsupported copy");
1480       }
1481
1482       did_copy = true;
1483       offset += def.bytes();
1484    }
1485    return did_copy;
1486 }
1487
1488 void
1489 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1490 {
1491    if (def.physReg().reg() == op.physReg().reg()) {
1492       assert(def.bytes() != 2); /* handled by caller */
1493       uint8_t swiz[] = {4, 5, 6, 7};
1494       std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1495       create_bperm(bld, swiz, def, Operand::zero());
1496       return;
1497    }
1498
1499    if (def.bytes() == 2) {
1500       Operand def_as_op = Operand(def.physReg(), def.regClass());
1501       Definition op_as_def = Definition(op.physReg(), op.regClass());
1502       addsub_subdword_gfx11(bld, def, def_as_op, op, false);
1503       addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
1504       addsub_subdword_gfx11(bld, def, def_as_op, op, true);
1505    } else {
1506       PhysReg op_half = op.physReg();
1507       op_half.reg_b &= ~1;
1508
1509       PhysReg def_other_half = def.physReg();
1510       def_other_half.reg_b &= ~1;
1511       def_other_half.reg_b ^= 2;
1512
1513       /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1514        * into the same VGPR.
1515        */
1516       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1517       swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1518       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1519    }
1520 }
1521
1522 void
1523 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1524         Pseudo_instruction* pi)
1525 {
1526    unsigned offset = 0;
1527
1528    if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1529        (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1530       /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1531        * swap */
1532       PhysReg op = copy.op.physReg();
1533       PhysReg def = copy.def.physReg();
1534       op.reg_b &= ~0x3;
1535       def.reg_b &= ~0x3;
1536
1537       copy_operation tmp;
1538       tmp.op = Operand(op, v1);
1539       tmp.def = Definition(def, v1);
1540       tmp.bytes = 4;
1541       memset(tmp.uses, 1, 4);
1542       do_swap(ctx, bld, tmp, preserve_scc, pi);
1543
1544       op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1545       def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1546       tmp.op = Operand(op, v1b);
1547       tmp.def = Definition(def, v1b);
1548       tmp.bytes = 1;
1549       tmp.uses[0] = 1;
1550       do_swap(ctx, bld, tmp, preserve_scc, pi);
1551
1552       offset = copy.bytes;
1553    }
1554
1555    for (; offset < copy.bytes;) {
1556       Definition def;
1557       Operand op;
1558       unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1559       split_copy(ctx, offset, &def, &op, copy, true, max_size);
1560
1561       assert(op.regClass() == def.regClass());
1562       Operand def_as_op = Operand(def.physReg(), def.regClass());
1563       Definition op_as_def = Definition(op.physReg(), op.regClass());
1564       if (def.regClass().is_linear_vgpr()) {
1565          swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
1566       } else if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1567          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1568       } else if (def.regClass() == v1) {
1569          assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1570          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1571          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1572          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1573       } else if (op.physReg() == scc || def.physReg() == scc) {
1574          /* we need to swap scc and another sgpr */
1575          assert(!preserve_scc);
1576
1577          PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1578
1579          bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1580          bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1581                   Operand::zero());
1582          bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1583       } else if (def.regClass() == s1) {
1584          if (preserve_scc) {
1585             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1586             bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1587             bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1588          } else {
1589             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1590             bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1591             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1592          }
1593       } else if (def.regClass() == s2) {
1594          if (preserve_scc)
1595             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1596          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1597          bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1598          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1599          if (preserve_scc)
1600             bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1601                      Operand::zero());
1602       } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1603          bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1604                   Operand::c32(2u));
1605       } else {
1606          assert(def.regClass().is_subdword());
1607          if (ctx->program->gfx_level >= GFX11) {
1608             swap_subdword_gfx11(bld, def, op);
1609          } else {
1610             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1611             bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1612             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1613          }
1614       }
1615
1616       offset += def.bytes();
1617    }
1618
1619    if (ctx->program->gfx_level <= GFX7)
1620       return;
1621
1622    /* fixup in case we swapped bytes we shouldn't have */
1623    copy_operation tmp_copy = copy;
1624    tmp_copy.op.setFixed(copy.def.physReg());
1625    tmp_copy.def.setFixed(copy.op.physReg());
1626    do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1627 }
1628
1629 void
1630 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1631 {
1632    if (lo.isConstant() && hi.isConstant()) {
1633       copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1634       return;
1635    }
1636
1637    bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1638                        (ctx->program->gfx_level >= GFX10 ||
1639                         (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1640
1641    if (can_use_pack) {
1642       Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1643       /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1644       instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1645       return;
1646    }
1647
1648    /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1649    if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1650        (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1651                                                     ctx->program->gfx_level >= GFX10)))) {
1652       if (hi.isConstant())
1653          bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1654                   Operand::c32(2u));
1655       else
1656          bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1657       return;
1658    }
1659
1660    Definition def_lo = Definition(def.physReg(), v2b);
1661    Definition def_hi = Definition(def.physReg().advance(2), v2b);
1662
1663    if (lo.isConstant()) {
1664       /* move hi and zero low bits */
1665       if (hi.physReg().byte() == 0)
1666          bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1667       else
1668          bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1669       if (lo.constantValue())
1670          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1671                   Operand(def.physReg(), v1));
1672       return;
1673    }
1674    if (hi.isConstant()) {
1675       /* move lo and zero high bits */
1676       if (lo.physReg().byte() == 2)
1677          bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1678       else if (ctx->program->gfx_level >= GFX11)
1679          bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1680       else
1681          bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1682       if (hi.constantValue())
1683          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1684                   Operand(def.physReg(), v1));
1685       return;
1686    }
1687
1688    if (lo.physReg().reg() == def.physReg().reg()) {
1689       /* lo is in the high bits of def */
1690       assert(lo.physReg().byte() == 2);
1691       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1692       lo.setFixed(def.physReg());
1693    } else if (hi.physReg() == def.physReg()) {
1694       /* hi is in the low bits of def */
1695       assert(hi.physReg().byte() == 0);
1696       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1697       hi.setFixed(def.physReg().advance(2));
1698    } else if (ctx->program->gfx_level >= GFX8) {
1699       /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1700        * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1701        */
1702       assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1703       Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1704       PhysReg reg = def.physReg().advance(op.physReg().byte());
1705       bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1706       op.setFixed(reg);
1707    }
1708
1709    /* either hi or lo are already placed correctly */
1710    if (ctx->program->gfx_level >= GFX11) {
1711       if (lo.physReg().reg() == def.physReg().reg())
1712          emit_v_mov_b16(bld, def_hi, hi);
1713       else
1714          emit_v_mov_b16(bld, def_lo, lo);
1715       return;
1716    } else if (ctx->program->gfx_level >= GFX8) {
1717       if (lo.physReg().reg() == def.physReg().reg())
1718          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1719       else
1720          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1721       return;
1722    }
1723
1724    /* alignbyte needs the operands in the following way:
1725     * | xx hi | lo xx | >> 2 byte */
1726    if (lo.physReg().byte() != hi.physReg().byte()) {
1727       /* | xx lo | hi xx | => | lo hi | lo hi | */
1728       assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
1729       bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
1730       lo = Operand(def_hi.physReg(), v2b);
1731       hi = Operand(def_lo.physReg(), v2b);
1732    } else if (lo.physReg().byte() == 0) {
1733       /* | xx hi | xx lo | => | xx hi | lo 00 | */
1734       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
1735       lo = Operand(def_hi.physReg(), v2b);
1736    } else {
1737       /* | hi xx | lo xx | => | 00 hi | lo xx | */
1738       assert(hi.physReg().byte() == 2);
1739       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
1740       hi = Operand(def_lo.physReg(), v2b);
1741    }
1742    /* perform the alignbyte */
1743    bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1744 }
1745
1746 void
1747 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1748                     copy_operation& copy)
1749 {
1750    // TODO try more relaxed alignment for subdword copies
1751    unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1752    unsigned next_op_align = next_def_align;
1753    if (copy.def.regClass().type() == RegType::vgpr)
1754       next_def_align = MIN2(next_def_align, 4);
1755    if (copy.op.regClass().type() == RegType::vgpr)
1756       next_op_align = MIN2(next_op_align, 4);
1757
1758    if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1759        (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1760       return;
1761
1762    auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1763    if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1764        copy.op.isConstant() != other->second.op.isConstant())
1765       return;
1766
1767    /* don't create 64-bit copies before GFX10 */
1768    if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1769        ctx->program->gfx_level < GFX10)
1770       return;
1771
1772    unsigned new_size = copy.bytes + other->second.bytes;
1773    if (copy.op.isConstant()) {
1774       uint64_t val =
1775          copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1776       if (!util_is_power_of_two_or_zero(new_size))
1777          return;
1778       if (!Operand::is_constant_representable(val, new_size, true,
1779                                               copy.def.regClass().type() == RegType::vgpr))
1780          return;
1781       copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1782    } else {
1783       if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1784          return;
1785       copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1786    }
1787
1788    copy.bytes = new_size;
1789    copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1790    copy_map.erase(other);
1791 }
1792
1793 void
1794 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1795                 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1796 {
1797    Builder bld(ctx->program, &ctx->instructions);
1798    unsigned num_instructions_before = ctx->instructions.size();
1799    aco_ptr<Instruction> mov;
1800    bool writes_scc = false;
1801
1802    /* count the number of uses for each dst reg */
1803    for (auto it = copy_map.begin(); it != copy_map.end();) {
1804
1805       if (it->second.def.physReg() == scc)
1806          writes_scc = true;
1807
1808       assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1809
1810       /* if src and dst reg are the same, remove operation */
1811       if (it->first == it->second.op.physReg()) {
1812          it = copy_map.erase(it);
1813          continue;
1814       }
1815
1816       /* split large copies */
1817       if (it->second.bytes > 8) {
1818          assert(!it->second.op.isConstant());
1819          assert(!it->second.def.regClass().is_subdword());
1820          RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
1821          Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1822          rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
1823          Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1824          copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1825          copy_map[hi_def.physReg()] = copy;
1826          assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1827          it->second.op = Operand(it->second.op.physReg(),
1828                                  it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
1829          it->second.def = Definition(it->second.def.physReg(),
1830                                      it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
1831          it->second.bytes = 8;
1832       }
1833
1834       try_coalesce_copies(ctx, copy_map, it->second);
1835
1836       /* check if the definition reg is used by another copy operation */
1837       for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1838          if (copy.second.op.isConstant())
1839             continue;
1840          for (uint16_t i = 0; i < it->second.bytes; i++) {
1841             /* distance might underflow */
1842             unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1843             if (distance < copy.second.bytes)
1844                it->second.uses[i] += 1;
1845          }
1846       }
1847
1848       ++it;
1849    }
1850
1851    /* first, handle paths in the location transfer graph */
1852    bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1853    bool skip_partial_copies = true;
1854    for (auto it = copy_map.begin();;) {
1855       if (copy_map.empty()) {
1856          ctx->program->statistics[aco_statistic_copies] +=
1857             ctx->instructions.size() - num_instructions_before;
1858          return;
1859       }
1860       if (it == copy_map.end()) {
1861          if (!skip_partial_copies)
1862             break;
1863          skip_partial_copies = false;
1864          it = copy_map.begin();
1865       }
1866
1867       /* check if we can pack one register at once */
1868       if (it->first.byte() == 0 && it->second.bytes == 2) {
1869          PhysReg reg_hi = it->first.advance(2);
1870          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1871          if (other != copy_map.end() && other->second.bytes == 2) {
1872             /* check if the target register is otherwise unused */
1873             bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1874                                                      other->second.op.physReg() == it->first);
1875             bool unused_hi = !other->second.is_used ||
1876                              (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1877             if (unused_lo && unused_hi) {
1878                Operand lo = it->second.op;
1879                Operand hi = other->second.op;
1880                do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1881                copy_map.erase(it);
1882                copy_map.erase(other);
1883
1884                for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1885                   for (uint16_t i = 0; i < other2.second.bytes; i++) {
1886                      /* distance might underflow */
1887                      unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1888                      unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1889                      if (distance_lo < 2 || distance_hi < 2)
1890                         other2.second.uses[i] -= 1;
1891                   }
1892                }
1893                it = copy_map.begin();
1894                continue;
1895             }
1896          }
1897       }
1898
1899       /* on GFX6/7, we need some small workarounds as there is no
1900        * SDWA instruction to do partial register writes */
1901       if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
1902          if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
1903              pi->opcode == aco_opcode::p_split_vector) {
1904             /* Other operations might overwrite the high bits, so change all users
1905              * of the high bits to the new target where they are still available.
1906              * This mechanism depends on also emitting dead definitions. */
1907             PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1908             while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1909                std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1910                for (other = copy_map.begin(); other != copy_map.end(); other++) {
1911                   /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1912                   if (other->second.op.physReg() == reg_hi) {
1913                      other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1914                      break; /* break because an operand can only be used once */
1915                   }
1916                }
1917                reg_hi = reg_hi.advance(it->second.bytes);
1918             }
1919          } else if (it->first.byte()) {
1920             assert(pi->opcode == aco_opcode::p_create_vector);
1921             /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1922              * move to the target operand's high bits. This is save to do as it cannot be an operand
1923              */
1924             PhysReg lo = PhysReg(it->first.reg());
1925             std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1926             if (other != copy_map.end()) {
1927                assert(other->second.bytes == it->first.byte());
1928                PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1929                it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1930                it->second.is_used = 0;
1931                other->second.bytes += it->second.bytes;
1932                other->second.def.setTemp(Temp(other->second.def.tempId(),
1933                                               RegClass::get(RegType::vgpr, other->second.bytes)));
1934                other->second.op.setTemp(Temp(other->second.op.tempId(),
1935                                              RegClass::get(RegType::vgpr, other->second.bytes)));
1936                /* if the new target's high bits are also a target, change uses */
1937                std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1938                if (target != copy_map.end()) {
1939                   for (unsigned i = 0; i < it->second.bytes; i++)
1940                      target->second.uses[i]++;
1941                }
1942             }
1943          }
1944       }
1945
1946       /* find portions where the target reg is not used as operand for any other copy */
1947       if (it->second.is_used) {
1948          if (it->second.op.isConstant() || skip_partial_copies) {
1949             /* we have to skip constants until is_used=0.
1950              * we also skip partial copies at the beginning to help coalescing */
1951             ++it;
1952             continue;
1953          }
1954
1955          unsigned has_zero_use_bytes = 0;
1956          for (unsigned i = 0; i < it->second.bytes; i++)
1957             has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1958
1959          if (has_zero_use_bytes) {
1960             /* Skipping partial copying and doing a v_swap_b32 and then fixup
1961              * copies is usually beneficial for sub-dword copies, but if doing
1962              * a partial copy allows further copies, it should be done instead. */
1963             bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1964             for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1965                /* on GFX6/7, we can only do copies with full registers */
1966                if (partial_copy || ctx->program->gfx_level <= GFX7)
1967                   break;
1968                for (uint16_t i = 0; i < copy.second.bytes; i++) {
1969                   /* distance might underflow */
1970                   unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1971                   if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1972                       !it->second.uses[distance])
1973                      partial_copy = true;
1974                }
1975             }
1976
1977             if (!partial_copy) {
1978                ++it;
1979                continue;
1980             }
1981          } else {
1982             /* full target reg is used: register swapping needed */
1983             ++it;
1984             continue;
1985          }
1986       }
1987
1988       bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1989       skip_partial_copies = did_copy;
1990       std::pair<PhysReg, copy_operation> copy = *it;
1991
1992       if (it->second.is_used == 0) {
1993          /* the target reg is not used as operand for any other copy, so we
1994           * copied to all of it */
1995          copy_map.erase(it);
1996          it = copy_map.begin();
1997       } else {
1998          /* we only performed some portions of this copy, so split it to only
1999           * leave the portions that still need to be done */
2000          copy_operation original = it->second; /* the map insertion below can overwrite this */
2001          copy_map.erase(it);
2002          for (unsigned offset = 0; offset < original.bytes;) {
2003             if (original.uses[offset] == 0) {
2004                offset++;
2005                continue;
2006             }
2007             Definition def;
2008             Operand op;
2009             split_copy(ctx, offset, &def, &op, original, false, 8);
2010
2011             copy_operation new_copy = {op, def, def.bytes()};
2012             for (unsigned i = 0; i < new_copy.bytes; i++)
2013                new_copy.uses[i] = original.uses[i + offset];
2014             copy_map[def.physReg()] = new_copy;
2015
2016             offset += def.bytes();
2017          }
2018
2019          it = copy_map.begin();
2020       }
2021
2022       /* Reduce the number of uses of the operand reg by one. Do this after
2023        * splitting the copy or removing it in case the copy writes to it's own
2024        * operand (for example, v[7:8] = v[8:9]) */
2025       if (did_copy && !copy.second.op.isConstant()) {
2026          for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
2027             for (uint16_t i = 0; i < other.second.bytes; i++) {
2028                /* distance might underflow */
2029                unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
2030                if (distance < copy.second.bytes && !copy.second.uses[distance])
2031                   other.second.uses[i] -= 1;
2032             }
2033          }
2034       }
2035    }
2036
2037    /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */
2038    unsigned largest = 0;
2039    for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
2040       largest = MAX2(largest, op.second.bytes);
2041
2042    while (!copy_map.empty()) {
2043
2044       /* Perform larger swaps first, because larger swaps swaps can make other
2045        * swaps unnecessary. */
2046       auto it = copy_map.begin();
2047       for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
2048          if (it2->second.bytes > it->second.bytes) {
2049             it = it2;
2050             if (it->second.bytes == largest)
2051                break;
2052          }
2053       }
2054
2055       /* should already be done */
2056       assert(!it->second.op.isConstant());
2057
2058       assert(it->second.op.isFixed());
2059       assert(it->second.def.regClass() == it->second.op.regClass());
2060
2061       if (it->first == it->second.op.physReg()) {
2062          copy_map.erase(it);
2063          continue;
2064       }
2065
2066       if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
2067          assert(!(it->second.def.physReg() == pi->scratch_sgpr));
2068
2069       /* to resolve the cycle, we have to swap the src reg with the dst reg */
2070       copy_operation swap = it->second;
2071
2072       /* if this is self-intersecting, we have to split it because
2073        * self-intersecting swaps don't make sense */
2074       PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
2075       if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
2076          unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
2077
2078          copy_operation remaining;
2079          src.reg_b += offset;
2080          dst.reg_b += offset;
2081          remaining.bytes = swap.bytes - offset;
2082          memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
2083          remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
2084          remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
2085          copy_map[dst] = remaining;
2086
2087          memset(swap.uses + offset, 0, swap.bytes - offset);
2088          swap.bytes = offset;
2089       }
2090
2091       /* GFX6-7 can only swap full registers */
2092       if (ctx->program->gfx_level <= GFX7)
2093          swap.bytes = align(swap.bytes, 4);
2094
2095       do_swap(ctx, bld, swap, preserve_scc, pi);
2096
2097       /* remove from map */
2098       copy_map.erase(it);
2099
2100       /* change the operand reg of the target's uses and split uses if needed */
2101       uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
2102       for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
2103          if (target->second.op.physReg() == swap.def.physReg() &&
2104              swap.bytes == target->second.bytes) {
2105             target->second.op.setFixed(swap.op.physReg());
2106             break;
2107          }
2108
2109          uint32_t imask =
2110             get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
2111                                   target->second.op.physReg().reg_b, target->second.bytes);
2112
2113          if (!imask)
2114             continue;
2115
2116          int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
2117
2118          /* split and update the middle (the portion that reads the swap's
2119           * definition) to read the swap's operand instead */
2120          int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
2121          int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
2122          int before_bytes = MAX2(-offset, 0);
2123          int after_bytes = MAX2(target_op_end - swap_def_end, 0);
2124          int middle_bytes = target->second.bytes - before_bytes - after_bytes;
2125
2126          if (after_bytes) {
2127             unsigned after_offset = before_bytes + middle_bytes;
2128             assert(after_offset > 0);
2129             copy_operation copy;
2130             copy.bytes = after_bytes;
2131             memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2132             RegClass rc = target->second.op.regClass().resize(after_bytes);
2133             copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2134             copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2135             copy_map[copy.def.physReg()] = copy;
2136          }
2137
2138          if (middle_bytes) {
2139             copy_operation copy;
2140             copy.bytes = middle_bytes;
2141             memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2142             RegClass rc = target->second.op.regClass().resize(middle_bytes);
2143             copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2144             copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2145             copy_map[copy.def.physReg()] = copy;
2146          }
2147
2148          if (before_bytes) {
2149             copy_operation copy;
2150             target->second.bytes = before_bytes;
2151             RegClass rc = target->second.op.regClass().resize(before_bytes);
2152             target->second.op = Operand(target->second.op.physReg(), rc);
2153             target->second.def = Definition(target->second.def.physReg(), rc);
2154             memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2155          }
2156
2157          /* break early since we know each byte of the swap's definition is used
2158           * at most once */
2159          bytes_left &= ~imask;
2160          if (!bytes_left)
2161             break;
2162       }
2163    }
2164    ctx->program->statistics[aco_statistic_copies] +=
2165       ctx->instructions.size() - num_instructions_before;
2166 }
2167
2168 void
2169 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2170 {
2171    if (bld.program->gfx_level >= GFX10) {
2172       if (set_round)
2173          bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
2174       if (set_denorm)
2175          bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
2176    } else if (set_round || set_denorm) {
2177       /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
2178       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2179    }
2180 }
2181
2182 void
2183 emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
2184 {
2185    float_mode config_mode;
2186    config_mode.val = program.config->float_mode;
2187
2188    bool set_round = always_set && block->fp_mode.round != config_mode.round;
2189    bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
2190    if (block->kind & block_kind_top_level) {
2191       for (unsigned pred : block->linear_preds) {
2192          if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2193             set_round = true;
2194          if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2195             set_denorm = true;
2196       }
2197    }
2198    /* only allow changing modes at top-level blocks so this doesn't break
2199     * the "jump over empty blocks" optimization */
2200    assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2201    emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2202 }
2203
2204 void
2205 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2206 {
2207    /* Since we know what the high 16 bits of scratch_hi is, we can set all the high 16
2208     * bits in the same instruction that we add the carry.
2209     */
2210    Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2211    Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2212    Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2213
2214    if (bld.program->gfx_level >= GFX10) {
2215       PhysReg scratch_lo = def.physReg();
2216       PhysReg scratch_hi = def.physReg().advance(4);
2217
2218       bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2219                scratch_addr_lo, scratch_offset);
2220       bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2221                scratch_addr_hi, hi_add, Operand(scc, s1));
2222
2223       /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2224        * 20/21) */
2225       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2226       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2227    } else {
2228       bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2229                scratch_addr_lo, scratch_offset);
2230       bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2231                scratch_addr_hi, hi_add, Operand(scc, s1));
2232    }
2233 }
2234
2235 void
2236 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2237 {
2238    Operand linear_vgpr = instr->operands[3];
2239
2240    unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2241    unsigned vaddr_size = linear_vgpr.size();
2242    unsigned num_copied_vgprs = instr->operands.size() - 4;
2243    nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2244                  ? nsa_size
2245                  : 0;
2246
2247    Operand vaddr[16];
2248    unsigned num_vaddr = 0;
2249
2250    if (nsa_size) {
2251       assert(num_copied_vgprs <= nsa_size);
2252       for (unsigned i = 0; i < num_copied_vgprs; i++)
2253          vaddr[num_vaddr++] = instr->operands[4 + i];
2254       for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2255          vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2256       if (vaddr_size > nsa_size) {
2257          RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2258          vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2259       }
2260    } else {
2261       PhysReg reg = linear_vgpr.physReg();
2262       std::map<PhysReg, copy_operation> copy_operations;
2263       for (unsigned i = 4; i < instr->operands.size(); i++) {
2264          Operand arg = instr->operands[i];
2265          Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2266          copy_operations[def.physReg()] = {arg, def, def.bytes()};
2267          reg = reg.advance(arg.bytes());
2268       }
2269       vaddr[num_vaddr++] = linear_vgpr;
2270
2271       Pseudo_instruction pi = {};
2272       handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2273    }
2274
2275    instr->mimg().strict_wqm = false;
2276
2277    if ((3 + num_vaddr) > instr->operands.size()) {
2278       MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
2279          instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2280       std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2281                 new_instr->definitions.begin());
2282       new_instr->operands[0] = instr->operands[0];
2283       new_instr->operands[1] = instr->operands[1];
2284       new_instr->operands[2] = instr->operands[2];
2285       memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2286              sizeof(MIMG_instruction) - sizeof(Instruction));
2287       instr.reset(new_instr);
2288    } else {
2289       while (instr->operands.size() > (3 + num_vaddr))
2290          instr->operands.pop_back();
2291    }
2292    std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2293 }
2294
2295 void
2296 lower_to_hw_instr(Program* program)
2297 {
2298    gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2299    if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2300       pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2301    }
2302
2303    Block* discard_exit_block = NULL;
2304    Block* discard_pops_done_and_exit_block = NULL;
2305
2306    bool should_dealloc_vgprs = dealloc_vgprs(program);
2307
2308    for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2309       Block* block = &program->blocks[block_idx];
2310       lower_context ctx;
2311       ctx.program = program;
2312       ctx.block = block;
2313       ctx.instructions.reserve(block->instructions.size());
2314       Builder bld(program, &ctx.instructions);
2315
2316       emit_set_mode_from_block(bld, *program, block, (block_idx == 0));
2317
2318       for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2319          aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2320
2321          /* Send the ordered section done message from the middle of the block if needed (if the
2322           * ordered section is ended by an instruction inside this block).
2323           * Also make sure the done message is sent if it's needed in case early exit happens for
2324           * any reason.
2325           */
2326          if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2327               instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2328              (instr->opcode == aco_opcode::s_endpgm &&
2329               pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2330             bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2331          }
2332
2333          aco_ptr<Instruction> mov;
2334          if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
2335             Pseudo_instruction* pi = &instr->pseudo();
2336
2337             switch (instr->opcode) {
2338             case aco_opcode::p_extract_vector: {
2339                PhysReg reg = instr->operands[0].physReg();
2340                Definition& def = instr->definitions[0];
2341                reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2342
2343                if (reg == def.physReg())
2344                   break;
2345
2346                RegClass op_rc = def.regClass().is_subdword()
2347                                    ? def.regClass()
2348                                    : RegClass(instr->operands[0].getTemp().type(), def.size());
2349                std::map<PhysReg, copy_operation> copy_operations;
2350                copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2351                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2352                break;
2353             }
2354             case aco_opcode::p_create_vector: {
2355                std::map<PhysReg, copy_operation> copy_operations;
2356                PhysReg reg = instr->definitions[0].physReg();
2357
2358                for (const Operand& op : instr->operands) {
2359                   if (op.isConstant()) {
2360                      const Definition def = Definition(
2361                         reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
2362                      copy_operations[reg] = {op, def, op.bytes()};
2363                      reg.reg_b += op.bytes();
2364                      continue;
2365                   }
2366                   if (op.isUndefined()) {
2367                      // TODO: coalesce subdword copies if dst byte is 0
2368                      reg.reg_b += op.bytes();
2369                      continue;
2370                   }
2371
2372                   RegClass rc_def =
2373                      op.regClass().is_subdword()
2374                         ? op.regClass()
2375                         : instr->definitions[0].getTemp().regClass().resize(op.bytes());
2376                   const Definition def = Definition(reg, rc_def);
2377                   copy_operations[def.physReg()] = {op, def, op.bytes()};
2378                   reg.reg_b += op.bytes();
2379                }
2380                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2381                break;
2382             }
2383             case aco_opcode::p_split_vector: {
2384                std::map<PhysReg, copy_operation> copy_operations;
2385                PhysReg reg = instr->operands[0].physReg();
2386
2387                for (const Definition& def : instr->definitions) {
2388                   RegClass rc_op = def.regClass().is_subdword()
2389                                       ? def.regClass()
2390                                       : instr->operands[0].getTemp().regClass().resize(def.bytes());
2391                   const Operand op = Operand(reg, rc_op);
2392                   copy_operations[def.physReg()] = {op, def, def.bytes()};
2393                   reg.reg_b += def.bytes();
2394                }
2395                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2396                break;
2397             }
2398             case aco_opcode::p_parallelcopy: {
2399                std::map<PhysReg, copy_operation> copy_operations;
2400                for (unsigned j = 0; j < instr->operands.size(); j++) {
2401                   assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2402                   copy_operations[instr->definitions[j].physReg()] = {
2403                      instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2404                }
2405                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2406                break;
2407             }
2408             case aco_opcode::p_start_linear_vgpr: {
2409                if (instr->operands.empty())
2410                   break;
2411
2412                Definition def(instr->definitions[0].physReg(),
2413                               RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));
2414
2415                std::map<PhysReg, copy_operation> copy_operations;
2416                copy_operations[def.physReg()] = {instr->operands[0], def,
2417                                                  instr->operands[0].bytes()};
2418                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2419                break;
2420             }
2421             case aco_opcode::p_exit_early_if: {
2422                /* don't bother with an early exit near the end of the program */
2423                if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2424                    block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2425                   unsigned null_exp_dest =
2426                      program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2427                   bool ignore_early_exit = true;
2428
2429                   for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2430                      const aco_ptr<Instruction>& instr2 = block->instructions[k];
2431                      if (instr2->opcode == aco_opcode::s_endpgm ||
2432                          instr2->opcode == aco_opcode::p_logical_end)
2433                         continue;
2434                      else if (instr2->opcode == aco_opcode::exp &&
2435                               instr2->exp().dest == null_exp_dest &&
2436                               instr2->exp().enabled_mask == 0)
2437                         continue;
2438                      else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2439                               instr2->definitions[0].isFixed() &&
2440                               instr2->definitions[0].physReg() == exec)
2441                         continue;
2442
2443                      ignore_early_exit = false;
2444                   }
2445
2446                   if (ignore_early_exit)
2447                      break;
2448                }
2449
2450                const bool discard_sends_pops_done =
2451                   pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2452
2453                Block* discard_block =
2454                   discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2455                if (!discard_block) {
2456                   discard_block = program->create_and_insert_block();
2457                   discard_block->kind = block_kind_discard_early_exit;
2458                   if (discard_sends_pops_done) {
2459                      discard_pops_done_and_exit_block = discard_block;
2460                   } else {
2461                      discard_exit_block = discard_block;
2462                   }
2463                   block = &program->blocks[block_idx];
2464
2465                   bld.reset(discard_block);
2466                   if (program->has_pops_overlapped_waves_wait &&
2467                       (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2468                      /* If this discard early exit potentially exits the POPS ordered section, do
2469                       * the waitcnt necessary before resuming overlapping waves as the normal
2470                       * waitcnt insertion doesn't work in a discard early exit block.
2471                       */
2472                      if (program->gfx_level >= GFX10)
2473                         bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
2474                      wait_imm pops_exit_wait_imm;
2475                      pops_exit_wait_imm.vm = 0;
2476                      if (program->has_smem_buffer_or_global_loads)
2477                         pops_exit_wait_imm.lgkm = 0;
2478                      bld.sopp(aco_opcode::s_waitcnt, -1,
2479                               pops_exit_wait_imm.pack(program->gfx_level));
2480                   }
2481                   if (discard_sends_pops_done)
2482                      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2483                   unsigned target = V_008DFC_SQ_EXP_NULL;
2484                   if (program->gfx_level >= GFX11)
2485                      target =
2486                         program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2487                   if (program->stage == fragment_fs)
2488                      bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2489                              target, false, true, true);
2490                   if (should_dealloc_vgprs)
2491                      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
2492                   bld.sopp(aco_opcode::s_endpgm);
2493
2494                   bld.reset(&ctx.instructions);
2495                }
2496
2497                assert(instr->operands[0].physReg() == scc);
2498                bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
2499
2500                discard_block->linear_preds.push_back(block->index);
2501                block->linear_succs.push_back(discard_block->index);
2502                break;
2503             }
2504             case aco_opcode::p_spill: {
2505                assert(instr->operands[0].regClass() == v1.as_linear());
2506                for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2507                   Operand src =
2508                      instr->operands[2].isConstant()
2509                         ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2510                         : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2511                   bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2512                                 Operand::c32(instr->operands[1].constantValue() + i),
2513                                 instr->operands[0]);
2514                }
2515                break;
2516             }
2517             case aco_opcode::p_reload: {
2518                assert(instr->operands[0].regClass() == v1.as_linear());
2519                for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2520                   bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2521                                instr->operands[0],
2522                                Operand::c32(instr->operands[1].constantValue() + i));
2523                break;
2524             }
2525             case aco_opcode::p_as_uniform: {
2526                if (instr->operands[0].isConstant() ||
2527                    instr->operands[0].regClass().type() == RegType::sgpr) {
2528                   std::map<PhysReg, copy_operation> copy_operations;
2529                   copy_operations[instr->definitions[0].physReg()] = {
2530                      instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2531                   handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2532                } else {
2533                   assert(instr->operands[0].regClass().type() == RegType::vgpr);
2534                   assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2535                   assert(instr->operands[0].size() == instr->definitions[0].size());
2536                   for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2537                      bld.vop1(aco_opcode::v_readfirstlane_b32,
2538                               bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2539                               Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2540                   }
2541                }
2542                break;
2543             }
2544             case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2545                bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2546                         Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2547                break;
2548             }
2549             case aco_opcode::p_bpermute_readlane: {
2550                emit_bpermute_readlane(program, instr, bld);
2551                break;
2552             }
2553             case aco_opcode::p_bpermute_shared_vgpr: {
2554                emit_bpermute_shared_vgpr(program, instr, bld);
2555                break;
2556             }
2557             case aco_opcode::p_bpermute_permlane: {
2558                emit_bpermute_permlane(program, instr, bld);
2559                break;
2560             }
2561             case aco_opcode::p_constaddr: {
2562                unsigned id = instr->definitions[0].tempId();
2563                PhysReg reg = instr->definitions[0].physReg();
2564                bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2565                bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2566                         Operand(reg, s1), instr->operands[0], Operand::c32(id));
2567                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2568                break;
2569             }
2570             case aco_opcode::p_resume_shader_address: {
2571                /* Find index of resume block. */
2572                unsigned resume_idx = instr->operands[0].constantValue();
2573                unsigned resume_block_idx = 0;
2574                for (Block& resume_block : program->blocks) {
2575                   if (resume_block.kind & block_kind_resume) {
2576                      if (resume_idx == 0) {
2577                         resume_block_idx = resume_block.index;
2578                         break;
2579                      }
2580                      resume_idx--;
2581                   }
2582                }
2583                assert(resume_block_idx != 0);
2584                unsigned id = instr->definitions[0].tempId();
2585                PhysReg reg = instr->definitions[0].physReg();
2586                bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2587                bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2588                         Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2589                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2590                break;
2591             }
2592             case aco_opcode::p_extract: {
2593                assert(instr->operands[1].isConstant());
2594                assert(instr->operands[2].isConstant());
2595                assert(instr->operands[3].isConstant());
2596                if (instr->definitions[0].regClass() == s1)
2597                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2598                Definition dst = instr->definitions[0];
2599                Operand op = instr->operands[0];
2600                unsigned bits = instr->operands[2].constantValue();
2601                unsigned index = instr->operands[1].constantValue();
2602                unsigned offset = index * bits;
2603                bool signext = !instr->operands[3].constantEquals(0);
2604
2605                if (dst.regClass() == s1) {
2606                   if (offset == (32 - bits)) {
2607                      bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2608                               bld.def(s1, scc), op, Operand::c32(offset));
2609                   } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2610                      bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2611                               dst, op);
2612                   } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2613                      bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2614                   } else {
2615                      bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2616                               bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2617                   }
2618                } else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
2619                           ctx.program->gfx_level <= GFX7) {
2620                   assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2621                   if (offset == (32 - bits) && op.regClass() != s1) {
2622                      bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2623                               Operand::c32(offset), op);
2624                   } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2625                      bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2626                               op);
2627                   } else {
2628                      bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2629                               Operand::c32(offset), Operand::c32(bits));
2630                   }
2631                } else {
2632                   assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2633                          op.regClass() == v1b);
2634                   if (ctx.program->gfx_level >= GFX11) {
2635                      unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2636                      unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
2637
2638                      uint8_t swiz[4] = {4, 5, 6, 7};
2639                      swiz[dst.physReg().byte()] = op_vgpr_byte;
2640                      if (bits == 16)
2641                         swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2642                      for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2643                         uint8_t ext = bperm_0;
2644                         if (signext) {
2645                            if (sign_byte == 1)
2646                               ext = bperm_b1_sign;
2647                            else if (sign_byte == 3)
2648                               ext = bperm_b3_sign;
2649                            else /* replicate so sign-extension can be done later */
2650                               ext = sign_byte;
2651                         }
2652                         swiz[dst.physReg().byte() + i] = ext;
2653                      }
2654                      create_bperm(bld, swiz, dst, op);
2655
2656                      if (signext && sign_byte != 3 && sign_byte != 1) {
2657                         assert(bits == 8);
2658                         assert(dst.regClass() == v2b || dst.regClass() == v1);
2659                         uint8_t ext_swiz[4] = {4, 5, 6, 7};
2660                         uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2661                         memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2662                         create_bperm(bld, ext_swiz, dst, Operand::zero());
2663                      }
2664                   } else {
2665                      SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2666                      sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2667                   }
2668                }
2669                break;
2670             }
2671             case aco_opcode::p_insert: {
2672                assert(instr->operands[1].isConstant());
2673                assert(instr->operands[2].isConstant());
2674                if (instr->definitions[0].regClass() == s1)
2675                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2676                Definition dst = instr->definitions[0];
2677                Operand op = instr->operands[0];
2678                unsigned bits = instr->operands[2].constantValue();
2679                unsigned index = instr->operands[1].constantValue();
2680                unsigned offset = index * bits;
2681
2682                bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2683                if (dst.regClass() == s1) {
2684                   if (offset == (32 - bits)) {
2685                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2686                               Operand::c32(offset));
2687                   } else if (offset == 0) {
2688                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2689                               Operand::c32(bits << 16));
2690                   } else {
2691                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2692                               Operand::c32(bits << 16));
2693                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2694                               Operand(dst.physReg(), s1), Operand::c32(offset));
2695                   }
2696                } else if (dst.regClass() == v1 || !has_sdwa) {
2697                   if (offset == (dst.bytes() * 8u - bits) &&
2698                       (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2699                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2700                   } else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2701                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2702                   } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2703                      bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2704                         SubdwordSel(bits / 8, offset / 8, false);
2705                   } else if (program->gfx_level >= GFX11) {
2706                      uint8_t swiz[] = {4, 5, 6, 7};
2707                      for (unsigned i = 0; i < dst.bytes(); i++)
2708                         swiz[dst.physReg().byte() + i] = bperm_0;
2709                      for (unsigned i = 0; i < bits / 8; i++)
2710                         swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2711                      create_bperm(bld, swiz, dst, op);
2712                   } else {
2713                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2714                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2715                               Operand(dst.physReg(), v1));
2716                   }
2717                } else {
2718                   assert(dst.regClass() == v2b);
2719                   bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2720                      ->sdwa()
2721                      .sel[1] = SubdwordSel::ubyte;
2722                }
2723                break;
2724             }
2725             case aco_opcode::p_init_scratch: {
2726                assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2727                if (!program->config->scratch_bytes_per_wave)
2728                   break;
2729
2730                Operand scratch_addr = instr->operands[0];
2731                if (scratch_addr.isUndefined()) {
2732                   PhysReg reg = instr->definitions[0].physReg();
2733                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2734                            Operand::c32(aco_symbol_scratch_addr_lo));
2735                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2736                            Operand::c32(aco_symbol_scratch_addr_hi));
2737                   scratch_addr.setFixed(reg);
2738                } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2739                   bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2740                            Operand::zero());
2741                   scratch_addr.setFixed(instr->definitions[0].physReg());
2742                }
2743
2744                hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2745                break;
2746             }
2747             case aco_opcode::p_jump_to_epilog: {
2748                if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
2749                   bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2750                }
2751                bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
2752                break;
2753             }
2754             case aco_opcode::p_interp_gfx11: {
2755                assert(instr->definitions[0].regClass() == v1 ||
2756                       instr->definitions[0].regClass() == v2b);
2757                assert(instr->operands[0].regClass() == v1.as_linear());
2758                assert(instr->operands[1].isConstant());
2759                assert(instr->operands[2].isConstant());
2760                assert(instr->operands.back().physReg() == m0);
2761                Definition dst = instr->definitions[0];
2762                PhysReg lin_vgpr = instr->operands[0].physReg();
2763                unsigned attribute = instr->operands[1].constantValue();
2764                unsigned component = instr->operands[2].constantValue();
2765                uint16_t dpp_ctrl = 0;
2766                Operand coord1, coord2;
2767                if (instr->operands.size() == 6) {
2768                   assert(instr->operands[3].regClass() == v1);
2769                   assert(instr->operands[4].regClass() == v1);
2770                   coord1 = instr->operands[3];
2771                   coord2 = instr->operands[4];
2772                } else {
2773                   assert(instr->operands[3].isConstant());
2774                   dpp_ctrl = instr->operands[3].constantValue();
2775                }
2776
2777                bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
2778                           attribute, component);
2779
2780                Operand p(lin_vgpr, v1);
2781                Operand dst_op(dst.physReg(), v1);
2782                if (instr->operands.size() == 5) {
2783                   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
2784                } else if (dst.regClass() == v2b) {
2785                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
2786                                     coord1, p);
2787                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
2788                                     coord2, dst_op);
2789                } else {
2790                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
2791                                     p);
2792                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
2793                                     dst_op);
2794                }
2795                break;
2796             }
2797             case aco_opcode::p_dual_src_export_gfx11: {
2798                PhysReg dst0 = instr->definitions[0].physReg();
2799                PhysReg dst1 = instr->definitions[1].physReg();
2800                Definition tmp = instr->definitions[2];
2801                Definition exec_tmp = instr->definitions[3];
2802                Definition clobber_vcc = instr->definitions[4];
2803                Definition clobber_scc = instr->definitions[5];
2804
2805                assert(tmp.regClass() == v1);
2806                assert(exec_tmp.regClass() == bld.lm);
2807                assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
2808                assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
2809
2810                bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
2811                         Operand(exec, bld.lm));
2812                bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
2813                         Operand(exec, bld.lm));
2814
2815                uint8_t enabled_channels = 0;
2816                Operand mrt0[4], mrt1[4];
2817
2818                bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
2819                         Operand::c32(0x55555555));
2820                if (ctx.program->wave_size == 64)
2821                   bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
2822                            Operand::c32(0x55555555));
2823
2824                for (unsigned i = 0; i < 4; i++) {
2825                   if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
2826                      mrt0[i] = instr->operands[i];
2827                      mrt1[i] = instr->operands[i + 4];
2828                      continue;
2829                   }
2830
2831                   Operand src0 = instr->operands[i];
2832                   Operand src1 = instr->operands[i + 4];
2833
2834                   uint32_t lane_sel_xor1 = 0;
2835                   for (unsigned j = 0; j < 8; j++)
2836                      lane_sel_xor1 |= (j ^ 1) << (j * 3);
2837
2838                   /* Swap odd, even lanes of mrt0. */
2839                   bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0, lane_sel_xor1);
2840
2841                   /* Swap even lanes between mrt0 and mrt1. */
2842                   bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1,
2843                            Operand(clobber_vcc.physReg(), bld.lm));
2844                   bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1),
2845                            Operand(clobber_vcc.physReg(), bld.lm));
2846
2847                   /* Swap odd, even lanes of mrt0 again. */
2848                   bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
2849                                 Operand(tmp.physReg(), v1), lane_sel_xor1);
2850
2851                   mrt0[i] = Operand(dst0, v1);
2852                   mrt1[i] = Operand(dst1, v1);
2853
2854                   enabled_channels |= 1 << i;
2855
2856                   dst0 = dst0.advance(4);
2857                   dst1 = dst1.advance(4);
2858                }
2859
2860                bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
2861                         Operand(exec_tmp.physReg(), bld.lm));
2862
2863                /* Force export all channels when everything is undefined. */
2864                if (!enabled_channels)
2865                   enabled_channels = 0xf;
2866
2867                bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
2868                        V_008DFC_SQ_EXP_MRT + 21, false);
2869                bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
2870                        V_008DFC_SQ_EXP_MRT + 22, false);
2871                break;
2872             }
2873             default: break;
2874             }
2875          } else if (instr->isBranch()) {
2876             Pseudo_branch_instruction* branch = &instr->branch();
2877             const uint32_t target = branch->target[0];
2878             const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
2879                                            branch->opcode == aco_opcode::p_cbranch_nz) &&
2880                                           branch->operands[0].physReg() == exec);
2881
2882             /* Check if the branch instruction can be removed.
2883              * This is beneficial when executing the next block with an empty exec mask
2884              * is faster than the branch instruction itself.
2885              *
2886              * Override this judgement when:
2887              * - The application prefers to remove control flow
2888              * - The compiler stack knows that it's a divergent branch always taken
2889              */
2890             const bool prefer_remove =
2891                branch->selection_control_remove && ctx.program->gfx_level >= GFX10;
2892             bool can_remove = block->index < target;
2893             unsigned num_scalar = 0;
2894             unsigned num_vector = 0;
2895
2896             /* Check the instructions between branch and target */
2897             for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
2898                /* Uniform conditional branches must not be ignored if they
2899                 * are about to jump over actual instructions */
2900                if (uniform_branch && !program->blocks[i].instructions.empty())
2901                   can_remove = false;
2902
2903                if (!can_remove)
2904                   break;
2905
2906                for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
2907                   if (inst->isSOPP()) {
2908                      /* Discard early exits and loop breaks and continues should work fine with an
2909                       * empty exec mask.
2910                       */
2911                      bool is_break_continue =
2912                         program->blocks[i].kind & (block_kind_break | block_kind_continue);
2913                      bool discard_early_exit =
2914                         inst->sopp().block != -1 &&
2915                         (program->blocks[inst->sopp().block].kind & block_kind_discard_early_exit);
2916                      if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
2917                           inst->opcode != aco_opcode::s_cbranch_scc1) ||
2918                          (!discard_early_exit && !is_break_continue))
2919                         can_remove = false;
2920                   } else if (inst->isSALU()) {
2921                      num_scalar++;
2922                   } else if (inst->isVALU() || inst->isVINTRP()) {
2923                      num_vector++;
2924                      /* VALU which writes SGPRs are always executed on GFX10+ */
2925                      if (ctx.program->gfx_level >= GFX10) {
2926                         for (Definition& def : inst->definitions) {
2927                            if (def.regClass().type() == RegType::sgpr)
2928                               num_scalar++;
2929                         }
2930                      }
2931                   } else if (inst->isEXP()) {
2932                      /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */
2933                      can_remove = false;
2934                   } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
2935                              inst->isLDSDIR()) {
2936                      // TODO: GFX6-9 can use vskip
2937                      can_remove = prefer_remove;
2938                   } else if (inst->isSMEM()) {
2939                      /* SMEM are at least as expensive as branches */
2940                      can_remove = prefer_remove;
2941                   } else if (inst->isBarrier()) {
2942                      can_remove = prefer_remove;
2943                   } else {
2944                      can_remove = false;
2945                      assert(false && "Pseudo instructions should be lowered by this point.");
2946                   }
2947
2948                   if (!prefer_remove) {
2949                      /* Under these conditions, we shouldn't remove the branch.
2950                       * Don't care about the estimated cycles when the shader prefers flattening.
2951                       */
2952                      unsigned est_cycles;
2953                      if (ctx.program->gfx_level >= GFX10)
2954                         est_cycles = num_scalar * 2 + num_vector;
2955                      else
2956                         est_cycles = num_scalar * 4 + num_vector * 4;
2957
2958                      if (est_cycles > 16)
2959                         can_remove = false;
2960                   }
2961
2962                   if (!can_remove)
2963                      break;
2964                }
2965             }
2966
2967             if (can_remove)
2968                continue;
2969
2970             /* emit branch instruction */
2971             switch (instr->opcode) {
2972             case aco_opcode::p_branch:
2973                assert(block->linear_succs[0] == target);
2974                bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
2975                break;
2976             case aco_opcode::p_cbranch_nz:
2977                assert(block->linear_succs[1] == target);
2978                if (branch->operands[0].physReg() == exec)
2979                   bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
2980                else if (branch->operands[0].physReg() == vcc)
2981                   bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
2982                else {
2983                   assert(branch->operands[0].physReg() == scc);
2984                   bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
2985                }
2986                break;
2987             case aco_opcode::p_cbranch_z:
2988                assert(block->linear_succs[1] == target);
2989                if (branch->operands[0].physReg() == exec)
2990                   bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
2991                else if (branch->operands[0].physReg() == vcc)
2992                   bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
2993                else {
2994                   assert(branch->operands[0].physReg() == scc);
2995                   bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
2996                }
2997                break;
2998             default: unreachable("Unknown Pseudo branch instruction!");
2999             }
3000
3001          } else if (instr->isReduction()) {
3002             Pseudo_reduction_instruction& reduce = instr->reduction();
3003             emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
3004                            reduce.operands[1].physReg(),    // tmp
3005                            reduce.definitions[1].physReg(), // stmp
3006                            reduce.operands[2].physReg(),    // vtmp
3007                            reduce.definitions[2].physReg(), // sitmp
3008                            reduce.operands[0], reduce.definitions[0]);
3009          } else if (instr->isBarrier()) {
3010             Pseudo_barrier_instruction& barrier = instr->barrier();
3011
3012             /* Anything larger than a workgroup isn't possible. Anything
3013              * smaller requires no instructions and this pseudo instruction
3014              * would only be included to control optimizations. */
3015             bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
3016                                   program->workgroup_size > program->wave_size;
3017
3018             bld.insert(std::move(instr));
3019             if (emit_s_barrier)
3020                bld.sopp(aco_opcode::s_barrier);
3021          } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
3022             float_mode new_mode = block->fp_mode;
3023             new_mode.round16_64 = fp_round_ne;
3024             bool set_round = new_mode.round != block->fp_mode.round;
3025
3026             emit_set_mode(bld, new_mode, set_round, false);
3027
3028             instr->opcode = aco_opcode::v_cvt_f16_f32;
3029             ctx.instructions.emplace_back(std::move(instr));
3030
3031             emit_set_mode(bld, block->fp_mode, set_round, false);
3032          } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
3033             lower_image_sample(&ctx, instr);
3034             ctx.instructions.emplace_back(std::move(instr));
3035          } else {
3036             ctx.instructions.emplace_back(std::move(instr));
3037          }
3038       }
3039
3040       /* Send the ordered section done message from this block if it's needed in this block, but
3041        * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
3042        * if the common post-dominator of multiple end locations turns out to be an empty block.
3043        */
3044       if (block_idx == pops_done_msg_bounds.end_block_idx() &&
3045           pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
3046          bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
3047       }
3048
3049       block->instructions = std::move(ctx.instructions);
3050    }
3051 }
3052
3053 } // namespace aco