src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include "aco_instruction_selection.h"
  27
  28 #include "aco_builder.h"
  29 #include "aco_interface.h"
  30 #include "aco_ir.h"
  31
  32 #include "common/ac_nir.h"
  33 #include "common/sid.h"
  34
  35 #include "util/fast_idiv_by_const.h"
  36 #include "util/memstream.h"
  37
  38 #include <array>
  39 #include <functional>
  40 #include <map>
  41 #include <numeric>
  42 #include <stack>
  43 #include <utility>
  44 #include <vector>
  45
  46 namespace aco {
  47 namespace {
  48
  49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
  50
  51 static void
  52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
  53           const char* msg)
  54 {
  55    char* out;
  56    size_t outsize;
  57    struct u_memstream mem;
  58    u_memstream_open(&mem, &out, &outsize);
  59    FILE* const memf = u_memstream_get(&mem);
  60
  61    fprintf(memf, "%s: ", msg);
  62    nir_print_instr(instr, memf);
  63    u_memstream_close(&mem);
  64
  65    _aco_err(ctx->program, file, line, out);
  66    free(out);
  67 }
  68
/* Bookkeeping for one if/else construct while it is being selected.
 *
 * NOTE(review): the helpers that fill and consume this struct are outside this
 * part of the file; the per-field notes below are derived from the names only
 * and should be confirmed against those helpers.
 */
struct if_context {
   /* Branch condition of the if. */
   Temp cond;

   /* Fields with the "_old" suffix presumably hold the enclosing control-flow
    * state so it can be restored once the if has been emitted. */
   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   bool had_divergent_discard_old;
   bool had_divergent_discard_then;
   uint16_t exec_potentially_empty_break_depth_old;

   /* Block indices and pre-built blocks belonging to this if. */
   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};
  86
/* Bookkeeping for one loop while it is being selected.
 *
 * NOTE(review): the code that fills and consumes this struct is outside this
 * part of the file; the notes below are derived from the field names only.
 */
struct loop_context {
   /* Block that is reached when the loop is left. */
   Block loop_exit;

   /* Fields with the "_old" suffix presumably hold the state of the enclosing
    * loop/control flow so it can be restored after this loop. */
   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};
  96
/* Forward declaration; the definition is not part of this section of the file. */
static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
  98
  99 static void
 100 add_logical_edge(unsigned pred_idx, Block* succ)
 101 {
 102    succ->logical_preds.emplace_back(pred_idx);
 103 }
 104
 105 static void
 106 add_linear_edge(unsigned pred_idx, Block* succ)
 107 {
 108    succ->linear_preds.emplace_back(pred_idx);
 109 }
 110
 111 static void
 112 add_edge(unsigned pred_idx, Block* succ)
 113 {
 114    add_logical_edge(pred_idx, succ);
 115    add_linear_edge(pred_idx, succ);
 116 }
 117
 118 static void
 119 append_logical_start(Block* b)
 120 {
 121    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 122 }
 123
 124 static void
 125 append_logical_end(Block* b)
 126 {
 127    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 128 }
 129
 130 Temp
 131 get_ssa_temp(struct isel_context* ctx, nir_def* def)
 132 {
 133    uint32_t id = ctx->first_temp_id + def->index;
 134    return Temp(id, ctx->program->temp_rc[id]);
 135 }
 136
 137 Temp
 138 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
 139 {
 140    Builder bld(ctx->program, ctx->block);
 141    assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
 142    assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
 143
 144    if (ctx->program->wave_size == 32) {
 145       Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
 146       return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
 147    }
 148
 149    Operand mask_lo = Operand::c32(-1u);
 150    Operand mask_hi = Operand::c32(-1u);
 151
 152    if (mask.isTemp()) {
 153       RegClass rc = RegClass(mask.regClass().type(), 1);
 154       Builder::Result mask_split =
 155          bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
 156       mask_lo = Operand(mask_split.def(0).getTemp());
 157       mask_hi = Operand(mask_split.def(1).getTemp());
 158    } else if (mask.physReg() == exec) {
 159       mask_lo = Operand(exec_lo, s1);
 160       mask_hi = Operand(exec_hi, s1);
 161    }
 162
 163    Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
 164
 165    if (ctx->program->gfx_level <= GFX7)
 166       return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
 167    else
 168       return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
 169 }
 170
 171 inline void
 172 set_wqm(isel_context* ctx, bool enable_helpers = false)
 173 {
 174    if (ctx->program->stage == fragment_fs) {
 175       ctx->wqm_block_idx = ctx->block->index;
 176       ctx->wqm_instruction_idx = ctx->block->instructions.size();
 177       ctx->program->needs_wqm |= enable_helpers;
 178    }
 179 }
 180
 181 static Temp
 182 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
 183 {
 184    if (index.regClass() == s1)
 185       return bld.readlane(bld.def(s1), data, index);
 186
 187    /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
 188     * of multiple binaries, because the VGPR use is not known when choosing
 189     * which registers to use for the shared VGPRs.
 190     */
 191    const bool avoid_shared_vgprs =
 192       ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
 193       ctx->program->wave_size == 64 &&
 194       (ctx->program->info.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
 195        ctx->stage == raytracing_cs);
 196
 197    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
 198       /* GFX6-7: there is no bpermute instruction */
 199       Operand index_op(index);
 200       Operand input_data(data);
 201       index_op.setLateKill(true);
 202       input_data.setLateKill(true);
 203
 204       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
 205                         bld.def(bld.lm, vcc), index_op, input_data);
 206    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
 207
 208       /* GFX10 wave64 mode: emulate full-wave bpermute */
 209       Temp index_is_lo =
 210          bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
 211       Builder::Result index_is_lo_split =
 212          bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
 213       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
 214                                      index_is_lo_split.def(1).getTemp());
 215       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 216                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
 217       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 218       Operand input_data(data);
 219
 220       index_x4.setLateKill(true);
 221       input_data.setLateKill(true);
 222       same_half.setLateKill(true);
 223
 224       if (ctx->options->gfx_level <= GFX10_3) {
 225          /* We need one pair of shared VGPRs:
 226           * Note, that these have twice the allocation granularity of normal VGPRs
 227           */
 228          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 229
 230          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
 231                            bld.def(s1, scc), index_x4, input_data, same_half);
 232       } else {
 233          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
 234                            bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
 235                            same_half);
 236       }
 237    } else {
 238       /* GFX8-9 or GFX10 wave32: bpermute works normally */
 239       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 240       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
 241    }
 242 }
 243
 244 static Temp
 245 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
 246 {
 247    if (ctx->options->gfx_level >= GFX8) {
 248       unsigned and_mask = mask & 0x1f;
 249       unsigned or_mask = (mask >> 5) & 0x1f;
 250       unsigned xor_mask = (mask >> 10) & 0x1f;
 251
 252       /* Eliminate or_mask. */
 253       and_mask &= ~or_mask;
 254       xor_mask ^= or_mask;
 255
 256       uint16_t dpp_ctrl = 0xffff;
 257
 258       /* DPP16 before DPP8 before v_permlane(x)16_b32
 259        * because DPP16 supports modifiers and v_permlane
 260        * can't be folded into valu instructions.
 261        */
 262       if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
 263          unsigned res[4];
 264          for (unsigned i = 0; i < 4; i++)
 265             res[i] = ((i & and_mask) ^ xor_mask);
 266          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
 267       } else if (and_mask == 0x1f && xor_mask == 8) {
 268          dpp_ctrl = dpp_row_rr(8);
 269       } else if (and_mask == 0x1f && xor_mask == 0xf) {
 270          dpp_ctrl = dpp_row_mirror;
 271       } else if (and_mask == 0x1f && xor_mask == 0x7) {
 272          dpp_ctrl = dpp_row_half_mirror;
 273       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
 274          dpp_ctrl = dpp_row_share(xor_mask);
 275       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
 276          dpp_ctrl = dpp_row_xmask(xor_mask);
 277       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
 278          uint32_t lane_sel = 0;
 279          for (unsigned i = 0; i < 8; i++)
 280             lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
 281          return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
 282       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
 283          uint64_t lane_mask = 0;
 284          for (unsigned i = 0; i < 16; i++)
 285             lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
 286          aco_opcode opcode =
 287             xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
 288          Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
 289          Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
 290          Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
 291          ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
 292          ret->valu().opsel[1] = true;     /* set BOUND_CTRL */
 293          return ret;
 294       }
 295
 296       if (dpp_ctrl != 0xffff)
 297          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
 298                              allow_fi);
 299    }
 300
 301    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
 302 }
 303
 304 Temp
 305 as_vgpr(Builder& bld, Temp val)
 306 {
 307    if (val.type() == RegType::sgpr)
 308       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 309    assert(val.type() == RegType::vgpr);
 310    return val;
 311 }
 312
 313 Temp
 314 as_vgpr(isel_context* ctx, Temp val)
 315 {
 316    Builder bld(ctx->program, ctx->block);
 317    return as_vgpr(bld, val);
 318 }
 319
 320 void
 321 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 322 {
 323    Builder bld(ctx->program, ctx->block);
 324    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
 325 }
 326
 327 Temp
 328 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 329 {
 330    /* no need to extract the whole vector */
 331    if (src.regClass() == dst_rc) {
 332       assert(idx == 0);
 333       return src;
 334    }
 335
 336    assert(src.bytes() > (idx * dst_rc.bytes()));
 337    Builder bld(ctx->program, ctx->block);
 338    auto it = ctx->allocated_vec.find(src.id());
 339    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
 340       if (it->second[idx].regClass() == dst_rc) {
 341          return it->second[idx];
 342       } else {
 343          assert(!dst_rc.is_subdword());
 344          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 345          return bld.copy(bld.def(dst_rc), it->second[idx]);
 346       }
 347    }
 348
 349    if (dst_rc.is_subdword())
 350       src = as_vgpr(ctx, src);
 351
 352    if (src.bytes() == dst_rc.bytes()) {
 353       assert(idx == 0);
 354       return bld.copy(bld.def(dst_rc), src);
 355    } else {
 356       Temp dst = bld.tmp(dst_rc);
 357       emit_extract_vector(ctx, src, idx, dst);
 358       return dst;
 359    }
 360 }
 361
 362 void
 363 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 364 {
 365    if (num_components == 1)
 366       return;
 367    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 368       return;
 369    RegClass rc;
 370    if (num_components > vec_src.size()) {
 371       if (vec_src.type() == RegType::sgpr) {
 372          /* should still help get_alu_src() */
 373          emit_split_vector(ctx, vec_src, vec_src.size());
 374          return;
 375       }
 376       /* sub-dword split */
 377       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
 378    } else {
 379       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
 380    }
 381    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
 382       aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 383    split->operands[0] = Operand(vec_src);
 384    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 385    for (unsigned i = 0; i < num_components; i++) {
 386       elems[i] = ctx->program->allocateTmp(rc);
 387       split->definitions[i] = Definition(elems[i]);
 388    }
 389    ctx->block->instructions.emplace_back(std::move(split));
 390    ctx->allocated_vec.emplace(vec_src.id(), elems);
 391 }
 392
 393 /* This vector expansion uses a mask to determine which elements in the new vector
 394  * come from the original vector. The other elements are undefined. */
 395 void
 396 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
 397               bool zero_padding = false)
 398 {
 399    assert(vec_src.type() == RegType::vgpr);
 400    Builder bld(ctx->program, ctx->block);
 401
 402    if (dst.type() == RegType::sgpr && num_components > dst.size()) {
 403       Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
 404       expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
 405       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
 406       ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
 407       return;
 408    }
 409
 410    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 411
 412    if (vec_src == dst)
 413       return;
 414
 415    if (num_components == 1) {
 416       if (dst.type() == RegType::sgpr)
 417          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 418       else
 419          bld.copy(Definition(dst), vec_src);
 420       return;
 421    }
 422
 423    unsigned component_bytes = dst.bytes() / num_components;
 424    RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
 425    RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
 426    assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
 427    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 428
 429    Temp padding = Temp(0, dst_rc);
 430    if (zero_padding)
 431       padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
 432
 433    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
 434       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 435    vec->definitions[0] = Definition(dst);
 436    unsigned k = 0;
 437    for (unsigned i = 0; i < num_components; i++) {
 438       if (mask & (1 << i)) {
 439          Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
 440          if (dst.type() == RegType::sgpr)
 441             src = bld.as_uniform(src);
 442          vec->operands[i] = Operand(src);
 443          elems[i] = src;
 444       } else {
 445          vec->operands[i] = Operand::zero(component_bytes);
 446          elems[i] = padding;
 447       }
 448    }
 449    ctx->block->instructions.emplace_back(std::move(vec));
 450    ctx->allocated_vec.emplace(dst.id(), elems);
 451 }
 452
 453 /* adjust misaligned small bit size loads */
 454 void
 455 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
 456 {
 457    Builder bld(ctx->program, ctx->block);
 458    Operand shift;
 459    Temp select = Temp();
 460    if (offset.isConstant()) {
 461       assert(offset.constantValue() && offset.constantValue() < 4);
 462       shift = Operand::c32(offset.constantValue() * 8);
 463    } else {
 464       /* bit_offset = 8 * (offset & 0x3) */
 465       Temp tmp =
 466          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
 467       select = bld.tmp(s1);
 468       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
 469                        Operand::c32(3u));
 470    }
 471
 472    if (vec.size() == 1) {
 473       bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
 474    } else if (vec.size() == 2) {
 475       Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
 476       bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
 477       if (tmp == dst)
 478          emit_split_vector(ctx, dst, 2);
 479       else
 480          emit_extract_vector(ctx, tmp, 0, dst);
 481    } else if (vec.size() == 3 || vec.size() == 4) {
 482       Temp lo = bld.tmp(s2), hi;
 483       if (vec.size() == 3) {
 484          /* this can happen if we use VMEM for a uniform load */
 485          hi = bld.tmp(s1);
 486          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
 487       } else {
 488          hi = bld.tmp(s2);
 489          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
 490          hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
 491       }
 492       if (select != Temp())
 493          hi =
 494             bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
 495       lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
 496       Temp mid = bld.tmp(s1);
 497       lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
 498       hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
 499       mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
 500       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
 501       emit_split_vector(ctx, dst, 2);
 502    }
 503 }
 504
 505 void
 506 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
 507 {
 508    Builder bld(ctx->program, ctx->block);
 509    if (offset.isTemp()) {
 510       Temp tmp[4] = {vec, vec, vec, vec};
 511
 512       if (vec.size() == 4) {
 513          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
 514          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
 515                     Definition(tmp[2]), Definition(tmp[3]), vec);
 516       } else if (vec.size() == 3) {
 517          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
 518          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
 519                     Definition(tmp[2]), vec);
 520       } else if (vec.size() == 2) {
 521          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
 522          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
 523       }
 524       for (unsigned i = 0; i < dst.size(); i++)
 525          tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
 526
 527       vec = tmp[0];
 528       if (dst.size() == 2)
 529          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
 530
 531       offset = Operand::zero();
 532    }
 533
 534    unsigned num_components = vec.bytes() / component_size;
 535    if (vec.regClass() == dst.regClass()) {
 536       assert(offset.constantValue() == 0);
 537       bld.copy(Definition(dst), vec);
 538       emit_split_vector(ctx, dst, num_components);
 539       return;
 540    }
 541
 542    emit_split_vector(ctx, vec, num_components);
 543    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 544    RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
 545
 546    assert(offset.constantValue() % component_size == 0);
 547    unsigned skip = offset.constantValue() / component_size;
 548    for (unsigned i = skip; i < num_components; i++)
 549       elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
 550
 551    if (dst.type() == RegType::vgpr) {
 552       /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
 553       num_components = dst.bytes() / component_size;
 554       aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
 555          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 556       for (unsigned i = 0; i < num_components; i++)
 557          create_vec->operands[i] = Operand(elems[i]);
 558       create_vec->definitions[0] = Definition(dst);
 559       bld.insert(std::move(create_vec));
 560
 561    } else if (skip) {
 562       /* if dst is sgpr - split the src, but move the original to sgpr. */
 563       vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
 564       byte_align_scalar(ctx, vec, offset, dst);
 565    } else {
 566       assert(dst.size() == vec.size());
 567       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
 568    }
 569
 570    ctx->allocated_vec.emplace(dst.id(), elems);
 571 }
 572
 573 Temp
 574 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
 575 {
 576    RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
 577    Temp tmp = get_ssa_temp(ctx, def);
 578    if (tmp.bytes() != rc.bytes())
 579       return emit_extract_vector(ctx, tmp, 0, rc);
 580    else
 581       return tmp;
 582 }
 583
 584 Temp
 585 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
 586 {
 587    Builder bld(ctx->program, ctx->block);
 588    if (!dst.id())
 589       dst = bld.tmp(bld.lm);
 590
 591    assert(val.regClass() == s1);
 592    assert(dst.regClass() == bld.lm);
 593
 594    return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
 595                    bld.scc(val));
 596 }
 597
 598 Temp
 599 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
 600 {
 601    Builder bld(ctx->program, ctx->block);
 602    if (!dst.id())
 603       dst = bld.tmp(s1);
 604
 605    assert(val.regClass() == bld.lm);
 606    assert(dst.regClass() == s1);
 607
 608    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 609    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
 610    return dst;
 611 }
 612
 613 /**
 614  * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 615  * src_bits and dst_bits are truncated.
 616  *
 617  * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 618  * bit is indicated by src_bits in this case.
 619  *
 620  * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 621  */
 622 Temp
 623 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
 624             bool sign_extend, Temp dst = Temp())
 625 {
 626    assert(!(sign_extend && dst_bits < src_bits) &&
 627           "Shrinking integers is not supported for signed inputs");
 628
 629    if (!dst.id()) {
 630       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
 631          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
 632       else
 633          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
 634    }
 635
 636    assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
 637    assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
 638
 639    if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
 640       /* Copy the raw value, leaving an undefined value in the upper bits for
 641        * the caller to handle appropriately */
 642       return bld.copy(Definition(dst), src);
 643    } else if (dst.bytes() < src.bytes()) {
 644       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
 645    }
 646
 647    Temp tmp = dst;
 648    if (dst_bits == 64)
 649       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
 650
 651    if (tmp == src) {
 652    } else if (src.regClass() == s1) {
 653       assert(src_bits < 32);
 654       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
 655                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 656    } else {
 657       assert(src_bits < 32);
 658       bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
 659                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 660    }
 661
 662    if (dst_bits == 64) {
 663       if (sign_extend && dst.regClass() == s2) {
 664          Temp high =
 665             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
 666          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 667       } else if (sign_extend && dst.regClass() == v2) {
 668          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
 669          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 670       } else {
 671          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
 672       }
 673    }
 674
 675    return dst;
 676 }
 677
/* How extract_8_16_bit_sgpr_element() treats the bits above the extracted
 * 8/16-bit element. */
enum sgpr_extract_mode {
   sgpr_extract_sext,  /* sign-extend the element */
   sgpr_extract_zext,  /* extract without sign extension */
   sgpr_extract_undef, /* allows a plain copy when the element is at position 0 */
};
 683
 684 Temp
 685 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
 686 {
 687    Temp vec = get_ssa_temp(ctx, src->src.ssa);
 688    unsigned src_size = src->src.ssa->bit_size;
 689    unsigned swizzle = src->swizzle[0];
 690
 691    if (vec.size() > 1) {
 692       assert(src_size == 16);
 693       vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
 694       swizzle = swizzle & 1;
 695    }
 696
 697    Builder bld(ctx->program, ctx->block);
 698    Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
 699
 700    if (mode == sgpr_extract_undef && swizzle == 0)
 701       bld.copy(Definition(tmp), vec);
 702    else
 703       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
 704                  Operand::c32(swizzle), Operand::c32(src_size),
 705                  Operand::c32((mode == sgpr_extract_sext)));
 706
 707    if (dst.regClass() == s2)
 708       convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
 709
 710    return dst;
 711 }
 712
 713 Temp
 714 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
 715 {
 716    if (src.src.ssa->num_components == 1 && size == 1)
 717       return get_ssa_temp(ctx, src.src.ssa);
 718
 719    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 720    unsigned elem_size = src.src.ssa->bit_size / 8u;
 721    bool identity_swizzle = true;
 722
 723    for (unsigned i = 0; identity_swizzle && i < size; i++) {
 724       if (src.swizzle[i] != i)
 725          identity_swizzle = false;
 726    }
 727    if (identity_swizzle)
 728       return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
 729
 730    assert(elem_size > 0);
 731    assert(vec.bytes() % elem_size == 0);
 732
 733    if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
 734       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
 735       return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
 736                                            sgpr_extract_undef);
 737    }
 738
 739    bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
 740    if (as_uniform)
 741       vec = as_vgpr(ctx, vec);
 742
 743    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
 744                                     : RegClass(vec.type(), elem_size / 4);
 745    if (size == 1) {
 746       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 747    } else {
 748       assert(size <= 4);
 749       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 750       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
 751          aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 752       for (unsigned i = 0; i < size; ++i) {
 753          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 754          vec_instr->operands[i] = Operand{elems[i]};
 755       }
 756       Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
 757       vec_instr->definitions[0] = Definition(dst);
 758       ctx->block->instructions.emplace_back(std::move(vec_instr));
 759       ctx->allocated_vec.emplace(dst.id(), elems);
 760       return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
 761    }
 762 }
 763
 764 Temp
 765 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
 766 {
 767    /* returns v2b or v1 for vop3p usage.
 768     * The source expects exactly 2 16bit components
 769     * which are within the same dword
 770     */
 771    assert(src.src.ssa->bit_size == 16);
 772    assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
 773
 774    Temp tmp = get_ssa_temp(ctx, src.src.ssa);
 775    if (tmp.size() == 1)
 776       return tmp;
 777
 778    /* the size is larger than 1 dword: check the swizzle */
 779    unsigned dword = src.swizzle[0] >> 1;
 780
 781    /* extract a full dword if possible */
 782    if (tmp.bytes() >= (dword + 1) * 4) {
 783       /* if the source is split into components, use p_create_vector */
 784       auto it = ctx->allocated_vec.find(tmp.id());
 785       if (it != ctx->allocated_vec.end()) {
 786          unsigned index = dword << 1;
 787          Builder bld(ctx->program, ctx->block);
 788          if (it->second[index].regClass() == v2b)
 789             return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
 790                               it->second[index + 1]);
 791       }
 792       return emit_extract_vector(ctx, tmp, dword, v1);
 793    } else {
 794       /* This must be a swizzled access to %a.zz where %a is v6b */
 795       assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
 796       assert(tmp.regClass() == v6b && dword == 1);
 797       return emit_extract_vector(ctx, tmp, dword * 2, v2b);
 798    }
 799 }
 800
 801 uint32_t
 802 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
 803 {
 804    nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
 805    return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
 806 }
 807
 808 Temp
 809 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
 810 {
 811    if (ptr.size() == 2)
 812       return ptr;
 813    Builder bld(ctx->program, ctx->block);
 814    if (ptr.type() == RegType::vgpr && !non_uniform)
 815       ptr = bld.as_uniform(ptr);
 816    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
 817                      Operand::c32((unsigned)ctx->options->address32_hi));
 818 }
 819
 820 void
 821 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 822                       bool writes_scc, uint8_t uses_ub = 0)
 823 {
 824    aco_ptr<SOP2_instruction> sop2{
 825       create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 826    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 827    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 828    sop2->definitions[0] = Definition(dst);
 829    if (instr->no_unsigned_wrap)
 830       sop2->definitions[0].setNUW(true);
 831    if (writes_scc)
 832       sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
 833
 834    for (int i = 0; i < 2; i++) {
 835       if (uses_ub & (1 << i)) {
 836          uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
 837          if (src_ub <= 0xffff)
 838             sop2->operands[i].set16bit(true);
 839          else if (src_ub <= 0xffffff)
 840             sop2->operands[i].set24bit(true);
 841       }
 842    }
 843
 844    ctx->block->instructions.emplace_back(std::move(sop2));
 845 }
 846
 847 void
 848 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
 849                       bool commutative, bool swap_srcs = false, bool flush_denorms = false,
 850                       bool nuw = false, uint8_t uses_ub = 0)
 851 {
 852    Builder bld(ctx->program, ctx->block);
 853    bld.is_precise = instr->exact;
 854
 855    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 856    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 857    if (src1.type() == RegType::sgpr) {
 858       if (commutative && src0.type() == RegType::vgpr) {
 859          Temp t = src0;
 860          src0 = src1;
 861          src1 = t;
 862       } else {
 863          src1 = as_vgpr(ctx, src1);
 864       }
 865    }
 866
 867    Operand op[2] = {Operand(src0), Operand(src1)};
 868
 869    for (int i = 0; i < 2; i++) {
 870       if (uses_ub & (1 << i)) {
 871          uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
 872          if (src_ub <= 0xffff)
 873             op[i].set16bit(true);
 874          else if (src_ub <= 0xffffff)
 875             op[i].set24bit(true);
 876       }
 877    }
 878
 879    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 880       assert(dst.size() == 1);
 881       Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
 882       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 883    } else {
 884       if (nuw) {
 885          bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
 886       } else {
 887          bld.vop2(opc, Definition(dst), op[0], op[1]);
 888       }
 889    }
 890 }
 891
 892 void
 893 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
 894 {
 895    Builder bld(ctx->program, ctx->block);
 896    bld.is_precise = instr->exact;
 897
 898    Temp src0 = get_alu_src(ctx, instr->src[0]);
 899    Temp src1 = get_alu_src(ctx, instr->src[1]);
 900
 901    if (src1.type() == RegType::sgpr) {
 902       assert(src0.type() == RegType::vgpr);
 903       std::swap(src0, src1);
 904    }
 905
 906    Temp src00 = bld.tmp(src0.type(), 1);
 907    Temp src01 = bld.tmp(src0.type(), 1);
 908    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 909    Temp src10 = bld.tmp(v1);
 910    Temp src11 = bld.tmp(v1);
 911    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 912    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
 913    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
 914    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
 915 }
 916
 917 void
 918 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 919                        bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
 920 {
 921    assert(num_sources == 2 || num_sources == 3);
 922    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 923    bool has_sgpr = false;
 924    for (unsigned i = 0; i < num_sources; i++) {
 925       src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
 926       if (has_sgpr)
 927          src[i] = as_vgpr(ctx, src[i]);
 928       else
 929          has_sgpr = src[i].type() == RegType::sgpr;
 930    }
 931
 932    Builder bld(ctx->program, ctx->block);
 933    bld.is_precise = instr->exact;
 934    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 935       Temp tmp;
 936       if (num_sources == 3)
 937          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
 938       else
 939          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
 940       if (dst.size() == 1)
 941          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 942       else
 943          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
 944    } else if (num_sources == 3) {
 945       bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
 946    } else {
 947       bld.vop3(op, Definition(dst), src[0], src[1]);
 948    }
 949 }
 950
 951 Builder::Result
 952 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 953                        bool swap_srcs = false)
 954 {
 955    Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
 956    Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
 957    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 958       src1 = as_vgpr(ctx, src1);
 959    assert(instr->def.num_components == 2);
 960
 961    /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
 962    unsigned opsel_lo =
 963       (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
 964    unsigned opsel_hi =
 965       (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
 966
 967    Builder bld(ctx->program, ctx->block);
 968    bld.is_precise = instr->exact;
 969    Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
 970    return res;
 971 }
 972
 973 void
 974 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
 975                       unsigned neg_lo = 0)
 976 {
 977    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 978    bool has_sgpr = false;
 979    for (unsigned i = 0; i < 3; i++) {
 980       src[i] = get_alu_src(ctx, instr->src[i]);
 981       if (has_sgpr)
 982          src[i] = as_vgpr(ctx, src[i]);
 983       else
 984          has_sgpr = src[i].type() == RegType::sgpr;
 985    }
 986
 987    Builder bld(ctx->program, ctx->block);
 988    bld.is_precise = instr->exact;
 989    VALU_instruction& vop3p =
 990       bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
 991    vop3p.clamp = clamp;
 992    vop3p.neg_lo = neg_lo;
 993 }
 994
 995 void
 996 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
 997 {
 998    Builder bld(ctx->program, ctx->block);
 999    bld.is_precise = instr->exact;
1000    if (dst.type() == RegType::sgpr)
1001       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1002                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
1003    else
1004       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
1005 }
1006
1007 void
1008 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1009 {
1010    Temp src0 = get_alu_src(ctx, instr->src[0]);
1011    Temp src1 = get_alu_src(ctx, instr->src[1]);
1012    assert(src0.size() == src1.size());
1013
1014    aco_ptr<Instruction> vopc;
1015    if (src1.type() == RegType::sgpr) {
1016       if (src0.type() == RegType::vgpr) {
1017          /* to swap the operands, we might also have to change the opcode */
1018          switch (op) {
1019          case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
1020          case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
1021          case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
1022          case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
1023          case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
1024          case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
1025          case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
1026          case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
1027          case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
1028          case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1029          case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1030          case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1031          case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1032          case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1033          case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1034          case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1035          case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1036          case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1037          default: /* eq and ne are commutative */ break;
1038          }
1039          Temp t = src0;
1040          src0 = src1;
1041          src1 = t;
1042       } else {
1043          src1 = as_vgpr(ctx, src1);
1044       }
1045    }
1046
1047    Builder bld(ctx->program, ctx->block);
1048    bld.vopc(op, Definition(dst), src0, src1);
1049 }
1050
1051 void
1052 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1053 {
1054    Temp src0 = get_alu_src(ctx, instr->src[0]);
1055    Temp src1 = get_alu_src(ctx, instr->src[1]);
1056    Builder bld(ctx->program, ctx->block);
1057
1058    assert(dst.regClass() == bld.lm);
1059    assert(src0.type() == RegType::sgpr);
1060    assert(src1.type() == RegType::sgpr);
1061
1062    /* Emit the SALU comparison instruction */
1063    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1064    /* Turn the result into a per-lane bool */
1065    bool_to_vector_condition(ctx, cmp, dst);
1066 }
1067
1068 void
1069 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1070                 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1071                 aco_opcode s64_op = aco_opcode::num_opcodes)
1072 {
1073    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
1074                      : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1075                                                              : aco_opcode::num_opcodes;
1076    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
1077                      : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1078                                                              : v16_op;
1079    bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
1080                    get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1081                    get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1082    aco_opcode op = use_valu ? v_op : s_op;
1083    assert(op != aco_opcode::num_opcodes);
1084    assert(dst.regClass() == ctx->program->lane_mask);
1085
1086    if (use_valu)
1087       emit_vopc_instruction(ctx, instr, op, dst);
1088    else
1089       emit_sopc_instruction(ctx, instr, op, dst);
1090 }
1091
1092 void
1093 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1094                    Temp dst)
1095 {
1096    Builder bld(ctx->program, ctx->block);
1097    Temp src0 = get_alu_src(ctx, instr->src[0]);
1098    Temp src1 = get_alu_src(ctx, instr->src[1]);
1099
1100    assert(dst.regClass() == bld.lm);
1101    assert(src0.regClass() == bld.lm);
1102    assert(src1.regClass() == bld.lm);
1103
1104    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1105 }
1106
1107 void
1108 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1109 {
1110    Builder bld(ctx->program, ctx->block);
1111    Temp cond = get_alu_src(ctx, instr->src[0]);
1112    Temp then = get_alu_src(ctx, instr->src[1]);
1113    Temp els = get_alu_src(ctx, instr->src[2]);
1114
1115    assert(cond.regClass() == bld.lm);
1116
1117    if (dst.type() == RegType::vgpr) {
1118       aco_ptr<Instruction> bcsel;
1119       if (dst.size() == 1) {
1120          then = as_vgpr(ctx, then);
1121          els = as_vgpr(ctx, els);
1122
1123          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1124       } else if (dst.size() == 2) {
1125          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1126          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1127          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1128          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1129
1130          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1131          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1132
1133          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1134       } else {
1135          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1136       }
1137       return;
1138    }
1139
1140    if (instr->def.bit_size == 1) {
1141       assert(dst.regClass() == bld.lm);
1142       assert(then.regClass() == bld.lm);
1143       assert(els.regClass() == bld.lm);
1144    }
1145
1146    if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1147       if (dst.regClass() == s1 || dst.regClass() == s2) {
1148          assert((then.regClass() == s1 || then.regClass() == s2) &&
1149                 els.regClass() == then.regClass());
1150          assert(dst.size() == then.size());
1151          aco_opcode op =
1152             dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1153          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1154       } else {
1155          isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1156       }
1157       return;
1158    }
1159
1160    /* divergent boolean bcsel
1161     * this implements bcsel on bools: dst = s0 ? s1 : s2
1162     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
1163    assert(instr->def.bit_size == 1);
1164
1165    if (cond.id() != then.id())
1166       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1167
1168    if (cond.id() == els.id())
1169       bld.copy(Definition(dst), then);
1170    else
1171       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1172                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1173 }
1174
1175 void
1176 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1177                uint32_t undo)
1178 {
1179    /* multiply by 16777216 to handle denormals */
1180    Temp is_denormal = bld.tmp(bld.lm);
1181    VALU_instruction& valu =
1182       bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), val, Operand::c32(1u << 4))
1183          ->valu();
1184    valu.neg[0] = true;
1185    valu.abs[0] = true;
1186    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1187    scaled = bld.vop1(op, bld.def(v1), scaled);
1188    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1189
1190    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1191
1192    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1193 }
1194
1195 void
1196 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1197 {
1198    if (ctx->block->fp_mode.denorm32 == 0) {
1199       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1200       return;
1201    }
1202
1203    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1204 }
1205
1206 void
1207 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1208 {
1209    if (ctx->block->fp_mode.denorm32 == 0) {
1210       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1211       return;
1212    }
1213
1214    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1215 }
1216
1217 void
1218 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1219 {
1220    if (ctx->block->fp_mode.denorm32 == 0) {
1221       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1222       return;
1223    }
1224
1225    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1226 }
1227
1228 void
1229 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1230 {
1231    if (ctx->block->fp_mode.denorm32 == 0) {
1232       bld.vop1(aco_opcode::v_log_f32, dst, val);
1233       return;
1234    }
1235
1236    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1237 }
1238
1239 Temp
1240 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1241 {
1242    if (ctx->options->gfx_level >= GFX7)
1243       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1244
1245    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1246    /* TODO: create more efficient code! */
1247    if (val.type() == RegType::sgpr)
1248       val = as_vgpr(ctx, val);
1249
1250    /* Split the input value. */
1251    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1252    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1253
1254    /* Extract the exponent and compute the unbiased value. */
1255    Temp exponent =
1256       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1257    exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1258
1259    /* Extract the fractional part. */
1260    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1261                                 Operand::c32(0x000fffffu));
1262    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1263
1264    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1265    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1266               fract_mask);
1267
1268    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1269    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1270    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1271    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1272    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1273
1274    /* Get the sign bit. */
1275    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1276
1277    /* Decide the operation to apply depending on the unbiased exponent. */
1278    Temp exp_lt0 =
1279       bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1280    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1281                           bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1282    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1283    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1284    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1285    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1286
1287    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1288 }
1289
1290 Temp
1291 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1292 {
1293    if (ctx->options->gfx_level >= GFX7)
1294       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1295
1296    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1297     * lowered at NIR level for precision reasons). */
1298    Temp src0 = as_vgpr(ctx, val);
1299
1300    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1301                              Operand::c32(0x3fefffffu));
1302
1303    Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1304    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1305    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1306
1307    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1308    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1309    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1310    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1311
1312    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1313    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1314
1315    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1316
1317    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1318    add->valu().neg[1] = true;
1319
1320    return add->definitions[0].getTemp();
1321 }
1322
1323 Temp
1324 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1325 {
1326    if (bld.program->gfx_level < GFX8) {
1327       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1328       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1329                           add.def(1).getTemp());
1330    }
1331
1332    Builder::Result add(NULL);
1333    if (bld.program->gfx_level >= GFX9) {
1334       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1335    } else {
1336       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1337    }
1338    add->valu().clamp = 1;
1339    return dst.getTemp();
1340 }
1341
1342 Temp
1343 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1344 {
1345    if (bld.program->gfx_level < GFX8) {
1346       Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1347       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1348                           sub.def(1).getTemp());
1349    }
1350
1351    Builder::Result sub(NULL);
1352    if (bld.program->gfx_level >= GFX9) {
1353       sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1354    } else {
1355       sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1356    }
1357    sub->valu().clamp = 1;
1358    return dst.getTemp();
1359 }
1360
1361 void
1362 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1363 {
1364    Builder bld(ctx->program, ctx->block);
1365    bld.is_precise = instr->exact;
1366    Temp dst = get_ssa_temp(ctx, &instr->def);
1367    switch (instr->op) {
1368    case nir_op_vec2:
1369    case nir_op_vec3:
1370    case nir_op_vec4:
1371    case nir_op_vec5:
1372    case nir_op_vec8:
1373    case nir_op_vec16: {
1374       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1375       unsigned num = instr->def.num_components;
1376       for (unsigned i = 0; i < num; ++i)
1377          elems[i] = get_alu_src(ctx, instr->src[i]);
1378
1379       if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1380          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1381             aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
1382          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
1383          for (unsigned i = 0; i < num; ++i) {
1384             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1385                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1386             vec->operands[i] = Operand{elems[i]};
1387          }
1388          vec->definitions[0] = Definition(dst);
1389          ctx->block->instructions.emplace_back(std::move(vec));
1390          ctx->allocated_vec.emplace(dst.id(), elems);
1391       } else {
1392          bool use_s_pack = ctx->program->gfx_level >= GFX9;
1393          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1394
1395          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1396          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1397          for (unsigned i = 0; i < num; i++) {
1398             unsigned packed_size = use_s_pack ? 16 : 32;
1399             unsigned idx = i * instr->def.bit_size / packed_size;
1400             unsigned offset = i * instr->def.bit_size % packed_size;
1401             if (nir_src_is_const(instr->src[i].src)) {
1402                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1403                continue;
1404             }
1405             if (nir_src_is_undef(instr->src[i].src))
1406                continue;
1407
1408             if (offset != packed_size - instr->def.bit_size)
1409                elems[i] =
1410                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1411
1412             if (offset)
1413                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1414                                    Operand::c32(offset));
1415
1416             if (packed[idx].id())
1417                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1418                                       packed[idx]);
1419             else
1420                packed[idx] = elems[i];
1421          }
1422
1423          if (use_s_pack) {
1424             for (unsigned i = 0; i < dst.size(); i++) {
1425                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1426
1427                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1428                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1429                                        packed[i * 2 + 1]);
1430                else if (packed[i * 2 + 1].id())
1431                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1432                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1433                else if (packed[i * 2].id())
1434                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1435                                        Operand::c32(const_vals[i * 2 + 1]));
1436                else
1437                   packed[i] = Temp(); /* Both constants, so reset the entry */
1438
1439                if (same)
1440                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1441                else
1442                   const_vals[i] = 0;
1443             }
1444          }
1445
1446          for (unsigned i = 0; i < dst.size(); i++) {
1447             if (const_vals[i] && packed[i].id())
1448                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1449                                     Operand::c32(const_vals[i]), packed[i]);
1450             else if (!packed[i].id())
1451                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1452          }
1453
1454          if (dst.size() == 1)
1455             bld.copy(Definition(dst), packed[0]);
1456          else {
1457             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1458                aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1459             vec->definitions[0] = Definition(dst);
1460             for (unsigned i = 0; i < dst.size(); ++i)
1461                vec->operands[i] = Operand(packed[i]);
1462             bld.insert(std::move(vec));
1463          }
1464       }
1465       break;
1466    }
1467    case nir_op_mov: {
1468       Temp src = get_alu_src(ctx, instr->src[0]);
1469       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1470          /* use size() instead of bytes() for 8/16-bit */
1471          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1472          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1473       } else {
1474          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1475          bld.copy(Definition(dst), src);
1476       }
1477       break;
1478    }
1479    case nir_op_inot: {
1480       Temp src = get_alu_src(ctx, instr->src[0]);
1481       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1482          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1483       } else if (dst.regClass() == v2) {
1484          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1485          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1486          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1487          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1488          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1489       } else if (dst.type() == RegType::sgpr) {
1490          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1491          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1492       } else {
1493          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1494       }
1495       break;
1496    }
1497    case nir_op_iabs: {
           /* Integer absolute value. The s1 path uses s_abs_i32 directly; every VALU path emits
            * max(src, 0 - src) with the opcode pair that matches dst's register class. */
1498       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* 16-bit def in a v1 dst: packed (v_pk_*) path. Bit 1 of opsel_lo/opsel_hi is taken
               * from bit 0 of swizzle[0]/swizzle[1]; bit 0 of opsel_hi is always set. Presumably
               * bit i selects the 16-bit half read from operand i, so operand 0 (the zero
               * constant, then `sub`) uses its natural lo/hi halves -- TODO confirm. */
1499          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1500
1501          unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1502          unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1503
1504          Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1505                               src, opsel_lo, opsel_hi);
1506          bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1507          break;
1508       }
1509       Temp src = get_alu_src(ctx, instr->src[0]);
1510       if (dst.regClass() == s1) {
1511          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1512       } else if (dst.regClass() == v1) {
1513          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1514                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1515       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
              /* GFX10+: the 16-bit max/sub are emitted as VOP3 with the _e64 opcodes. */
1516          bld.vop3(
1517             aco_opcode::v_max_i16_e64, Definition(dst), src,
1518             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1519       } else if (dst.regClass() == v2b) {
              /* Older levels use the VOP2 forms; src is moved to a VGPR first. */
1520          src = as_vgpr(ctx, src);
1521          bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1522                   bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1523       } else {
1524          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1525       }
1526       break;
1527    }
1528    case nir_op_isign: {
           /* Integer sign. The 32-bit and 16-bit paths clamp src between the constants -1 and 1
            * (min/max pair or a single med3); the 64-bit paths build the result from the sign
            * bits and a comparison against zero. */
1529       Temp src = get_alu_src(ctx, instr->src[0]);
1530       if (dst.regClass() == s1) {
              /* min(max(src, -1), 1) */
1531          Temp tmp =
1532             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1533          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1534       } else if (dst.regClass() == s2) {
              /* neg = src >> 63 (arithmetic); neqz is an scc result: from s_cmp_lg_u64 against
               * zero on GFX8+, otherwise the scc definition of s_or_b64(src, 0). dst = neg | neqz. */
1535          Temp neg =
1536             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1537          Temp neqz;
1538          if (ctx->program->gfx_level >= GFX8)
1539             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1540          else
1541             neqz =
1542                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1543                   .def(1)
1544                   .getTemp();
1545          /* SCC gets zero-extended to 64 bit */
1546          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1547       } else if (dst.regClass() == v1) {
1548          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1549       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1550          bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1551       } else if (dst.regClass() == v2b) {
              /* Before GFX9: max(-1, min(1, src)) with VOP2 opcodes, src forced into a VGPR.
               * NOTE(review): the intermediate temp is v1 here while nir_op_iabs uses v2b for its
               * analogous 16-bit temp -- confirm this is intended. */
1552          src = as_vgpr(ctx, src);
1553          bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1554                   bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1555       } else if (dst.regClass() == v2) {
              /* neg is the upper dword shifted right arithmetically by 31. `gtz` is the result of
               * v_cmp_ge_i64 with operands (0, src), and it selects between `neg` and the
               * constants 1 (lower dword) / 0 (upper dword).
               * NOTE(review): despite its name, `gtz` looks like the condition `0 >= src`;
               * confirm the operand order of the compare and of v_cndmask_b32. */
1556          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1557          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1558          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1559          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1560          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1561          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1562       } else {
1563          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1564       }
1565       break;
1566    }
1567    case nir_op_imax: {
           /* Signed integer maximum. This case only picks the opcode for dst's register class
            * and leaves emission to the emit_* helpers: v2b uses v_max_i16 (the _e64 opcode on
            * GFX10+), a v1 dst with a 16-bit def uses the packed v_pk_max_i16, plain v1 uses
            * v_max_i32 and s1 uses s_max_i32. */
1568       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1569          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1570       } else if (dst.regClass() == v2b) {
1571          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1572       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1573          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1574       } else if (dst.regClass() == v1) {
1575          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1576       } else if (dst.regClass() == s1) {
1577          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1578       } else {
1579          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1580       }
1581       break;
1582    }
1583    case nir_op_umax: {
           /* Unsigned integer maximum. Same dispatch as the other min/max cases: v2b uses
            * v_max_u16 (the _e64 opcode on GFX10+), a v1 dst with a 16-bit def uses the packed
            * v_pk_max_u16, plain v1 uses v_max_u32 and s1 uses s_max_u32. */
1584       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1585          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1586       } else if (dst.regClass() == v2b) {
1587          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1588       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1589          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1590       } else if (dst.regClass() == v1) {
1591          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1592       } else if (dst.regClass() == s1) {
1593          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1594       } else {
1595          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1596       }
1597       break;
1598    }
1599    case nir_op_imin: {
           /* Signed integer minimum. Same dispatch as the other min/max cases: v2b uses
            * v_min_i16 (the _e64 opcode on GFX10+), a v1 dst with a 16-bit def uses the packed
            * v_pk_min_i16, plain v1 uses v_min_i32 and s1 uses s_min_i32. */
1600       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1601          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1602       } else if (dst.regClass() == v2b) {
1603          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1604       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1605          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1606       } else if (dst.regClass() == v1) {
1607          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1608       } else if (dst.regClass() == s1) {
1609          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1610       } else {
1611          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1612       }
1613       break;
1614    }
1615    case nir_op_umin: {
           /* Unsigned integer minimum. Same dispatch as the other min/max cases: v2b uses
            * v_min_u16 (the _e64 opcode on GFX10+), a v1 dst with a 16-bit def uses the packed
            * v_pk_min_u16, plain v1 uses v_min_u32 and s1 uses s_min_u32. */
1616       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1617          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1618       } else if (dst.regClass() == v2b) {
1619          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1620       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1621          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1622       } else if (dst.regClass() == v1) {
1623          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1624       } else if (dst.regClass() == s1) {
1625          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1626       } else {
1627          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1628       }
1629       break;
1630    }
1631    case nir_op_ior: {
           /* Bitwise OR. 1-bit defs go through emit_boolean_logic() with Builder::s_or. Otherwise
            * the opcode follows dst's register class: v_or_b32 for v1/v2b/v1b, the logic64
            * helper (also given v_or_b32) for v2, and s_or_b32 / s_or_b64 for s1 / s2. */
1632       if (instr->def.bit_size == 1) {
1633          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1634       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1635          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1636       } else if (dst.regClass() == v2) {
1637          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1638       } else if (dst.regClass() == s1) {
1639          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1640       } else if (dst.regClass() == s2) {
1641          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1642       } else {
1643          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1644       }
1645       break;
1646    }
1647    case nir_op_iand: {
           /* Bitwise AND. 1-bit defs go through emit_boolean_logic() with Builder::s_and.
            * Otherwise the opcode follows dst's register class: v_and_b32 for v1/v2b/v1b, the
            * logic64 helper (also given v_and_b32) for v2, and s_and_b32 / s_and_b64 for s1 / s2. */
1648       if (instr->def.bit_size == 1) {
1649          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1650       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1651          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1652       } else if (dst.regClass() == v2) {
1653          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1654       } else if (dst.regClass() == s1) {
1655          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1656       } else if (dst.regClass() == s2) {
1657          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1658       } else {
1659          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1660       }
1661       break;
1662    }
1663    case nir_op_ixor: {
           /* Bitwise XOR. 1-bit defs go through emit_boolean_logic() with Builder::s_xor.
            * Otherwise the opcode follows dst's register class: v_xor_b32 for v1/v2b/v1b, the
            * logic64 helper (also given v_xor_b32) for v2, and s_xor_b32 / s_xor_b64 for s1 / s2. */
1664       if (instr->def.bit_size == 1) {
1665          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1666       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1667          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1668       } else if (dst.regClass() == v2) {
1669          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1670       } else if (dst.regClass() == s1) {
1671          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1672       } else if (dst.regClass() == s2) {
1673          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1674       } else {
1675          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1676       }
1677       break;
1678    }
1679    case nir_op_ushr: {
           /* Logical shift right. The 16-bit and 32-bit VALU paths use the "rev" opcodes
            * (v_lshrrev_*) through the emit_* helpers; the extra boolean/integer arguments are
            * helper options whose meaning is not visible in this case -- see the helpers. */
1680       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1681          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1682       } else if (dst.regClass() == v2b) {
1683          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1684       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1685          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1686       } else if (dst.regClass() == v1) {
1687          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1688       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* GFX8+: v_lshrrev_b64 is built by hand with the NIR sources swapped, i.e. src[1]
               * is passed first and src[0] second. */
1689          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1690                   get_alu_src(ctx, instr->src[0]));
1691       } else if (dst.regClass() == v2) {
              /* Older levels use v_lshr_b64 with the sources in NIR order. */
1692          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1693       } else if (dst.regClass() == s2) {
1694          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1695       } else if (dst.regClass() == s1) {
1696          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1697       } else {
1698          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1699       }
1700       break;
1701    }
1702    case nir_op_ishl: {
           /* Shift left. Mirrors nir_op_ushr/nir_op_ishr with the v_lshlrev_* / s_lshl_* opcodes.
            * NOTE(review): unlike the right shifts, the v1 and s1 paths pass extra trailing
            * arguments (`false, false, 2` and `1`) to the helpers; their meaning is defined by
            * emit_vop2_instruction()/emit_sop2_instruction(), which are not visible here. */
1703       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1704          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1705       } else if (dst.regClass() == v2b) {
1706          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1707       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1708          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1709       } else if (dst.regClass() == v1) {
1710          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1711                                false, 2);
1712       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* GFX8+: v_lshlrev_b64 is built by hand with the NIR sources swapped, i.e. src[1]
               * is passed first and src[0] second. */
1713          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1714                   get_alu_src(ctx, instr->src[0]));
1715       } else if (dst.regClass() == v2) {
              /* Older levels use v_lshl_b64 with the sources in NIR order. */
1716          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1717       } else if (dst.regClass() == s1) {
1718          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1719       } else if (dst.regClass() == s2) {
1720          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1721       } else {
1722          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1723       }
1724       break;
1725    }
1726    case nir_op_ishr: {
           /* Arithmetic shift right. Same structure as nir_op_ushr, using the v_ashrrev_* /
            * s_ashr_* opcodes. The extra helper arguments are helper options whose meaning is
            * not visible in this case. */
1727       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1728          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1729       } else if (dst.regClass() == v2b) {
1730          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1731       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1732          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1733       } else if (dst.regClass() == v1) {
1734          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1735       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* GFX8+: v_ashrrev_i64 is built by hand with the NIR sources swapped, i.e. src[1]
               * is passed first and src[0] second. */
1736          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1737                   get_alu_src(ctx, instr->src[0]));
1738       } else if (dst.regClass() == v2) {
              /* Older levels use v_ashr_i64 with the sources in NIR order. */
1739          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1740       } else if (dst.regClass() == s1) {
1741          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1742       } else if (dst.regClass() == s2) {
1743          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1744       } else {
1745          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1746       }
1747       break;
1748    }
1749    case nir_op_find_lsb: {
           /* Find the least significant set bit. Dispatch is on the *source* register class:
            * s1/s2 use s_ff1_i32_b32/_b64, v1 uses v_ffbl_b32, and v2 combines two 32-bit
            * searches. */
1750       Temp src = get_alu_src(ctx, instr->src[0]);
1751       if (src.regClass() == s1) {
1752          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1753       } else if (src.regClass() == v1) {
1754          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1755       } else if (src.regClass() == s2) {
1756          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1757       } else if (src.regClass() == v2) {
              /* Search each dword, offset the upper result by 32 with a saturating add, then take
               * the unsigned minimum of both. Presumably the saturation keeps a "no bit found"
               * result of the upper dword from wrapping around -- TODO confirm against the
               * v_ffbl_b32 definition. */
1758          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1759          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1760          lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
1761          hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
1762          hi = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)), hi);
1763          bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
1764       } else {
1765          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1766       }
1767       break;
1768    }
1769    case nir_op_ufind_msb:
1770    case nir_op_ifind_msb: {
           /* Find the most significant bit (unsigned / signed variant). Every path first computes
            * `msb_rev` with an s_flbit_* / v_ffbh_* instruction and then subtracts it from
            * (source bit width - 1). The carry-out of that subtraction picks between the
            * difference and a fallback value (-1 on the scalar path, msb_rev on the vector
            * paths). Presumably the fallback covers the "no bit found" result of the search
            * instruction -- TODO confirm against the ISA definition. */
1771       Temp src = get_alu_src(ctx, instr->src[0]);
1772       if (src.regClass() == s1 || src.regClass() == s2) {
              /* Opcode is chosen by source width (s1/s2) and by signedness of the NIR opcode. */
1773          aco_opcode op = src.regClass() == s2
1774                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1775                                                              : aco_opcode::s_flbit_i32_i64)
1776                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1777                                                              : aco_opcode::s_flbit_i32);
1778          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1779
              /* (src.size() * 32 - 1) - msb_rev; definition 1 of the subtraction is its scc. */
1780          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1781                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1782          Temp msb = sub.def(0).getTemp();
1783          Temp carry = sub.def(1).getTemp();
1784
1785          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1786                   bld.scc(carry));
1787       } else if (src.regClass() == v1) {
1788          aco_opcode op =
1789             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1790          Temp msb_rev = bld.tmp(v1);
1791          emit_vop1_instruction(ctx, instr, op, msb_rev);
1792          Temp msb = bld.tmp(v1);
              /* 31 - msb_rev, with the carry-out requested (`true`) and read from definition 1. */
1793          Temp carry =
1794             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1795          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1796       } else if (src.regClass() == v2) {
1797          aco_opcode op =
1798             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1799
1800          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1801          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1802
              /* Lower dword result is offset by 32 (saturating); the upper dword result is used
               * whenever it differs from -1, otherwise the offset lower result is used.
               * NOTE(review): for nir_op_ifind_msb the signed opcode is applied to the lower
               * dword as well -- confirm whether 64-bit signed sources can reach this path. */
1803          lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1804                          bld.vop1(op, bld.def(v1), lo));
1805          hi = bld.vop1(op, bld.def(v1), hi);
1806          Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1807
1808          Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1809
1810          Temp msb = bld.tmp(v1);
              /* 63 - msb_rev, again keeping the carry-out for the final select. */
1811          Temp carry =
1812             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1813          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1814       } else {
1815          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1816       }
1817       break;
1818    }
1819    case nir_op_ufind_msb_rev:
1820    case nir_op_ifind_msb_rev: {
           /* "Reversed" find-MSB: the result of the search instruction is written to dst as is,
            * without the subtraction done for nir_op_ufind_msb/nir_op_ifind_msb. Only s1 and v1
            * sources are handled; the opcode depends on the signedness of the NIR opcode. */
1821       Temp src = get_alu_src(ctx, instr->src[0]);
1822       if (src.regClass() == s1) {
1823          aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1824                                                            : aco_opcode::s_flbit_i32;
1825          bld.sop1(op, Definition(dst), src);
1826       } else if (src.regClass() == v1) {
1827          aco_opcode op =
1828             instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1829          emit_vop1_instruction(ctx, instr, op, dst);
1830       } else {
1831          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1832       }
1833       break;
1834    }
1835    case nir_op_bitfield_reverse: {
           /* Bit reversal of a 32-bit value: s_brev_b32 for an s1 dst, v_bfrev_b32 for a v1 dst.
            * Other register classes are reported through isel_err(). */
1836       if (dst.regClass() == s1) {
1837          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1838       } else if (dst.regClass() == v1) {
1839          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1840       } else {
1841          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1842       }
1843       break;
1844    }
1845    case nir_op_iadd: {
           /* Integer addition. The first if-chain covers everything that can be handed to an
            * emit_* helper: s1, destinations of at most two bytes on GFX8+ (the _e64 opcode on
            * GFX10+), and the packed 16-bit form in a v1 dst. */
1846       if (dst.regClass() == s1) {
1847          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1848          break;
1849       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1850          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1851          break;
1852       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1853          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1854          break;
1855       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1856          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1857          break;
1858       }
1859
1860       Temp src0 = get_alu_src(ctx, instr->src[0]);
1861       Temp src1 = get_alu_src(ctx, instr->src[1]);
           /* Remaining VGPR destinations of up to four bytes use a 32-bit add; the builder is
            * put in nuw() mode first when the NIR instruction has no_unsigned_wrap set. */
1862       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1863          if (instr->no_unsigned_wrap)
1864             bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1865          else
1866             bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1867          break;
1868       }
1869
           /* 64-bit: split both sources into dwords. The low halves keep the register type of
            * their source, while the high halves are allocated with dst's register type. */
1870       assert(src0.size() == 2 && src1.size() == 2);
1871       Temp src00 = bld.tmp(src0.type(), 1);
1872       Temp src01 = bld.tmp(dst.type(), 1);
1873       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1874       Temp src10 = bld.tmp(src1.type(), 1);
1875       Temp src11 = bld.tmp(dst.type(), 1);
1876       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1877
1878       if (dst.regClass() == s2) {
              /* Low add writes its carry to scc; s_addc_u32 consumes it for the high dword. */
1879          Temp carry = bld.tmp(s1);
1880          Temp dst0 =
1881             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1882          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1883                               bld.scc(carry));
1884          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1885       } else if (dst.regClass() == v2) {
              /* Low add requests a carry-out (`true`); the high add takes it as carry-in and
               * does not produce one itself (`false`). */
1886          Temp dst0 = bld.tmp(v1);
1887          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1888          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1889          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1890       } else {
1891          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1892       }
1893       break;
1894    }
1895    case nir_op_uadd_sat: {
           /* Unsigned saturating addition. Where possible the add is emitted with the VALU clamp
            * bit set; otherwise the carry-out of a regular add selects between the sum and an
            * all-ones constant. */
1896       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed 16-bit add with clamp. */
1897          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1898          add_instr->valu().clamp = 1;
1899          break;
1900       }
1901       Temp src0 = get_alu_src(ctx, instr->src[0]);
1902       Temp src1 = get_alu_src(ctx, instr->src[1]);
1903       if (dst.regClass() == s1) {
              /* s_add_u32 puts its carry in scc; s_cselect_b32 then chooses between -1 and tmp. */
1904          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1905          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1906          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1907                   bld.scc(carry));
1908          break;
1909       } else if (dst.regClass() == v2b) {
1910          Instruction* add_instr;
1911          if (ctx->program->gfx_level >= GFX10) {
1912             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1913          } else {
                 /* Move an SGPR second source into the first slot, then force the (new) second
                  * source into a VGPR before emitting v_add_u16 through vop2_e64(). */
1914             if (src1.type() == RegType::sgpr)
1915                std::swap(src0, src1);
1916             add_instr =
1917                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1918          }
1919          add_instr->valu().clamp = 1;
1920          break;
1921       } else if (dst.regClass() == v1) {
1922          uadd32_sat(bld, Definition(dst), src0, src1);
1923          break;
1924       }
1925
           /* 64-bit: split both sources into dwords of their own register type. */
1926       assert(src0.size() == 2 && src1.size() == 2);
1927
1928       Temp src00 = bld.tmp(src0.type(), 1);
1929       Temp src01 = bld.tmp(src0.type(), 1);
1930       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1931       Temp src10 = bld.tmp(src1.type(), 1);
1932       Temp src11 = bld.tmp(src1.type(), 1);
1933       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1934
1935       if (dst.regClass() == s2) {
              /* add + addc produce the unsaturated sum; the carry of the high add (carry1)
               * drives s_cselect_b64 between the 64-bit -1 constant and that sum. */
1936          Temp carry0 = bld.tmp(s1);
1937          Temp carry1 = bld.tmp(s1);
1938
1939          Temp no_sat0 =
1940             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1941          Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1942                                  src01, src11, bld.scc(carry0));
1943
1944          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1945
1946          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1947                   bld.scc(carry1));
1948       } else if (dst.regClass() == v2) {
1949          Temp no_sat0 = bld.tmp(v1);
1950          Temp dst0 = bld.tmp(v1);
1951          Temp dst1 = bld.tmp(v1);
1952
1953          Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1954          Temp carry1;
1955
1956          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: the high dword is saturated by the add-with-carry itself (clamp = 1),
                  * which also yields carry1. */
1957             carry1 = bld.tmp(bld.lm);
1958             bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1959                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1960                ->valu()
1961                .clamp = 1;
1962          } else {
                 /* Older levels: plain add-with-carry, then select against -1 using carry1. */
1963             Temp no_sat1 = bld.tmp(v1);
1964             carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1965             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1966                          carry1);
1967          }
1968
              /* The low dword is always selected against -1 with the high add's carry. */
1969          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1970                       carry1);
1971          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1972       } else {
1973          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1974       }
1975       break;
1976    }
1977    case nir_op_iadd_sat: {
           /* Signed saturating addition. VALU destinations use an add with the clamp bit set;
            * the s1 destination computes the saturation bound by hand. */
1978       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed 16-bit add with clamp. */
1979          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1980          add_instr->valu().clamp = 1;
1981          break;
1982       }
1983       Temp src0 = get_alu_src(ctx, instr->src[0]);
1984       Temp src1 = get_alu_src(ctx, instr->src[1]);
1985       if (dst.regClass() == s1) {
              /* cond = (src1 < 0) as an scc value; bound = INT32_MAX + cond via an unsigned add.
               * `overflow` is the scc result of s_add_i32 and drives the final select between
               * bound and the plain sum. */
1986          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1987          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1988                                Operand::c32(INT32_MAX), cond);
1989          Temp overflow = bld.tmp(s1);
1990          Temp add =
1991             bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1992          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1993          break;
1994       }
1995
           /* All remaining paths are VALU; the second source is forced into a VGPR. */
1996       src1 = as_vgpr(ctx, src1);
1997
1998       if (dst.regClass() == v2b) {
1999          Instruction* add_instr =
2000             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
2001          add_instr->valu().clamp = 1;
2002       } else if (dst.regClass() == v1) {
2003          Instruction* add_instr =
2004             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
2005          add_instr->valu().clamp = 1;
2006       } else {
2007          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2008       }
2009       break;
2010    }
2011    case nir_op_uadd_carry: {
           /* Carry-out of an unsigned addition. The sum itself is written to a throw-away
            * definition; only the carry ends up in dst. */
2012       Temp src0 = get_alu_src(ctx, instr->src[0]);
2013       Temp src1 = get_alu_src(ctx, instr->src[1]);
2014       if (dst.regClass() == s1) {
              /* dst is used directly as the scc definition of s_add_u32. */
2015          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2016          break;
2017       }
2018       if (dst.regClass() == v1) {
              /* Turn the carry mask into a 0/1 value per lane with v_cndmask_b32. */
2019          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
2020          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2021                       carry);
2022          break;
2023       }
2024
           /* 64-bit: split both sources into dwords. The low halves keep the register type of
            * their source, while the high halves are allocated with dst's register type.
            * NOTE(review): unlike nir_op_iadd there is no size assertion before the split. */
2025       Temp src00 = bld.tmp(src0.type(), 1);
2026       Temp src01 = bld.tmp(dst.type(), 1);
2027       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2028       Temp src10 = bld.tmp(src1.type(), 1);
2029       Temp src11 = bld.tmp(dst.type(), 1);
2030       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2031       if (dst.regClass() == s2) {
              /* Chain add/addc and keep the scc of the high add; dst = (carry, 0). */
2032          Temp carry = bld.tmp(s1);
2033          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
2034          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2035                           bld.scc(carry))
2036                     .def(1)
2037                     .getTemp();
2038          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2039       } else if (dst.regClass() == v2) {
              /* Same chain on the VALU; the final carry mask becomes 0/1 and is paired with a
               * zero upper dword. */
2040          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
2041          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
2042          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2043                               Operand::c32(1u), carry);
2044          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2045       } else {
2046          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2047       }
2048       break;
2049    }
2050    case nir_op_isub: {
           /* Integer subtraction. s1 and the packed 16-bit form are handed to emit_* helpers;
            * the other register classes are emitted by hand below. */
2051       if (dst.regClass() == s1) {
2052          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2053          break;
2054       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2055          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2056          break;
2057       }
2058
2059       Temp src0 = get_alu_src(ctx, instr->src[0]);
2060       Temp src1 = get_alu_src(ctx, instr->src[1]);
2061       if (dst.regClass() == v1) {
2062          bld.vsub32(Definition(dst), src0, src1);
2063          break;
2064       } else if (dst.bytes() <= 2) {
              /* Destinations of at most two bytes: GFX10+ uses the VOP3 _e64 opcode. Below that,
               * an SGPR src1 is handled with v_subrev_u16 and swapped operands (src0 forced to a
               * VGPR), GFX8+ uses v_sub_u16 with src1 forced to a VGPR, and anything older falls
               * back to the 32-bit subtract. */
2065          if (ctx->program->gfx_level >= GFX10)
2066             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
2067          else if (src1.type() == RegType::sgpr)
2068             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2069          else if (ctx->program->gfx_level >= GFX8)
2070             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2071          else
2072             bld.vsub32(Definition(dst), src0, src1);
2073          break;
2074       }
2075
           /* 64-bit: split both sources into dwords. The low halves keep the register type of
            * their source, while the high halves are allocated with dst's register type. */
2076       Temp src00 = bld.tmp(src0.type(), 1);
2077       Temp src01 = bld.tmp(dst.type(), 1);
2078       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2079       Temp src10 = bld.tmp(src1.type(), 1);
2080       Temp src11 = bld.tmp(dst.type(), 1);
2081       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2082       if (dst.regClass() == s2) {
              /* Low subtract writes its borrow to scc; s_subb_u32 consumes it for the high dword. */
2083          Temp borrow = bld.tmp(s1);
2084          Temp dst0 =
2085             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2086          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
2087                               bld.scc(borrow));
2088          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2089       } else if (dst.regClass() == v2) {
              /* Low subtract requests a borrow-out (`true`); the high subtract takes it as input
               * and does not produce one itself (`false`). */
2090          Temp lower = bld.tmp(v1);
2091          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
2092          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
2093          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2094       } else {
2095          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2096       }
2097       break;
2098    }
2099    case nir_op_usub_borrow: {
           /* nir_op_usub_borrow: every path emits an unsigned subtract on (src0, src1) and
            * stores the subtract's second output (SCC / `borrow` below) in dst instead of the
            * difference. The 64-bit paths build dst as the vector {borrow, 0}. */
2100       Temp src0 = get_alu_src(ctx, instr->src[0]);
2101       Temp src1 = get_alu_src(ctx, instr->src[1]);
2102       if (dst.regClass() == s1) {
              /* The difference goes to a fresh s1 definition; the SCC definition is dst itself. */
2103          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2104          break;
2105       } else if (dst.regClass() == v1) {
              /* Definition 1 of the subtract is taken as the borrow; v_cndmask_b32 then picks
               * between the constants 0 and 1 with it to form the v1 result. */
2106          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
2107          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2108                       borrow);
2109          break;
2110       }
2111
           /* Remaining paths: split both sources into two size-1 halves. The second halves are
            * allocated with dst's register type rather than the source's. */
2112       Temp src00 = bld.tmp(src0.type(), 1);
2113       Temp src01 = bld.tmp(dst.type(), 1);
2114       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2115       Temp src10 = bld.tmp(src1.type(), 1);
2116       Temp src11 = bld.tmp(dst.type(), 1);
2117       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2118       if (dst.regClass() == s2) {
              /* s_sub_u32 on the first halves feeds its SCC result into s_subb_u32 on the second
               * halves; only the SCC definition (definition 1) of the second op is kept. */
2119          Temp borrow = bld.tmp(s1);
2120          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2121          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2122                            bld.scc(borrow))
2123                      .def(1)
2124                      .getTemp();
2125          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2126       } else if (dst.regClass() == v2) {
              /* Same chain built from two vsub32 calls; the final borrow is converted with
               * v_cndmask_b32 (constants 0 and 1) before being packed with a zero second half. */
2127          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2128          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2129          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2130                                Operand::c32(1u), borrow);
2131          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2132       } else {
2133          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2134       }
2135       break;
2136    }
2137    case nir_op_usub_sat: {
           /* nir_op_usub_sat: unsigned subtract whose result is either produced by an
            * instruction with the VALU clamp bit set, or replaced by the constant 0 through a
            * select keyed on the subtract's carry output. Which form is used depends on dst's
            * register class and on gfx_level. */
2138       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with 16-bit components: packed subtract with clamp. */
2139          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2140          sub_instr->valu().clamp = 1;
2141          break;
2142       }
2143       Temp src0 = get_alu_src(ctx, instr->src[0]);
2144       Temp src1 = get_alu_src(ctx, instr->src[1]);
2145       if (dst.regClass() == s1) {
              /* Scalar: raw difference in tmp, SCC in carry, then s_cselect_b32 between the
               * constant 0 and tmp using carry. */
2146          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2147          bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2148          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2149          break;
2150       } else if (dst.regClass() == v2b) {
2151          Instruction* sub_instr;
2152          if (ctx->program->gfx_level >= GFX10) {
2153             sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2154          } else {
                 /* When src1 is an SGPR, swap the operands and use the reversed opcode, so the
                  * operand passed through as_vgpr() below is the original src0. */
2155             aco_opcode op = aco_opcode::v_sub_u16;
2156             if (src1.type() == RegType::sgpr) {
2157                std::swap(src0, src1);
2158                op = aco_opcode::v_subrev_u16;
2159             }
2160             sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2161          }
2162          sub_instr->valu().clamp = 1;
2163          break;
2164       } else if (dst.regClass() == v1) {
              /* 32-bit VGPR result is delegated to the usub32_sat helper. */
2165          usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2166          break;
2167       }
2168
           /* Remaining paths require size-2 sources; split each into two size-1 halves. */
2169       assert(src0.size() == 2 && src1.size() == 2);
2170       Temp src00 = bld.tmp(src0.type(), 1);
2171       Temp src01 = bld.tmp(src0.type(), 1);
2172       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2173       Temp src10 = bld.tmp(src1.type(), 1);
2174       Temp src11 = bld.tmp(src1.type(), 1);
2175       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2176
2177       if (dst.regClass() == s2) {
              /* s_sub_u32 / s_subb_u32 chained through SCC produce the unsaturated halves; the
               * SCC result of the second op (carry1) then drives s_cselect_b64 between the
               * constant 0 and the recombined value. */
2178          Temp carry0 = bld.tmp(s1);
2179          Temp carry1 = bld.tmp(s1);
2180
2181          Temp no_sat0 =
2182             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2183          Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2184                                  src01, src11, bld.scc(carry0));
2185
2186          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2187
2188          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2189                   bld.scc(carry1));
2190       } else if (dst.regClass() == v2) {
2191          Temp no_sat0 = bld.tmp(v1);
2192          Temp dst0 = bld.tmp(v1);
2193          Temp dst1 = bld.tmp(v1);
2194
2195          Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2196          Temp carry1;
2197
2198          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: v_subb_co_u32 with the clamp bit set writes dst1 directly and also
                  * defines carry1. */
2199             carry1 = bld.tmp(bld.lm);
2200             bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2201                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2202                ->valu()
2203                .clamp = 1;
2204          } else {
                 /* Older levels: plain subtract-with-borrow into no_sat1, then v_cndmask_b32
                  * between no_sat1 and the constant 0 keyed on carry1. */
2205             Temp no_sat1 = bld.tmp(v1);
2206             carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2207             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2208                          carry1);
2209          }
2210
              /* The first half is always selected between no_sat0 and 0 using carry1. */
2211          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2212                       carry1);
2213          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2214       } else {
2215          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2216       }
2217       break;
2218    }
2219    case nir_op_isub_sat: {
           /* nir_op_isub_sat: signed subtract. The vector paths set the VALU clamp bit on the
            * subtract; the scalar path computes a bound value and selects it with SCC. */
2220       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with 16-bit components: packed subtract with clamp. */
2221          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2222          sub_instr->valu().clamp = 1;
2223          break;
2224       }
2225       Temp src0 = get_alu_src(ctx, instr->src[0]);
2226       Temp src1 = get_alu_src(ctx, instr->src[1]);
2227       if (dst.regClass() == s1) {
              /* cond is the SCC result of (src1 > 0); bound = INT32_MAX + cond via s_add_u32.
               * overflow is the SCC definition of s_sub_i32, and s_cselect_b32 chooses between
               * bound and the raw difference with it. */
2228          Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2229          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2230                                Operand::c32(INT32_MAX), cond);
2231          Temp overflow = bld.tmp(s1);
2232          Temp sub =
2233             bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2234          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2235          break;
2236       }
2237
           /* Both remaining paths pass src1 as a VGPR. */
2238       src1 = as_vgpr(ctx, src1);
2239
2240       if (dst.regClass() == v2b) {
2241          Instruction* sub_instr =
2242             bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2243          sub_instr->valu().clamp = 1;
2244       } else if (dst.regClass() == v1) {
2245          Instruction* sub_instr =
2246             bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2247          sub_instr->valu().clamp = 1;
2248       } else {
2249          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2250       }
2251       break;
2252    }
2253    case nir_op_imul: {
           /* nir_op_imul: pick a multiply opcode from dst's size/register class, gfx_level and,
            * for VGPR results, the upper bounds reported for the two sources. The branches are
            * tested in order, so later ones only see what earlier ones rejected. */
2254       if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2255          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2256       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2257          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2258       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2259          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2260       } else if (dst.type() == RegType::vgpr) {
2261          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2262          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2263
2264          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both bounds fit in 24 bits: use v_mul_u32_u24. nuw_16bit is set only when
                  * both bounds and their product are <= 0xffff; the product cannot wrap in
                  * uint32_t because it is evaluated only after both factors passed the
                  * <= 0xffff checks. */
2265             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2266             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2267                                   true /* commutative */, false, false, nuw_16bit);
2268          } else if (nir_src_is_const(instr->src[0].src)) {
                 /* Constant src0: multiply src1 by the immediate through bld.v_mul_imm(). */
2269             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2270                           nir_src_as_uint(instr->src[0].src), false);
2271          } else if (nir_src_is_const(instr->src[1].src)) {
                 /* Constant src1: same, with the roles of the sources exchanged. */
2272             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2273                           nir_src_as_uint(instr->src[1].src), false);
2274          } else {
2275             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2276          }
2277       } else if (dst.regClass() == s1) {
2278          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2279       } else {
2280          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2281       }
2282       break;
2283    }
2284    case nir_op_umul_high: {
           /* nir_op_umul_high: s_mul_hi_u32 for an s1 result on GFX9+, otherwise a VALU
            * multiply-high for any 4-byte result. */
2285       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2286          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2287       } else if (dst.bytes() == 4) {
2288          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2289          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2290
              /* An s1 result reaching this branch is computed into a v1 temporary first and
               * copied to dst with p_as_uniform below; a VGPR result is written directly. */
2291          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2292          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both source upper bounds fit in 24 bits: use the u24 variant. */
2293             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2294          } else {
2295             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2296          }
2297
2298          if (dst.regClass() == s1)
2299             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2300       } else {
2301          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302       }
2303       break;
2304    }
2305    case nir_op_imul_high: {
           /* nir_op_imul_high: v_mul_hi_i32 for a v1 result, s_mul_hi_i32 for an s1 result on
            * GFX9+. An s1 result below GFX9 is computed with v_mul_hi_i32 into a v1 temporary
            * (src1 forced to a VGPR) and copied to dst with p_as_uniform. */
2306       if (dst.regClass() == v1) {
2307          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2308       } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2309          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2310       } else if (dst.regClass() == s1) {
2311          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2312                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2313          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2314       } else {
2315          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2316       }
2317       break;
2318    }
2319    case nir_op_fmul: {
           /* nir_op_fmul: choose the multiply opcode from dst's register class. A v1
            * destination with 16-bit components takes the packed v_pk_mul_f16 form; this test
            * must precede the plain v1 test, which would otherwise match first. */
2320       if (dst.regClass() == v2b) {
2321          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2322       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2323          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2324       } else if (dst.regClass() == v1) {
2325          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2326       } else if (dst.regClass() == v2) {
2327          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2328       } else {
2329          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2330       }
2331       break;
2332    }
2333    case nir_op_fmulz: {
           /* nir_op_fmulz: only a v1 destination is handled, via v_mul_legacy_f32; any other
            * register class is reported through isel_err. */
2334       if (dst.regClass() == v1) {
2335          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2336       } else {
2337          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2338       }
2339       break;
2340    }
2341    case nir_op_fadd: {
           /* nir_op_fadd: choose the add opcode from dst's register class. A v1 destination
            * with 16-bit components takes the packed v_pk_add_f16 form; this test must precede
            * the plain v1 test, which would otherwise match first. */
2342       if (dst.regClass() == v2b) {
2343          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2344       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2345          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2346       } else if (dst.regClass() == v1) {
2347          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2348       } else if (dst.regClass() == v2) {
2349          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2350       } else {
2351          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2352       }
2353       break;
2354    }
2355    case nir_op_fsub: {
           /* nir_op_fsub: the packed-16-bit and 64-bit paths are expressed as an add with the
            * negate modifier set on operand 1; the v2b and v1 paths use dedicated sub/subrev
            * opcodes. */
2356       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: v_pk_add_f16 with operand 1 negated in both the lo and hi halves. */
2357          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2358          VALU_instruction& sub = add->valu();
2359          sub.neg_lo[1] = true;
2360          sub.neg_hi[1] = true;
2361          break;
2362       }
2363
2364       Temp src0 = get_alu_src(ctx, instr->src[0]);
2365       Temp src1 = get_alu_src(ctx, instr->src[1]);
2366       if (dst.regClass() == v2b) {
              /* The reversed opcode is chosen only when src0 is a VGPR and src1 is not. */
2367          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2368             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2369          else
2370             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2371       } else if (dst.regClass() == v1) {
              /* Same operand-type rule as the v2b path, with the 32-bit opcodes. */
2372          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2373             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2374          else
2375             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2376       } else if (dst.regClass() == v2) {
              /* 64-bit: v_add_f64 on VGPR copies of both sources with operand 1 negated. */
2377          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2378                                      as_vgpr(ctx, src1));
2379          add->valu().neg[1] = true;
2380       } else {
2381          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2382       }
2383       break;
2384    }
2385    case nir_op_ffma: {
           /* nir_op_ffma: fused multiply-add selected by dst's register class. The trailing
            * `3` passed to emit_vop3a_instruction is presumably the operand count -- confirm
            * against that helper's signature. */
2386       if (dst.regClass() == v2b) {
2387          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2388       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed path: built by hand because all three sources need opsel bits. */
2389          assert(instr->def.num_components == 2);
2390
2391          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2392          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2393          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2394
2395          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
              /* Bit i of opsel_lo/opsel_hi is the low bit of source i's swizzle for result
               * component 0/1 respectively. */
2396          unsigned opsel_lo = 0, opsel_hi = 0;
2397          for (unsigned i = 0; i < 3; i++) {
2398             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2399             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2400          }
2401
2402          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2403       } else if (dst.regClass() == v1) {
              /* The 32-bit path forwards the block's must_flush_denorms32 flag to the helper. */
2404          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2405                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2406       } else if (dst.regClass() == v2) {
2407          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2408       } else {
2409          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2410       }
2411       break;
2412    }
2413    case nir_op_ffmaz: {
           /* nir_op_ffmaz: only a v1 destination is handled, via v_fma_legacy_f32, forwarding
            * the block's must_flush_denorms32 flag to the helper. */
2414       if (dst.regClass() == v1) {
2415          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2416                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2417       } else {
2418          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2419       }
2420       break;
2421    }
2422    case nir_op_fmax: {
           /* nir_op_fmax: choose the max opcode from dst's register class. The 32-bit and
            * 64-bit paths forward the block's must_flush_denorms32 / must_flush_denorms16_64
            * flag to the emit helper; the 16-bit paths do not (see TODO). */
2423       if (dst.regClass() == v2b) {
2424          // TODO: check fp_mode.must_flush_denorms16_64
2425          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2426       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2427          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2428       } else if (dst.regClass() == v1) {
2429          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2430                                ctx->block->fp_mode.must_flush_denorms32);
2431       } else if (dst.regClass() == v2) {
2432          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2433                                 ctx->block->fp_mode.must_flush_denorms16_64);
2434       } else {
2435          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2436       }
2437       break;
2438    }
2439    case nir_op_fmin: {
           /* nir_op_fmin: choose the min opcode from dst's register class. The 32-bit and
            * 64-bit paths forward the block's must_flush_denorms32 / must_flush_denorms16_64
            * flag to the emit helper; the 16-bit paths do not (see TODO). */
2440       if (dst.regClass() == v2b) {
2441          // TODO: check fp_mode.must_flush_denorms16_64
2442          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2443       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* NOTE(review): unlike the nir_op_fmax case, this passes an extra `true` to
               * emit_vop3p_instruction. The parameter's meaning is not visible here -- confirm
               * it is intended. */
2444          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2445       } else if (dst.regClass() == v1) {
2446          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2447                                ctx->block->fp_mode.must_flush_denorms32);
2448       } else if (dst.regClass() == v2) {
2449          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2450                                 ctx->block->fp_mode.must_flush_denorms16_64);
2451       } else {
2452          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2453       }
2454       break;
2455    }
2456    case nir_op_sdot_4x8_iadd: {
           /* Integer dot-product family. Each case forwards to emit_idot_instruction with an
            * opcode, a bool that is `true` exactly for the *_sat NIR ops, and for the
            * v_dot4_i32_iu8 opcode an extra mask (0x3 or 0x1) whose meaning is defined by that
            * helper -- presumably which sources are signed; confirm there. */
           /* Signed 4x8: GFX11+ uses v_dot4_i32_iu8 with mask 0x3, older levels v_dot4_i32_i8. */
2457       if (ctx->options->gfx_level >= GFX11)
2458          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2459       else
2460          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2461       break;
2462    }
2463    case nir_op_sdot_4x8_iadd_sat: {
           /* Same opcode choice as nir_op_sdot_4x8_iadd, with the bool argument set. */
2464       if (ctx->options->gfx_level >= GFX11)
2465          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2466       else
2467          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2468       break;
2469    }
2470    case nir_op_sudot_4x8_iadd: {
           /* Mixed 4x8: always v_dot4_i32_iu8, with mask 0x1; no gfx_level check is made here. */
2471       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2472       break;
2473    }
2474    case nir_op_sudot_4x8_iadd_sat: {
2475       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2476       break;
2477    }
2478    case nir_op_udot_4x8_uadd: {
           /* Unsigned 4x8: v_dot4_u32_u8. */
2479       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2480       break;
2481    }
2482    case nir_op_udot_4x8_uadd_sat: {
2483       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2484       break;
2485    }
2486    case nir_op_sdot_2x16_iadd: {
           /* Signed 2x16: v_dot2_i32_i16. */
2487       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2488       break;
2489    }
2490    case nir_op_sdot_2x16_iadd_sat: {
2491       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2492       break;
2493    }
2494    case nir_op_udot_2x16_uadd: {
           /* Unsigned 2x16: v_dot2_u32_u16. */
2495       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2496       break;
2497    }
2498    case nir_op_udot_2x16_uadd_sat: {
2499       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2500       break;
2501    }
2502    case nir_op_cube_amd: {
           /* nir_op_cube_amd: fetch the source as a 3-component value, extract its three v1
            * components, run the four v_cube*_f32 instructions on them, and pack the results
            * into dst in the order {tc, sc, ma, id}. Note that this differs from the order in
            * which the instructions are emitted (ma, sc, tc, id). */
2503       Temp in = get_alu_src(ctx, instr->src[0], 3);
2504       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2505                      emit_extract_vector(ctx, in, 2, v1)};
2506       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2507       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2508       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2509       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2510       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2511       break;
2512    }
2513    case nir_op_bcsel: {
           /* nir_op_bcsel: handled entirely by the emit_bcsel helper. */
2514       emit_bcsel(ctx, instr, dst);
2515       break;
2516    }
2517    case nir_op_frsq: {
           /* nir_op_frsq: v_rsq_f16 for v2b, the emit_rsq helper for v1, and v_rsq_f64 for v2. */
2518       if (dst.regClass() == v2b) {
2519          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2520       } else if (dst.regClass() == v1) {
2521          Temp src = get_alu_src(ctx, instr->src[0]);
2522          emit_rsq(ctx, bld, Definition(dst), src);
2523       } else if (dst.regClass() == v2) {
2524          /* Lowered at NIR level for precision reasons. */
2525          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2526       } else {
2527          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2528       }
2529       break;
2530    }
2531    case nir_op_fneg: {
           /* nir_op_fneg: the 16-bit and 32-bit paths are expressed as a multiply; the 64-bit
            * path XORs 0x80000000 into the second dword of the value. */
2532       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: v_pk_mul_f16 by 0x3C00 (IEEE half 1.0) with the negate modifier set on
               * operand 0 for both halves. The two trailing arguments are the low bits of the
               * source swizzle for components 0 and 1. */
2533          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2534          Instruction* vop3p =
2535             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2536                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2537          vop3p->valu().neg_lo[0] = true;
2538          vop3p->valu().neg_hi[0] = true;
2539          break;
2540       }
2541       Temp src = get_alu_src(ctx, instr->src[0]);
2542       if (dst.regClass() == v2b) {
              /* 0xbc00 is IEEE half -1.0. */
2543          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2544       } else if (dst.regClass() == v1) {
              /* 0xbf800000 is IEEE single -1.0. */
2545          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2546                   as_vgpr(ctx, src));
2547       } else if (dst.regClass() == v2) {
              /* When the block requires flushing 16/64-bit denormals, first multiply by
               * 0x3FF0000000000000 (IEEE double 1.0), since the bitwise op below is not a
               * floating-point operation. */
2548          if (ctx->block->fp_mode.must_flush_denorms16_64)
2549             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2550                            as_vgpr(ctx, src));
2551          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2552          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2553          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2554          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2555       } else {
2556          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2557       }
2558       break;
2559    }
2560    case nir_op_fabs: {
           /* nir_op_fabs: the packed path uses v_pk_max_f16 of the source against its own
            * negation, the v2b/v1 paths multiply by 1.0 with the abs modifier on the source
            * operand, and the 64-bit path ANDs 0x7FFFFFFF into the second dword. */
2561       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Both operands are `src`; operand 1 is negated in both halves. The opsel
               * arguments are 3 or 0 depending on the low swizzle bit -- 3 presumably sets the
               * selector for both operands at once; confirm against bld.vop3p. */
2562          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2563          Instruction* vop3p =
2564             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2565                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2566                .instr;
2567          vop3p->valu().neg_lo[1] = true;
2568          vop3p->valu().neg_hi[1] = true;
2569          break;
2570       }
2571       Temp src = get_alu_src(ctx, instr->src[0]);
2572       if (dst.regClass() == v2b) {
              /* 0x3c00 is IEEE half 1.0; abs is applied to operand 1, i.e. the source. */
2573          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2574                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2575                                .instr;
2576          mul->valu().abs[1] = true;
2577       } else if (dst.regClass() == v1) {
              /* 0x3f800000 is IEEE single 1.0; abs is applied to operand 1, i.e. the source. */
2578          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2579                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2580                                .instr;
2581          mul->valu().abs[1] = true;
2582       } else if (dst.regClass() == v2) {
              /* When the block requires flushing 16/64-bit denormals, first multiply by
               * 0x3FF0000000000000 (IEEE double 1.0), since the bitwise op below is not a
               * floating-point operation. */
2583          if (ctx->block->fp_mode.must_flush_denorms16_64)
2584             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2585                            as_vgpr(ctx, src));
2586          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2587          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2588          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2589          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2590       } else {
2591          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2592       }
2593       break;
2594    }
2595    case nir_op_fsat: {
           /* nir_op_fsat: the packed and 64-bit paths set the VALU clamp bit on an identity
            * operation (multiply by 1.0 / add 0); the v2b and v1 paths use v_med3 of the
            * source with the constants 0 and 1.0. */
2596       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: v_pk_mul_f16 by 0x3C00 (IEEE half 1.0) with clamp. The two trailing
               * arguments are the low bits of the source swizzle for components 0 and 1. */
2597          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2598          Instruction* vop3p =
2599             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2600                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2601          vop3p->valu().clamp = true;
2602          break;
2603       }
2604       Temp src = get_alu_src(ctx, instr->src[0]);
2605       if (dst.regClass() == v2b) {
2606          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2607                   src);
2608       } else if (dst.regClass() == v1) {
2609          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2610                   Operand::c32(0x3f800000u), src);
2611          /* apparently, it is not necessary to flush denorms if this instruction is used with these
2612           * operands */
2613          // TODO: confirm that this holds under any circumstances
2614       } else if (dst.regClass() == v2) {
              /* 64-bit: v_add_f64 of the source and 0 with clamp. */
2615          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2616          add->valu().clamp = true;
2617       } else {
2618          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2619       }
2620       break;
2621    }
2622    case nir_op_flog2: {
           /* nir_op_flog2: v_log_f16 for v2b, the emit_log2 helper for v1; no other register
            * class is handled. */
2623       if (dst.regClass() == v2b) {
2624          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2625       } else if (dst.regClass() == v1) {
2626          Temp src = get_alu_src(ctx, instr->src[0]);
2627          emit_log2(ctx, bld, Definition(dst), src);
2628       } else {
2629          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2630       }
2631       break;
2632    }
2633    case nir_op_frcp: {
           /* nir_op_frcp: v_rcp_f16 for v2b, the emit_rcp helper for v1, and v_rcp_f64 for v2. */
2634       if (dst.regClass() == v2b) {
2635          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2636       } else if (dst.regClass() == v1) {
2637          Temp src = get_alu_src(ctx, instr->src[0]);
2638          emit_rcp(ctx, bld, Definition(dst), src);
2639       } else if (dst.regClass() == v2) {
2640          /* Lowered at NIR level for precision reasons. */
2641          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2642       } else {
2643          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2644       }
2645       break;
2646    }
2647    case nir_op_fexp2: {
           /* nir_op_fexp2: v_exp_f16 for v2b, v_exp_f32 for v1; no other register class is
            * handled. */
2648       if (dst.regClass() == v2b) {
2649          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2650       } else if (dst.regClass() == v1) {
2651          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2652       } else {
2653          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2654       }
2655       break;
2656    }
2657    case nir_op_fsqrt: {
           /* nir_op_fsqrt: v_sqrt_f16 for v2b, the emit_sqrt helper for v1, and v_sqrt_f64 for
            * v2. */
2658       if (dst.regClass() == v2b) {
2659          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2660       } else if (dst.regClass() == v1) {
2661          Temp src = get_alu_src(ctx, instr->src[0]);
2662          emit_sqrt(ctx, bld, Definition(dst), src);
2663       } else if (dst.regClass() == v2) {
2664          /* Lowered at NIR level for precision reasons. */
2665          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2666       } else {
2667          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2668       }
2669       break;
2670    }
2671    case nir_op_ffract: {
           /* nir_op_ffract: a single v_fract_f16 / v_fract_f32 / v_fract_f64 chosen by dst's
            * register class. */
2672       if (dst.regClass() == v2b) {
2673          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2674       } else if (dst.regClass() == v1) {
2675          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2676       } else if (dst.regClass() == v2) {
2677          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2678       } else {
2679          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2680       }
2681       break;
2682    }
2683    case nir_op_ffloor: {
           /* nir_op_ffloor: v_floor_f16 / v_floor_f32 for v2b / v1; a v2 destination is
            * delegated to the emit_floor_f64 helper. */
2684       if (dst.regClass() == v2b) {
2685          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2686       } else if (dst.regClass() == v1) {
2687          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2688       } else if (dst.regClass() == v2) {
2689          Temp src = get_alu_src(ctx, instr->src[0]);
2690          emit_floor_f64(ctx, bld, Definition(dst), src);
2691       } else {
2692          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2693       }
2694       break;
2695    }
2696    case nir_op_fceil: {
           /* nir_op_fceil: v_ceil_f16 / v_ceil_f32 for v2b / v1. A v2 destination uses
            * v_ceil_f64 on GFX7+ and a hand-written sequence below that level. */
2697       if (dst.regClass() == v2b) {
2698          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2699       } else if (dst.regClass() == v1) {
2700          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2701       } else if (dst.regClass() == v2) {
2702          if (ctx->options->gfx_level >= GFX7) {
2703             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2704          } else {
2705             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2706             /* trunc = trunc(src0)
2707              * if (src0 > 0.0 && src0 != trunc)
2708              *    trunc += 1.0
2709              */
2710             Temp src0 = get_alu_src(ctx, instr->src[0]);
2711             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
                 /* tmp0: src0 > 0, tmp1: src0 != trunc; cond is their AND. */
2712             Temp tmp0 =
2713                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2714             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2715             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
                 /* Build the 64-bit addend: v_cndmask_b32 selects between 0 and 0x3ff00000
                  * (the upper dword of IEEE double 1.0) with cond, and that dword is then paired
                  * with a zero first dword. */
2716             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2717                                 bld.copy(bld.def(v1), Operand::zero()),
2718                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2719             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2720                              bld.copy(bld.def(v1), Operand::zero()), add);
2721             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2722          }
2723       } else {
2724          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2725       }
2726       break;
2727    }
2728    case nir_op_ftrunc: {
           /* nir_op_ftrunc: v_trunc_f16 / v_trunc_f32 for v2b / v1; a v2 destination is
            * delegated to the emit_trunc_f64 helper. */
2729       if (dst.regClass() == v2b) {
2730          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2731       } else if (dst.regClass() == v1) {
2732          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2733       } else if (dst.regClass() == v2) {
2734          Temp src = get_alu_src(ctx, instr->src[0]);
2735          emit_trunc_f64(ctx, bld, Definition(dst), src);
2736       } else {
2737          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2738       }
2739       break;
2740    }
2741    case nir_op_fround_even: {
           /* Round to nearest, ties to even. Each size has a native v_rndne_*
            * opcode, except f64 below GFX7, which is lowered by hand.
            */
2742       if (dst.regClass() == v2b) {
2743          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2744       } else if (dst.regClass() == v1) {
2745          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2746       } else if (dst.regClass() == v2) {
2747          if (ctx->options->gfx_level >= GFX7) {
2748             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2749          } else {
2750             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
                 /* Idea: tmp = (src0 + c) - c with c = copysign(2^52, src0). At a
                  * magnitude of 2^52 a double has no fraction bits left, so the
                  * addition itself performs the rounding.
                  * NOTE(review): this presumably relies on the f64 rounding mode being
                  * round-to-nearest-even at this point -- confirm.
                  */
2751             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2752             Temp src0 = get_alu_src(ctx, instr->src[0]);
2753             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2754
                 /* Bit-reversing 0xfffffffe yields 0x7fffffff: every bit except the sign. */
2755             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2756                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
                 /* High dword of c: magnitude bits from 0x43300000 (high dword of 2^52),
                  * sign bit from src0_hi.
                  * NOTE(review): assumes v_bfi_b32 computes (s0 & s1) | (~s0 & s2) --
                  * confirm against the ISA manual.
                  */
2757             Temp bfi =
2758                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2759                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
                 /* tmp = src0 + c (the low dword of c is zero). */
2760             Temp tmp =
2761                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2762                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
                 /* tmp = tmp - c: a second v_add_f64 with its second operand negated. */
2763             Instruction* sub =
2764                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2765                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2766             sub->valu().neg[1] = true;
2767             tmp = sub->definitions[0].getTemp();
2768
                 /* 0x432fffff'ffffffff is the largest double below 2^52. Where |src0|
                  * exceeds it the add/sub result is not used and src0 is passed through.
                  */
2769             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2770                                 Operand::c32(0x432fffffu));
2771             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2772             vop3->valu().abs[0] = true;
2773             Temp cond = vop3->definitions[0].getTemp();
2774
                 /* Per-dword select between the rounded value and the original source.
                  * NOTE(review): assumes v_cndmask_b32 takes its second source for lanes
                  * set in cond -- confirm against the ISA manual.
                  */
2775             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2776             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2777             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2778                                      as_vgpr(ctx, src0_lo), cond);
2779             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2780                                      as_vgpr(ctx, src0_hi), cond);
2781
2782             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2783          }
2784       } else {
2785          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2786       }
2787       break;
2788    }
2789    case nir_op_fsin_amd:
2790    case nir_op_fcos_amd: {
2791       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2792       aco_ptr<Instruction> norm;
2793       if (dst.regClass() == v2b) {
2794          aco_opcode opcode =
2795             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2796          bld.vop1(opcode, Definition(dst), src);
2797       } else if (dst.regClass() == v1) {
2798          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2799          if (ctx->options->gfx_level < GFX9)
2800             src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
2801
2802          aco_opcode opcode =
2803             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2804          bld.vop1(opcode, Definition(dst), src);
2805       } else {
2806          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2807       }
2808       break;
2809    }
2810    case nir_op_ldexp: {
           /* ldexp by destination size: the f16 opcode goes through the VOP2 helper,
            * the f32 and f64 opcodes through the VOP3 helper.
            */
2811       if (dst.regClass() == v2b) {
2812          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2813       } else if (dst.regClass() == v1) {
2814          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2815       } else if (dst.regClass() == v2) {
2816          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2817       } else {
2818          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2819       }
2820       break;
2821    }
2822    case nir_op_frexp_sig: {
           /* Significand part of frexp: one v_frexp_mant_* opcode per float size,
            * chosen from the destination register class.
            */
2823       if (dst.regClass() == v2b) {
2824          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2825       } else if (dst.regClass() == v1) {
2826          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2827       } else if (dst.regClass() == v2) {
2828          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2829       } else {
2830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2831       }
2832       break;
2833    }
2834    case nir_op_frexp_exp: {
           /* Exponent part of frexp. The result is an integer, so the opcode is
            * chosen from the *source* bit size instead of the destination class.
            */
2835       if (instr->src[0].src.ssa->bit_size == 16) {
              /* The f16 opcode yields a 16-bit integer. Only its low byte is kept
               * (extracted as v1b) and then sign-extended from 8 to 32 bits into dst.
               */
2836          Temp src = get_alu_src(ctx, instr->src[0]);
2837          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2838          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2839          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2840       } else if (instr->src[0].src.ssa->bit_size == 32) {
2841          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2842       } else if (instr->src[0].src.ssa->bit_size == 64) {
2843          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2844       } else {
2845          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2846       }
2847       break;
2848    }
2849    case nir_op_fsign: {
           /* sign(x) as a float. There is no dedicated opcode, so:
            *  - f16/f32: negative zero is canonicalised to positive zero, the float
            *    bits are clamped as a signed integer to [-1, 1] with v_med3_i16/i32,
            *    and the integer is converted back to float.
            *    NOTE(review): this presumably relies on a positive non-zero float
            *    having integer bits >= 1 and a negative one having them < 0 -- confirm.
            *  - f64: only the high dword is selected; the low dword is always zero.
            */
2850       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2851       if (dst.regClass() == v2b) {
2852          assert(ctx->program->gfx_level >= GFX9);
2853          /* replace negative zero with positive zero */
2854          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2855          src =
2856             bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2857          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2858       } else if (dst.regClass() == v1) {
2859          if (ctx->block->fp_mode.denorm32 == fp_denorm_flush) {
2860             /* If denormals are flushed, then v_mul_legacy_f32(2.0, src) can become omod. */
2861             src =
2862                bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), Operand::c32(0x40000000), src);
2863          } else {
2864             src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2865          }
2866          src =
2867             bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2868          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2869       } else if (dst.regClass() == v2) {
              /* Step 1: cond = !(0.0 < src). Lanes where it holds keep the source's
               * own high dword; the others (src > 0) get 0x3FF00000, the high dword
               * of 1.0.
               * NOTE(review): assumes v_cndmask_b32 takes its second source for lanes
               * set in cond -- confirm against the ISA manual.
               */
2870          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2871          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2872          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2873                                    emit_extract_vector(ctx, src, 1, v1), cond);
2874
              /* Step 2: cond = (0.0 <= src). Lanes where it fails get 0xBFF00000, the
               * high dword of -1.0; the others keep the value from step 1.
               */
2875          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2876          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2877          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2878
2879          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2880       } else {
2881          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2882       }
2883       break;
2884    }
2885    case nir_op_f2f16:
2886    case nir_op_f2f16_rtne: {
           /* Float -> f16. A 64-bit source is first narrowed to f32, so the final
            * conversion always starts from f32.
            */
2887       Temp src = get_alu_src(ctx, instr->src[0]);
2888       if (instr->src[0].src.ssa->bit_size == 64)
2889          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
           /* The explicit _rtne variant needs the pseudo opcode only when the
            * block's 16/64-bit rounding mode is not already fp_round_ne.
            */
2890       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2891          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2892           * keep value numbering and the scheduler simpler.
2893           */
2894          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2895       else
2896          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2897       break;
2898    }
2899    case nir_op_f2f16_rtz: {
           /* Float -> f16 with the "rtz" rounding requested by the NIR opcode.
            * A 64-bit source is first narrowed to f32.
            */
2900       Temp src = get_alu_src(ctx, instr->src[0]);
2901       if (instr->src[0].src.ssa->bit_size == 64)
2902          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
           /* If the block's 16/64-bit rounding mode is already fp_round_tz the plain
            * conversion is enough. Otherwise the packed RTZ conversion is used: its
            * VOP3 encoding on GFX8/GFX9, the VOP2 encoding on everything else.
            * NOTE(review): the second packed input is zero or a copy of src --
            * presumably only the low half of the result is consumed. Confirm.
            */
2903       if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2904          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2905       else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2906          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2907       else
2908          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2909       break;
2910    }
2911    case nir_op_f2f32: {
           /* Float -> f32 from either a 16-bit or a 64-bit source; any other source
            * size is reported through isel_err().
            */
2912       if (instr->src[0].src.ssa->bit_size == 16) {
2913          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2914       } else if (instr->src[0].src.ssa->bit_size == 64) {
2915          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2916       } else {
2917          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2918       }
2919       break;
2920    }
2921    case nir_op_f2f64: {
           /* Float -> f64. The final step is always f32 -> f64, so a 16-bit source
            * is widened to f32 first.
            */
2922       Temp src = get_alu_src(ctx, instr->src[0]);
2923       if (instr->src[0].src.ssa->bit_size == 16)
2924          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2925       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2926       break;
2927    }
2928    case nir_op_i2f16: {
           /* Signed integer -> f16. Two routes:
            *  - GFX8+ with a source of at most 16 bits: direct v_cvt_f16_i16.
            *  - otherwise: convert to f32 first, then f32 -> f16.
            */
2929       assert(dst.regClass() == v2b);
2930       Temp src = get_alu_src(ctx, instr->src[0]);
2931       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2932       if (input_size <= 16) {
2933          /* Sign-extend the integer to the size expected by the int→float converter used below */
2934          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2935          if (input_size != target_size) {
2936             src = convert_int(ctx, bld, src, input_size, target_size, true);
2937          }
2938       }
2939
2940       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2941          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2942       } else {
2943          /* Large 32bit inputs need to return +-inf/FLOAT_MAX.
2944           *
2945           * This is also the fallback-path taken on GFX7 and earlier, which
2946           * do not support direct f16⟷i16 conversions.
2947           */
2948          src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2949          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2950       }
2951       break;
2952    }
2953    case nir_op_i2f32: {
           /* Signed integer (up to 32 bits) -> f32 through v_cvt_f32_i32. Sources
            * wider than 32 bits are reported through isel_err().
            */
2954       assert(dst.size() == 1);
2955       Temp src = get_alu_src(ctx, instr->src[0]);
2956       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2957       if (input_size <= 32) {
2958          if (input_size <= 16) {
2959             /* Sign-extend to 32-bits */
2960             src = convert_int(ctx, bld, src, input_size, 32, true);
2961          }
2962          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2963       } else {
2964          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2965       }
2966       break;
2967    }
2968    case nir_op_i2f64: {
           /* Signed integer (up to 32 bits) -> f64 through v_cvt_f64_i32. 8- and
            * 16-bit sources are sign-extended to 32 bits first; wider sources are
            * reported through isel_err().
            */
2969       if (instr->src[0].src.ssa->bit_size <= 32) {
2970          Temp src = get_alu_src(ctx, instr->src[0]);
2971          if (instr->src[0].src.ssa->bit_size <= 16)
2972             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2973          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2974       } else {
2975          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2976       }
2977       break;
2978    }
2979    case nir_op_u2f16: {
           /* Unsigned integer -> f16. Two routes:
            *  - GFX8+ with a source of at most 16 bits: direct v_cvt_f16_u16.
            *  - otherwise: convert to f32 first, then f32 -> f16.
            */
2980       assert(dst.regClass() == v2b);
2981       Temp src = get_alu_src(ctx, instr->src[0]);
2982       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2983       if (input_size <= 16) {
2984          /* Zero-extend the integer to the size expected by the uint→float converter used below */
2985          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2986          if (input_size != target_size) {
2987             src = convert_int(ctx, bld, src, input_size, target_size, false);
2988          }
2989       }
2990
2991       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2992          bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2993       } else {
2994          /* Large 32bit inputs need to return inf/FLOAT_MAX.
2995           *
2996           * This is also the fallback-path taken on GFX7 and earlier, which
2997           * do not support direct f16⟷u16 conversions.
2998           */
2999          src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3000          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3001       }
3002       break;
3003    }
3004    case nir_op_u2f32: {
           /* Unsigned integer (up to 32 bits) -> f32. */
3005       assert(dst.size() == 1);
3006       Temp src = get_alu_src(ctx, instr->src[0]);
3007       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3008       if (input_size == 8) {
              /* An 8-bit source uses the byte-0 conversion opcode directly, so no
               * separate zero-extension is emitted on this path.
               */
3009          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3010       } else if (input_size <= 32) {
              /* A 16-bit source is zero-extended to 32 bits before the conversion. */
3011          if (input_size == 16)
3012             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3013          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3014       } else {
3015          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3016       }
3017       break;
3018    }
3019    case nir_op_u2f64: {
           /* Unsigned integer (up to 32 bits) -> f64 through v_cvt_f64_u32. 8- and
            * 16-bit sources are zero-extended to 32 bits first; wider sources are
            * reported through isel_err().
            */
3020       if (instr->src[0].src.ssa->bit_size <= 32) {
3021          Temp src = get_alu_src(ctx, instr->src[0]);
3022          if (instr->src[0].src.ssa->bit_size <= 16)
3023             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3024          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3025       } else {
3026          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3027       }
3028       break;
3029    }
3030    case nir_op_f2i8:
3031    case nir_op_f2i16: {
           /* Float -> 8/16-bit signed integer, dispatched on the source bit size.
            * Note that the last branch has no size check: every source size other
            * than 16 or 32 is treated as f64.
            */
3032       if (instr->src[0].src.ssa->bit_size == 16) {
3033          if (ctx->program->gfx_level >= GFX8) {
3034             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3035          } else {
3036             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
                 /* Route: f16 -> f32 -> i32, then convert_int() narrows to the
                  * destination bit size. When dst is an SGPR the narrowing goes to a
                  * fresh temporary, which p_as_uniform then moves into dst.
                  */
3037             Temp tmp = bld.tmp(v1);
3038             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3039             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3040             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3041                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3042             if (dst.type() == RegType::sgpr) {
3043                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3044             }
3045          }
3046       } else if (instr->src[0].src.ssa->bit_size == 32) {
3047          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3048       } else {
3049          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3050       }
3051       break;
3052    }
3053    case nir_op_f2u8:
3054    case nir_op_f2u16: {
           /* Float -> 8/16-bit unsigned integer, dispatched on the source bit size.
            * Note that the last branch has no size check: every source size other
            * than 16 or 32 is treated as f64.
            */
3055       if (instr->src[0].src.ssa->bit_size == 16) {
3056          if (ctx->program->gfx_level >= GFX8) {
3057             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3058          } else {
3059             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
                 /* Route: f16 -> f32 -> u32, then convert_int() narrows to the
                  * destination bit size. When dst is an SGPR the narrowing goes to a
                  * fresh temporary, which p_as_uniform then moves into dst.
                  */
3060             Temp tmp = bld.tmp(v1);
3061             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3062             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3063             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3064                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3065             if (dst.type() == RegType::sgpr) {
3066                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3067             }
3068          }
3069       } else if (instr->src[0].src.ssa->bit_size == 32) {
3070          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3071       } else {
3072          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3073       }
3074       break;
3075    }
3076    case nir_op_f2i32: {
           /* Float -> 32-bit signed integer, dispatched on the source bit size. */
3077       Temp src = get_alu_src(ctx, instr->src[0]);
3078       if (instr->src[0].src.ssa->bit_size == 16) {
              /* f16 is widened to f32 first. The conversion result is produced in a
               * VGPR; for a non-VGPR destination it is moved over with p_as_uniform.
               */
3079          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3080          if (dst.type() == RegType::vgpr) {
3081             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3082          } else {
3083             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3084                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3085          }
3086       } else if (instr->src[0].src.ssa->bit_size == 32) {
3087          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3088       } else if (instr->src[0].src.ssa->bit_size == 64) {
3089          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3090       } else {
3091          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3092       }
3093       break;
3094    }
3095    case nir_op_f2u32: {
           /* Float -> 32-bit unsigned integer, dispatched on the source bit size. */
3096       Temp src = get_alu_src(ctx, instr->src[0]);
3097       if (instr->src[0].src.ssa->bit_size == 16) {
              /* f16 is widened to f32 first. The conversion result is produced in a
               * VGPR; for a non-VGPR destination it is moved over with p_as_uniform.
               */
3098          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3099          if (dst.type() == RegType::vgpr) {
3100             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3101          } else {
3102             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3103                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3104          }
3105       } else if (instr->src[0].src.ssa->bit_size == 32) {
3106          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3107       } else if (instr->src[0].src.ssa->bit_size == 64) {
3108          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3109       } else {
3110          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3111       }
3112       break;
3113    }
3114    case nir_op_b2f16: {
           /* Boolean lane mask -> f16. 0x3c00 is the bit pattern of 1.0 in half
            * precision.
            */
3115       Temp src = get_alu_src(ctx, instr->src[0]);
3116       assert(src.regClass() == bld.lm);
3117
3118       if (dst.regClass() == s1) {
              /* Scalar result: multiply the constant by the scalar condition. */
3119          src = bool_to_scalar_condition(ctx, src);
3120          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3121       } else if (dst.regClass() == v2b) {
              /* Vector result: per-lane select between zero and the constant. */
3122          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3123          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3124       } else {
3125          unreachable("Wrong destination register class for nir_op_b2f16.");
3126       }
3127       break;
3128    }
3129    case nir_op_b2f32: {
           /* Boolean lane mask -> f32. 0x3f800000 is the bit pattern of 1.0f. */
3130       Temp src = get_alu_src(ctx, instr->src[0]);
3131       assert(src.regClass() == bld.lm);
3132
3133       if (dst.regClass() == s1) {
              /* Scalar result: multiply the constant by the scalar condition. */
3134          src = bool_to_scalar_condition(ctx, src);
3135          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3136       } else if (dst.regClass() == v1) {
              /* Vector result: per-lane select between zero and the constant. */
3137          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3138                       Operand::c32(0x3f800000u), src);
3139       } else {
3140          unreachable("Wrong destination register class for nir_op_b2f32.");
3141       }
3142       break;
3143    }
3144    case nir_op_b2f64: {
           /* Boolean lane mask -> f64. */
3145       Temp src = get_alu_src(ctx, instr->src[0]);
3146       assert(src.regClass() == bld.lm);
3147
3148       if (dst.regClass() == s2) {
              /* NOTE(review): a 32-bit operand holding 0x3f800000 (1.0f) is fed to a
               * 64-bit select. This presumably relies on the operand being encoded as
               * the hardware's inline constant 1.0, which a 64-bit instruction reads
               * as a double -- confirm.
               */
3149          src = bool_to_scalar_condition(ctx, src);
3150          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3151                   Operand::zero(), bld.scc(src));
3152       } else if (dst.regClass() == v2) {
              /* Only the high dword is selected: 0x3FF00000 is the high dword of the
               * double 1.0, and the low dword is zero either way.
               */
3153          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3154          Temp upper =
3155             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3156          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3157       } else {
3158          unreachable("Wrong destination register class for nir_op_b2f64.");
3159       }
3160       break;
3161    }
3162    case nir_op_i2i8:
3163    case nir_op_i2i16:
3164    case nir_op_i2i32: {
           /* Signed integer resize to 8, 16 or 32 bits. Sign extension is requested
            * only when the destination is wider than the source.
            */
3165       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3166          /* no need to do the extract in get_alu_src() */
3167          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3168                                      ? sgpr_extract_sext
3169                                      : sgpr_extract_undef;
3170          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3171       } else {
3172          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3173          const unsigned output_bitsize = instr->def.bit_size;
3174          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3175                      output_bitsize > input_bitsize, dst);
3176       }
3177       break;
3178    }
3179    case nir_op_u2u8:
3180    case nir_op_u2u16:
3181    case nir_op_u2u32: {
           /* Unsigned integer resize to 8, 16 or 32 bits. On the SGPR path zero
            * extension is requested only when the destination is wider than the
            * source; the generic path always asks convert_int() for the unsigned
            * (non-sign-extending) conversion.
            */
3182       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3183          /* no need to do the extract in get_alu_src() */
3184          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3185                                      ? sgpr_extract_zext
3186                                      : sgpr_extract_undef;
3187          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3188       } else {
3189          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3190                      instr->def.bit_size, false, dst);
3191       }
3192       break;
3193    }
3194    case nir_op_b2b32:
3195    case nir_op_b2i8:
3196    case nir_op_b2i16:
3197    case nir_op_b2i32: {
           /* Boolean lane mask -> integer 0/1. A scalar destination receives the
            * scalar condition directly; a vector destination gets a per-lane select
            * between the constants 0 and 1.
            */
3198       Temp src = get_alu_src(ctx, instr->src[0]);
3199       assert(src.regClass() == bld.lm);
3200
3201       if (dst.regClass() == s1) {
3202          bool_to_scalar_condition(ctx, src, dst);
3203       } else if (dst.type() == RegType::vgpr) {
3204          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3205                       src);
3206       } else {
3207          unreachable("Invalid register class for b2i32");
3208       }
3209       break;
3210    }
3211    case nir_op_b2b1: {
3212       Temp src = get_alu_src(ctx, instr->src[0]);
3213       assert(dst.regClass() == bld.lm);
3214
3215       if (src.type() == RegType::vgpr) {
3216          assert(src.regClass() == v1 || src.regClass() == v2);
3217          assert(dst.regClass() == bld.lm);
3218          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3219                   Definition(dst), Operand::zero(), src);
3220       } else {
3221          assert(src.regClass() == s1 || src.regClass() == s2);
3222          Temp tmp;
3223          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
3224             tmp =
3225                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3226                   .def(1)
3227                   .getTemp();
3228          } else {
3229             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3230                            bld.scc(bld.def(s1)), Operand::zero(), src);
3231          }
3232          bool_to_vector_condition(ctx, tmp, dst);
3233       }
3234       break;
3235    }
3236    case nir_op_unpack_64_2x32:
3237    case nir_op_unpack_32_2x16:
3238    case nir_op_unpack_64_4x16:
3239    case nir_op_unpack_32_4x8:
           /* Unpacking is a plain copy of the source into dst, followed by a request
            * to split dst into its components: 4 for the 4x8 and 4x16 forms, 2 for
            * the others.
            */
3240       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3241       emit_split_vector(
3242          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3243       break;
3244    case nir_op_pack_64_2x32_split: {
           /* Build the 64-bit value from two sources: src0 is the first (low)
            * element of the created vector, src1 the second.
            */
3245       Temp src0 = get_alu_src(ctx, instr->src[0]);
3246       Temp src1 = get_alu_src(ctx, instr->src[1]);
3247
3248       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3249       break;
3250    }
3251    case nir_op_unpack_64_2x32_split_x:
           /* Split the source in two; dst is the first definition, the second goes
            * to a throw-away temporary of the same register class.
            */
3252       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3253                  get_alu_src(ctx, instr->src[0]));
3254       break;
3255    case nir_op_unpack_64_2x32_split_y:
           /* Split the source in two; dst is the second definition, the first goes
            * to a throw-away temporary of the same register class.
            */
3256       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3257                  get_alu_src(ctx, instr->src[0]));
3258       break;
3259    case nir_op_unpack_32_2x16_split_x:
           /* First 16-bit half of a 32-bit value. A VGPR destination takes the first
            * definition of a split. Any other destination gets a plain copy of the
            * whole source.
            * NOTE(review): presumably the upper bits of a scalar 16-bit value are
            * treated as don't-care -- confirm.
            */
3260       if (dst.type() == RegType::vgpr) {
3261          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3262                     get_alu_src(ctx, instr->src[0]));
3263       } else {
3264          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3265       }
3266       break;
3267    case nir_op_unpack_32_2x16_split_y:
           /* Second 16-bit half of a 32-bit value. A VGPR destination takes the
            * second definition of a split. Any other destination uses p_extract with
            * the operands (1, 16, 0).
            * NOTE(review): read as "element 1, 16 bits wide, no sign extension" --
            * confirm against the p_extract definition.
            */
3268       if (dst.type() == RegType::vgpr) {
3269          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3270                     get_alu_src(ctx, instr->src[0]));
3271       } else {
3272          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3273                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3274                     Operand::zero());
3275       }
3276       break;
3277    case nir_op_pack_32_2x16_split: {
           /* Build a 32-bit value from two 16-bit halves (src0 low, src1 high). */
3278       Temp src0 = get_alu_src(ctx, instr->src[0]);
3279       Temp src1 = get_alu_src(ctx, instr->src[1]);
3280       if (dst.regClass() == v1) {
              /* Vector path: take element 0 of each source as v2b and concatenate. */
3281          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3282          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3283          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3284       } else {
              /* Scalar path: (src0 & 0xFFFF) | (src1 << 16). */
3285          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3286                          Operand::c32(0xFFFFu));
3287          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3288                          Operand::c32(16u));
3289          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3290       }
3291       break;
3292    }
3293    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
           /* ^ 4x8 pack: the source is fetched with 4 components and copied as-is. */
3294    case nir_op_pack_half_2x16_rtz_split:
3295    case nir_op_pack_half_2x16_split: {
           /* Two f32 sources -> packed pair of f16 in one dword. Both NIR opcodes
            * are emitted with the packed RTZ conversion: its VOP3 encoding on
            * GFX8/GFX9, the VOP2 encoding on everything else. Only a v1 destination
            * is handled.
            */
3296       if (dst.regClass() == v1) {
3297          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3298             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3299          else
3300             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3301       } else {
3302          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3303       }
3304       break;
3305    }
3306    case nir_op_pack_unorm_2x16:
3307    case nir_op_pack_snorm_2x16: {
           /* Two floats -> packed pair of 16-bit normalized integers (unsigned for
            * unorm, signed for snorm). The source is a 2-component vector of f16 or
            * f32 values.
            */
3308       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3309       /* Only support 16 and 32bit. */
3310       assert(bit_size == 32 || bit_size == 16);
3311
3312       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3313       Temp src = get_alu_src(ctx, instr->src[0], 2);
3314       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3315       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3316
3317       /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */
           /* Both halves are widened to f32 and bit_size is overwritten, so the
            * opcode selection below takes the f32 branch.
            */
3318       if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3319          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3320          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3321          bit_size = 32;
3322       }
3323
3324       aco_opcode opcode;
3325       if (bit_size == 32) {
3326          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3327                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3328       } else {
3329          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3330                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3331       }
3332       bld.vop3(opcode, Definition(dst), src0, src1);
3333       break;
3334    }
3335    case nir_op_pack_uint_2x16:
3336    case nir_op_pack_sint_2x16: {
           /* Two 32-bit integers -> packed pair of 16-bit integers, using the
            * unsigned or signed v_cvt_pk_* opcode to match the NIR opcode.
            */
3337       Temp src = get_alu_src(ctx, instr->src[0], 2);
3338       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3339       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3340       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3341                                                              : aco_opcode::v_cvt_pk_i16_i32;
3342       bld.vop3(opcode, Definition(dst), src0, src1);
3343       break;
3344    }
3345    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3346    case nir_op_unpack_half_2x16_split_x: {
           /* First f16 half of a packed dword -> f32. A v1 source is split first so
            * that only its first half feeds the conversion; other source classes are
            * passed to v_cvt_f32_f16 unchanged.
            */
3347       Temp src = get_alu_src(ctx, instr->src[0]);
3348       if (src.regClass() == v1)
3349          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3350       if (dst.regClass() == v1) {
              /* The block's denormal-flush requirement must agree with which of the
               * two NIR opcodes was used.
               */
3351          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3352                 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3353          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3354       } else {
3355          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3356       }
3357       break;
3358    }
3359    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3360    case nir_op_unpack_half_2x16_split_y: {
           /* Second f16 half of a packed dword -> f32. An s1 source goes through
            * p_extract with operands (1, 16, 0); any other source is split and the
            * second definition is used.
            */
3361       Temp src = get_alu_src(ctx, instr->src[0]);
3362       if (src.regClass() == s1)
3363          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3364                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3365       else
3366          src =
3367             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3368       if (dst.regClass() == v1) {
              /* The block's denormal-flush requirement must agree with which of the
               * two NIR opcodes was used.
               */
3369          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3370                 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3371          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3372       } else {
3373          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3374       }
3375       break;
3376    }
3377    case nir_op_sad_u8x4: {
           /* Maps directly onto v_sad_u8 through the VOP3 helper; only a v1
            * destination is expected.
            * NOTE(review): the trailing arguments (false, 3u, false) are helper
            * options whose meaning is not visible in this excerpt; 3u presumably is
            * the operand count -- confirm against emit_vop3a_instruction().
            */
3378       assert(dst.regClass() == v1);
3379       emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3380       break;
3381    }
3382    case nir_op_fquantize2f16: {
           /* Quantize an f32 to f16 precision and return it as f32: convert to f16
            * (round-to-nearest-even), convert back, and replace results that are
            * f16 denormals with zero.
            */
3383       Temp src = get_alu_src(ctx, instr->src[0]);
3384       Temp f16;
           /* Use the RTNE pseudo opcode unless the block's 16/64-bit rounding mode
            * is already fp_round_ne.
            */
3385       if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3386          f16 = bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, bld.def(v2b), src);
3387       else
3388          f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
           /* cmp_res ends up set for lanes whose converted value is kept. */
3389       Temp f32, cmp_res;
3390
3391       if (ctx->program->gfx_level >= GFX8) {
              /* 0x36F has every class bit set except bits 4 and 7, which the inline
               * comment below identifies as the negative/positive denormal classes.
               */
3392          Temp mask = bld.copy(
3393             bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3394          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask);
3395          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3396       } else {
3397          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
3398           * so compare the result and flush to 0 if it's smaller.
3399           */
              /* tmp0: |f32| < 2^-14 (abs modifier on the first operand).
               * tmp1: f32 is less or greater than 0.0, i.e. non-zero.
               * cmp_res = NAND(tmp0, tmp1): cleared only for non-zero values below
               * the smallest normal half.
               */
3400          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3401          Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3402          Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3403          tmp0->valu().abs[0] = true;
3404          Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32);
3405          cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3406                             tmp0->definitions[0].getTemp(), tmp1);
3407       }
3408
           /* Final select. When signed zeros must be preserved, the flushed value is
            * 0.0 * src instead of a literal zero.
            * NOTE(review): presumably so that the flushed zero carries the sign of
            * src -- confirm.
            */
3409       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3410          Temp copysign_0 =
3411             bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3412          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3413       } else {
3414          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3415       }
3416       break;
3417    }
3418    case nir_op_bfm: {
3419       Temp bits = get_alu_src(ctx, instr->src[0]);
3420       Temp offset = get_alu_src(ctx, instr->src[1]);
3421
3422       if (dst.regClass() == s1) {
3423          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3424       } else if (dst.regClass() == v1) {
3425          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3426       } else {
3427          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3428       }
3429       break;
3430    }
3431    case nir_op_bitfield_select: {
3432
3433       /* dst = (insert & bitmask) | (base & ~bitmask) */
3434       if (dst.regClass() == s1) {
3435          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3436          Temp insert = get_alu_src(ctx, instr->src[1]);
3437          Temp base = get_alu_src(ctx, instr->src[2]);
3438          aco_ptr<Instruction> sop2;
3439          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3440          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3441          Operand lhs;
3442          if (const_insert && const_bitmask) {
3443             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3444          } else {
3445             insert =
3446                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3447             lhs = Operand(insert);
3448          }
3449
3450          Operand rhs;
3451          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3452          if (const_base && const_bitmask) {
3453             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3454          } else {
3455             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3456             rhs = Operand(base);
3457          }
3458
3459          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3460
3461       } else if (dst.regClass() == v1) {
3462          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3463       } else {
3464          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3465       }
3466       break;
3467    }
3468    case nir_op_ubfe:
3469    case nir_op_ibfe: {
3470       if (dst.bytes() != 4)
3471          unreachable("Unsupported BFE bit size");
3472
3473       if (dst.type() == RegType::sgpr) {
3474          Temp base = get_alu_src(ctx, instr->src[0]);
3475
3476          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3477          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3478          aco_opcode opcode =
3479             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3480          if (const_offset && const_bits) {
3481             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3482             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3483             break;
3484          }
3485
3486          Temp offset = get_alu_src(ctx, instr->src[1]);
3487          Temp bits = get_alu_src(ctx, instr->src[2]);
3488
3489          if (ctx->program->gfx_level >= GFX9) {
3490             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3491                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3492                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3493             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3494             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3495          } else if (instr->op == nir_op_ubfe) {
3496             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3497             Temp masked =
3498                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3499             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3500          } else {
3501             Operand bits_op = const_bits
3502                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3503                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3504                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3505                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3506                                             Operand::c32(16u));
3507             Operand offset_op = const_offset
3508                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3509                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3510                                               offset, Operand::c32(0x1fu));
3511
3512             Temp extract =
3513                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3514             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3515          }
3516
3517       } else {
3518          aco_opcode opcode =
3519             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3520          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3521       }
3522       break;
3523    }
3524    case nir_op_extract_u8:
3525    case nir_op_extract_i8:
3526    case nir_op_extract_u16:
3527    case nir_op_extract_i16: {
3528       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3529       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3530       uint32_t bits = comp == 4 ? 8 : 16;
3531       unsigned index = nir_src_as_uint(instr->src[1].src);
3532       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3533          assert(index == 0);
3534          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3535       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3536          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3537          unsigned swizzle = instr->src[0].swizzle[0];
3538          if (vec.size() > 1) {
3539             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3540             swizzle = swizzle & 1;
3541          }
3542          index += swizzle * instr->def.bit_size / bits;
3543          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3544                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3545       } else {
3546          Temp src = get_alu_src(ctx, instr->src[0]);
3547          Definition def(dst);
3548          if (dst.bytes() == 8) {
3549             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3550             index %= comp;
3551             def = bld.def(src.type(), 1);
3552          }
3553          assert(def.bytes() <= 4);
3554          if (def.regClass() == s1) {
3555             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3556                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3557          } else {
3558             src = emit_extract_vector(ctx, src, 0, def.regClass());
3559             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3560                        Operand::c32(bits), Operand::c32(is_signed));
3561          }
3562          if (dst.size() == 2)
3563             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3564                        Operand::zero());
3565       }
3566       break;
3567    }
3568    case nir_op_insert_u8:
3569    case nir_op_insert_u16: {
3570       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3571       uint32_t bits = comp == 4 ? 8 : 16;
3572       unsigned index = nir_src_as_uint(instr->src[1].src);
3573       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3574          assert(index == 0);
3575          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3576       } else {
3577          Temp src = get_alu_src(ctx, instr->src[0]);
3578          Definition def(dst);
3579          bool swap = false;
3580          if (dst.bytes() == 8) {
3581             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3582             swap = index >= comp;
3583             index %= comp;
3584             def = bld.def(src.type(), 1);
3585          }
3586          if (def.regClass() == s1) {
3587             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3588                        Operand::c32(index), Operand::c32(bits));
3589          } else {
3590             src = emit_extract_vector(ctx, src, 0, def.regClass());
3591             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3592                        Operand::c32(bits));
3593          }
3594          if (dst.size() == 2 && swap)
3595             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3596                        def.getTemp());
3597          else if (dst.size() == 2)
3598             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3599                        Operand::zero());
3600       }
3601       break;
3602    }
3603    case nir_op_bit_count: {
3604       Temp src = get_alu_src(ctx, instr->src[0]);
3605       if (src.regClass() == s1) {
3606          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3607       } else if (src.regClass() == v1) {
3608          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3609       } else if (src.regClass() == v2) {
3610          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3611                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3612                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3613       } else if (src.regClass() == s2) {
3614          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3615       } else {
3616          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3617       }
3618       break;
3619    }
3620    case nir_op_flt: {
3621       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3622                       aco_opcode::v_cmp_lt_f64);
3623       break;
3624    }
3625    case nir_op_fge: {
3626       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3627                       aco_opcode::v_cmp_ge_f64);
3628       break;
3629    }
3630    case nir_op_feq: {
3631       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3632                       aco_opcode::v_cmp_eq_f64);
3633       break;
3634    }
3635    case nir_op_fneu: {
3636       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3637                       aco_opcode::v_cmp_neq_f64);
3638       break;
3639    }
3640    case nir_op_ilt: {
3641       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3642                       aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3643       break;
3644    }
3645    case nir_op_ige: {
3646       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3647                       aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3648       break;
3649    }
3650    case nir_op_ieq: {
3651       if (instr->src[0].src.ssa->bit_size == 1)
3652          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3653       else
3654          emit_comparison(
3655             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3656             aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3657             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3658       break;
3659    }
3660    case nir_op_ine: {
3661       if (instr->src[0].src.ssa->bit_size == 1)
3662          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3663       else
3664          emit_comparison(
3665             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3666             aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3667             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3668       break;
3669    }
3670    case nir_op_ult: {
3671       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3672                       aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3673       break;
3674    }
3675    case nir_op_uge: {
3676       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3677                       aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3678       break;
3679    }
3680    case nir_op_bitz:
3681    case nir_op_bitnz: {
3682       assert(instr->src[0].src.ssa->bit_size != 1);
3683       bool test0 = instr->op == nir_op_bitz;
3684       Temp src0 = get_alu_src(ctx, instr->src[0]);
3685       Temp src1 = get_alu_src(ctx, instr->src[1]);
3686       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3687       if (!use_valu) {
3688          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3689                                                                : aco_opcode::s_bitcmp1_b32;
3690          if (test0)
3691             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3692                                                        : aco_opcode::s_bitcmp0_b32;
3693          emit_sopc_instruction(ctx, instr, op, dst);
3694          break;
3695       }
3696
3697       /* We do not have a VALU version of s_bitcmp.
3698        * But if the second source is constant, we can use
3699        * v_cmp_class_f32's LUT to check the bit.
3700        * The LUT only has 10 entries, so extract a higher byte if we have to.
3701        * For sign bits comparison with 0 is better because v_cmp_class
3702        * can't be inverted.
3703        */
3704       if (nir_src_is_const(instr->src[1].src)) {
3705          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3706          bit &= instr->src[0].src.ssa->bit_size - 1;
3707          src0 = as_vgpr(ctx, src0);
3708
3709          if (src0.regClass() == v2) {
3710             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3711             bit &= 31;
3712          }
3713
3714          if (bit == 31) {
3715             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3716                      Operand::c32(0), src0);
3717             break;
3718          }
3719
3720          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3721             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3722                      Operand::c32(0), src0);
3723             break;
3724          }
3725
3726          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
3727          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
3728          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
3729          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
3730          if (use_opsel) {
3731             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
3732                               Operand::c32(16), Operand::c32(0));
3733             bit &= 0xf;
3734          }
3735
3736          /* If we can use sdwa the extract is free, while test0's s_not is not. */
3737          if (bit == 7 && test0 && can_sdwa) {
3738             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3739                               Operand::c32(8), Operand::c32(1));
3740             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3741                      Operand::c32(0), src0);
3742             break;
3743          }
3744
3745          if (bit > max_bit) {
3746             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3747                               Operand::c32(8), Operand::c32(0));
3748             bit &= 0x7;
3749          }
3750
3751          /* denorm and snan/qnan inputs are preserved using all float control modes. */
3752          static const struct {
3753             uint32_t fp32;
3754             uint32_t fp16;
3755             bool negate;
3756          } float_lut[10] = {
3757             {0x7f800001, 0x7c01, false}, /* snan */
3758             {~0u, ~0u, false},           /* qnan */
3759             {0xff800000, 0xfc00, false}, /* -inf */
3760             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
3761             {1, 1, true},                /* -denormal */
3762             {0, 0, true},                /* -0.0 */
3763             {0, 0, false},               /* +0.0 */
3764             {1, 1, false},               /* +denormal */
3765             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
3766             {0x7f800000, 0x7c00, false}, /* +inf */
3767          };
3768
3769          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
3770          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
3771          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
3772                                (ctx->program->gfx_level >= GFX11 && use_opsel);
3773          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
3774          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
3775
3776          VALU_instruction& res =
3777             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
3778          if (float_lut[bit].negate) {
3779             res.format = asVOP3(res.format);
3780             res.neg[0] = true;
3781          }
3782
3783          if (test0)
3784             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
3785
3786          break;
3787       }
3788
3789       Temp res;
3790       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
3791       if (instr->src[0].src.ssa->bit_size == 16) {
3792          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
3793          if (ctx->program->gfx_level < GFX10)
3794             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
3795          else
3796             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
3797
3798          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
3799       } else if (instr->src[0].src.ssa->bit_size == 32) {
3800          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
3801       } else if (instr->src[0].src.ssa->bit_size == 64) {
3802          if (ctx->program->gfx_level < GFX8)
3803             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
3804          else
3805             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
3806
3807          res = emit_extract_vector(ctx, res, 0, v1);
3808          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
3809       } else {
3810          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3811       }
3812       bld.vopc(op, Definition(dst), Operand::c32(0), res);
3813       break;
3814    }
3815    case nir_op_fddx:
3816    case nir_op_fddy:
3817    case nir_op_fddx_fine:
3818    case nir_op_fddy_fine:
3819    case nir_op_fddx_coarse:
3820    case nir_op_fddy_coarse: {
3821       if (!nir_src_is_divergent(instr->src[0].src)) {
3822          /* Source is the same in all lanes, so the derivative is zero.
3823           * This also avoids emitting invalid IR.
3824           */
3825          bld.copy(Definition(dst), Operand::zero());
3826          break;
3827       }
3828
3829       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3830       uint16_t dpp_ctrl1, dpp_ctrl2;
3831       if (instr->op == nir_op_fddx_fine) {
3832          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3833          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3834       } else if (instr->op == nir_op_fddy_fine) {
3835          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3836          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3837       } else {
3838          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3839          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3840             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3841          else
3842             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3843       }
3844
3845       Temp tmp;
3846       if (ctx->program->gfx_level >= GFX8) {
3847          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3848          bld.vop2_dpp(aco_opcode::v_sub_f32, Definition(dst), src, tl, dpp_ctrl2);
3849       } else {
3850          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3851          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3852          bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
3853       }
3854       set_wqm(ctx, true);
3855       break;
3856    }
3857    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3858    }
3859 }
3860
3861 void
3862 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3863 {
3864    Temp dst = get_ssa_temp(ctx, &instr->def);
3865
3866    // TODO: we really want to have the resulting type as this would allow for 64bit literals
3867    // which get truncated the lsb if double and msb if int
3868    // for now, we only use s_mov_b64 with 64bit inline constants
3869    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3870    assert(dst.type() == RegType::sgpr);
3871
3872    Builder bld(ctx->program, ctx->block);
3873
3874    if (instr->def.bit_size == 1) {
3875       assert(dst.regClass() == bld.lm);
3876       int val = instr->value[0].b ? -1 : 0;
3877       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3878       bld.copy(Definition(dst), op);
3879    } else if (instr->def.bit_size == 8) {
3880       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3881    } else if (instr->def.bit_size == 16) {
3882       /* sign-extend to use s_movk_i32 instead of a literal */
3883       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3884    } else if (dst.size() == 1) {
3885       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3886    } else {
3887       assert(dst.size() != 1);
3888       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3889          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3890       if (instr->def.bit_size == 64)
3891          for (unsigned i = 0; i < dst.size(); i++)
3892             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3893       else {
3894          for (unsigned i = 0; i < dst.size(); i++)
3895             vec->operands[i] = Operand::c32(instr->value[i].u32);
3896       }
3897       vec->definitions[0] = Definition(dst);
3898       ctx->block->instructions.emplace_back(std::move(vec));
3899    }
3900 }
3901
3902 bool
3903 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
3904                                    unsigned align_, bool support_12_byte)
3905 {
3906    /* Only use byte-align for 8/16-bit loads if we won't have to increase it's size and won't have
3907     * to use unsupported load sizes.
3908     */
3909    assert(util_is_power_of_two_nonzero(align_));
3910    if (align_ < 4) {
3911       assert(component_size < 4);
3912       unsigned load_size = num_components * component_size;
3913       uint32_t new_size = align(load_size + (4 - align_), 4);
3914       return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
3915    }
3916    return true;
3917 }
3918
/* Describes one load request that emit_load() splits into individual loads and forwards to
 * the EmitLoadParameters callback. */
3919 struct LoadEmitInfo {
3920    Operand offset; /* dynamic offset; emit_load() handles constant, undefined or temp operands */
3921    Temp dst; /* destination; also passed to the callback as a hint it may write directly */
3922    unsigned num_components; /* num_components * component_size is the total load size */
3923    unsigned component_size; /* bytes per component */
3924    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
3925    Temp idx = Temp(0, v1);      /* buffer index */
3926    unsigned component_stride = 0; /* if non-zero, loads may be limited to one component each */
3927    unsigned const_offset = 0; /* constant offset; emit_load() may fold part of it into offset */
3928    unsigned align_mul = 0; /* 0 makes emit_load() fall back to component_size */
3929    unsigned align_offset = 0; /* emit_load() uses this modulo align_mul */
3930    pipe_format format; /* NOTE(review): no default and unused in the visible emit_load() code */
3931
3932    bool glc = false; /* NOTE(review): presumably a cache-policy bit for the callback - confirm */
3933    bool slc = false; /* NOTE(review): presumably a cache-policy bit for the callback - confirm */
3934    bool split_by_component_stride = true; /* enables the two per-load size limits in emit_load() */
3935    unsigned swizzle_component_size = 0; /* if non-zero, upper bound on the bytes per load */
3936    memory_sync_info sync;
3937    Temp soffset = Temp(0, s1); /* NOTE(review): presumably an extra scalar offset - confirm */
3938 };
3939
/* Describes how emit_load() emits loads for one kind of memory access. */
3940 struct EmitLoadParameters {
        /* Emits a single load and returns the loaded value. emit_load() passes the (possibly
         * adjusted) offset as a Temp, the number of bytes it wants, the alignment it computed,
         * the remaining constant offset and a destination hint: info.dst, or an empty Temp when
         * byte-aligning is still needed. Returning info.dst tells emit_load() that the callback
         * wrote directly to the destination.
         */
3941    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3942                              unsigned bytes_needed, unsigned align, unsigned const_offset,
3943                              Temp dst_hint);
3944
3945    Callback callback;
3946    bool byte_align_loads; /* if set, emit_load() accounts for offsets that are not dword aligned */
3947    bool supports_8bit_16bit_loads; /* if set, small aligned loads (<= 2 bytes) skip byte-aligning */
3948    unsigned max_const_offset_plus_one; /* const offsets >= this are partly moved into the offset */
3949 };
3950
3951 void
3952 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3953           const EmitLoadParameters& params)
3954 {
3955    unsigned load_size = info.num_components * info.component_size;
3956    unsigned component_size = info.component_size;
3957
3958    unsigned num_vals = 0;
3959    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3960
3961    unsigned const_offset = info.const_offset;
3962
3963    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3964    unsigned align_offset = info.align_offset % align_mul;
3965
3966    unsigned bytes_read = 0;
3967    while (bytes_read < load_size) {
3968       unsigned bytes_needed = load_size - bytes_read;
3969
3970       /* add buffer for unaligned loads */
3971       int byte_align = 0;
3972       if (params.byte_align_loads) {
3973          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3974       }
3975
3976       if (byte_align) {
3977          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3978              !params.supports_8bit_16bit_loads) {
3979             if (info.component_stride) {
3980                assert(params.supports_8bit_16bit_loads && "unimplemented");
3981                bytes_needed = 2;
3982                byte_align = 0;
3983             } else {
3984                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3985                bytes_needed = align(bytes_needed, 4);
3986             }
3987          } else {
3988             byte_align = 0;
3989          }
3990       }
3991
3992       if (info.split_by_component_stride) {
3993          if (info.swizzle_component_size)
3994             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3995          if (info.component_stride)
3996             bytes_needed = MIN2(bytes_needed, info.component_size);
3997       }
3998
3999       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
4000
4001       /* reduce constant offset */
4002       Operand offset = info.offset;
4003       unsigned reduced_const_offset = const_offset;
4004       bool remove_const_offset_completely = need_to_align_offset;
4005       if (const_offset &&
4006           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4007          unsigned to_add = const_offset;
4008          if (remove_const_offset_completely) {
4009             reduced_const_offset = 0;
4010          } else {
4011             to_add =
4012                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4013             reduced_const_offset %= params.max_const_offset_plus_one;
4014          }
4015          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4016          if (offset.isConstant()) {
4017             offset = Operand::c32(offset.constantValue() + to_add);
4018          } else if (offset.isUndefined()) {
4019             offset = Operand::c32(to_add);
4020          } else if (offset_tmp.regClass() == s1) {
4021             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4022                               Operand::c32(to_add));
4023          } else if (offset_tmp.regClass() == v1) {
4024             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4025          } else {
4026             Temp lo = bld.tmp(offset_tmp.type(), 1);
4027             Temp hi = bld.tmp(offset_tmp.type(), 1);
4028             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4029
4030             if (offset_tmp.regClass() == s2) {
4031                Temp carry = bld.tmp(s1);
4032                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4033                              Operand::c32(to_add));
4034                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4035                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4036             } else {
4037                Temp new_lo = bld.tmp(v1);
4038                Temp carry =
4039                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4040                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4041                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4042             }
4043          }
4044       }
4045
4046       /* align offset down if needed */
4047       Operand aligned_offset = offset;
4048       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4049       if (need_to_align_offset) {
4050          align = 4;
4051          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4052          if (offset.isConstant()) {
4053             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4054          } else if (offset.isUndefined()) {
4055             aligned_offset = Operand::zero();
4056          } else if (offset_tmp.regClass() == s1) {
4057             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4058                                       Operand::c32(0xfffffffcu), offset_tmp);
4059          } else if (offset_tmp.regClass() == s2) {
4060             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4061                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4062          } else if (offset_tmp.regClass() == v1) {
4063             aligned_offset =
4064                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4065          } else if (offset_tmp.regClass() == v2) {
4066             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4067             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4068             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4069             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4070          }
4071       }
4072       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4073                                 : aligned_offset.isConstant()
4074                                    ? bld.copy(bld.def(s1), aligned_offset)
4075                                    : Temp(0, s1);
4076
4077       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4078                                  reduced_const_offset, byte_align ? Temp() : info.dst);
4079
4080       /* the callback wrote directly to dst */
4081       if (val == info.dst) {
4082          assert(num_vals == 0);
4083          emit_split_vector(ctx, info.dst, info.num_components);
4084          return;
4085       }
4086
4087       /* shift result right if needed */
4088       if (params.byte_align_loads && info.component_size < 4) {
4089          Operand byte_align_off = Operand::c32(byte_align);
4090          if (byte_align == -1) {
4091             if (offset.isConstant())
4092                byte_align_off = Operand::c32(offset.constantValue() % 4u);
4093             else if (offset.isUndefined())
4094                byte_align_off = Operand::zero();
4095             else if (offset.size() == 2)
4096                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4097                                                             RegClass(offset.getTemp().type(), 1)));
4098             else
4099                byte_align_off = offset;
4100          }
4101
4102          assert(val.bytes() >= load_size && "unimplemented");
4103          if (val.type() == RegType::sgpr)
4104             byte_align_scalar(ctx, val, byte_align_off, info.dst);
4105          else
4106             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4107          return;
4108       }
4109
4110       /* add result to list and advance */
4111       if (info.component_stride) {
4112          assert(val.bytes() % info.component_size == 0);
4113          unsigned num_loaded_components = val.bytes() / info.component_size;
4114          unsigned advance_bytes = info.component_stride * num_loaded_components;
4115          const_offset += advance_bytes;
4116          align_offset = (align_offset + advance_bytes) % align_mul;
4117       } else {
4118          const_offset += val.bytes();
4119          align_offset = (align_offset + val.bytes()) % align_mul;
4120       }
4121       bytes_read += val.bytes();
4122       vals[num_vals++] = val;
4123    }
4124
4125    /* create array of components */
4126    unsigned components_split = 0;
4127    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4128    bool has_vgprs = false;
4129    for (unsigned i = 0; i < num_vals;) {
4130       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4131       unsigned num_tmps = 0;
4132       unsigned tmp_size = 0;
4133       RegType reg_type = RegType::sgpr;
4134       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4135          if (vals[i].type() == RegType::vgpr)
4136             reg_type = RegType::vgpr;
4137          tmp_size += vals[i].bytes();
4138          tmp[num_tmps++] = vals[i++];
4139       }
4140       if (num_tmps > 1) {
4141          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4142             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4143          for (unsigned j = 0; j < num_tmps; j++)
4144             vec->operands[j] = Operand(tmp[j]);
4145          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4146          vec->definitions[0] = Definition(tmp[0]);
4147          bld.insert(std::move(vec));
4148       }
4149
4150       if (tmp[0].bytes() % component_size) {
4151          /* trim tmp[0] */
4152          assert(i == num_vals);
4153          RegClass new_rc =
4154             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4155          tmp[0] =
4156             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4157       }
4158
4159       RegClass elem_rc = RegClass::get(reg_type, component_size);
4160
4161       unsigned start = components_split;
4162
4163       if (tmp_size == elem_rc.bytes()) {
4164          allocated_vec[components_split++] = tmp[0];
4165       } else {
4166          assert(tmp_size % elem_rc.bytes() == 0);
4167          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
4168             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
4169          for (auto& def : split->definitions) {
4170             Temp component = bld.tmp(elem_rc);
4171             allocated_vec[components_split++] = component;
4172             def = Definition(component);
4173          }
4174          split->operands[0] = Operand(tmp[0]);
4175          bld.insert(std::move(split));
4176       }
4177
4178       /* try to p_as_uniform early so we can create more optimizable code and
4179        * also update allocated_vec */
4180       for (unsigned j = start; j < components_split; j++) {
4181          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
4182             allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4183          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4184       }
4185    }
4186
4187    /* concatenate components and p_as_uniform() result if needed */
4188    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4189       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4190
4191    int padding_bytes =
4192       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4193
4194    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4195       aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
4196    for (unsigned i = 0; i < info.num_components; i++)
4197       vec->operands[i] = Operand(allocated_vec[i]);
4198    if (padding_bytes)
4199       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4200    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4201       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4202       vec->definitions[0] = Definition(tmp);
4203       bld.insert(std::move(vec));
4204       bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4205    } else {
4206       vec->definitions[0] = Definition(info.dst);
4207       bld.insert(std::move(vec));
4208    }
4209 }
4210
4211 Operand
4212 load_lds_size_m0(Builder& bld)
4213 {
4214    /* m0 does not need to be initialized on GFX9+ */
4215    if (bld.program->gfx_level >= GFX9)
4216       return Operand(s1);
4217
4218    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4219 }
4220
4221 Temp
4222 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4223                   unsigned align, unsigned const_offset, Temp dst_hint)
4224 {
4225    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4226
4227    Operand m = load_lds_size_m0(bld);
4228
4229    bool large_ds_read = bld.program->gfx_level >= GFX7;
4230    bool usable_read2 = bld.program->gfx_level >= GFX7;
4231
4232    bool read2 = false;
4233    unsigned size = 0;
4234    aco_opcode op;
4235    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4236       size = 16;
4237       op = aco_opcode::ds_read_b128;
4238    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4239       size = 16;
4240       read2 = true;
4241       op = aco_opcode::ds_read2_b64;
4242    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4243       size = 12;
4244       op = aco_opcode::ds_read_b96;
4245    } else if (bytes_needed >= 8 && align % 8 == 0) {
4246       size = 8;
4247       op = aco_opcode::ds_read_b64;
4248    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4249       size = 8;
4250       read2 = true;
4251       op = aco_opcode::ds_read2_b32;
4252    } else if (bytes_needed >= 4 && align % 4 == 0) {
4253       size = 4;
4254       op = aco_opcode::ds_read_b32;
4255    } else if (bytes_needed >= 2 && align % 2 == 0) {
4256       size = 2;
4257       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4258    } else {
4259       size = 1;
4260       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4261    }
4262
4263    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4264    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4265
4266    if (const_offset > (const_offset_range - const_offset_unit)) {
4267       unsigned excess = const_offset - (const_offset % const_offset_range);
4268       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4269       const_offset -= excess;
4270    }
4271
4272    const_offset /= const_offset_unit;
4273
4274    RegClass rc = RegClass::get(RegType::vgpr, size);
4275    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4276    Instruction* instr;
4277    if (read2)
4278       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4279    else
4280       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4281    instr->ds().sync = info.sync;
4282
4283    if (m.isUndefined())
4284       instr->operands.pop_back();
4285
4286    return val;
4287 }
4288
4289 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4290
4291 Temp
4292 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4293                    unsigned align, unsigned const_offset, Temp dst_hint)
4294 {
4295    assert(align >= 4u);
4296
4297    bld.program->has_smem_buffer_or_global_loads = true;
4298
4299    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4300    Temp addr = info.resource;
4301    if (!buffer && !addr.id()) {
4302       addr = offset;
4303       offset = Temp();
4304    }
4305
4306    bytes_needed = MIN2(bytes_needed, 64);
4307    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4308    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4309    /* Only round-up global loads if it's aligned so that it won't cross pages */
4310    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4311
4312    aco_opcode op;
4313    if (bytes_needed <= 4) {
4314       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4315    } else if (bytes_needed <= 8) {
4316       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4317    } else if (bytes_needed <= 16) {
4318       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4319    } else if (bytes_needed <= 32) {
4320       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4321    } else {
4322       assert(bytes_needed == 64);
4323       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4324    }
4325
4326    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4327    if (buffer) {
4328       if (const_offset)
4329          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4330                            Operand::c32(const_offset));
4331       load->operands[0] = Operand(info.resource);
4332       load->operands[1] = Operand(offset);
4333    } else {
4334       load->operands[0] = Operand(addr);
4335       if (offset.id() && const_offset)
4336          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4337                                       Operand::c32(const_offset));
4338       else if (offset.id())
4339          load->operands[1] = Operand(offset);
4340       else
4341          load->operands[1] = Operand::c32(const_offset);
4342    }
4343    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4344    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4345    load->definitions[0] = Definition(val);
4346    load->glc = info.glc;
4347    load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4348    load->sync = info.sync;
4349    bld.insert(std::move(load));
4350    return val;
4351 }
4352
4353 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4354
4355 Temp
4356 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4357                     unsigned align_, unsigned const_offset, Temp dst_hint)
4358 {
4359    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4360    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4361
4362    if (info.soffset.id()) {
4363       if (soffset.isTemp())
4364          vaddr = bld.copy(bld.def(v1), soffset);
4365       soffset = Operand(info.soffset);
4366    }
4367
4368    if (soffset.isUndefined())
4369       soffset = Operand::zero();
4370
4371    bool offen = !vaddr.isUndefined();
4372    bool idxen = info.idx.id();
4373
4374    if (offen && idxen)
4375       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4376    else if (idxen)
4377       vaddr = Operand(info.idx);
4378
4379    unsigned bytes_size = 0;
4380    aco_opcode op;
4381    if (bytes_needed == 1 || align_ % 2) {
4382       bytes_size = 1;
4383       op = aco_opcode::buffer_load_ubyte;
4384    } else if (bytes_needed == 2 || align_ % 4) {
4385       bytes_size = 2;
4386       op = aco_opcode::buffer_load_ushort;
4387    } else if (bytes_needed <= 4) {
4388       bytes_size = 4;
4389       op = aco_opcode::buffer_load_dword;
4390    } else if (bytes_needed <= 8) {
4391       bytes_size = 8;
4392       op = aco_opcode::buffer_load_dwordx2;
4393    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4394       bytes_size = 12;
4395       op = aco_opcode::buffer_load_dwordx3;
4396    } else {
4397       bytes_size = 16;
4398       op = aco_opcode::buffer_load_dwordx4;
4399    }
4400    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4401    mubuf->operands[0] = Operand(info.resource);
4402    mubuf->operands[1] = vaddr;
4403    mubuf->operands[2] = soffset;
4404    mubuf->offen = offen;
4405    mubuf->idxen = idxen;
4406    mubuf->glc = info.glc;
4407    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4408    mubuf->slc = info.slc;
4409    mubuf->sync = info.sync;
4410    mubuf->offset = const_offset;
4411    mubuf->swizzled = info.swizzle_component_size != 0;
4412    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4413    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4414    mubuf->definitions[0] = Definition(val);
4415    bld.insert(std::move(mubuf));
4416
4417    return val;
4418 }
4419
4420 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4421
4422 Temp
4423 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4424                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4425                            Temp dst_hint)
4426 {
4427    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4428    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4429
4430    if (info.soffset.id()) {
4431       if (soffset.isTemp())
4432          vaddr = bld.copy(bld.def(v1), soffset);
4433       soffset = Operand(info.soffset);
4434    }
4435
4436    if (soffset.isUndefined())
4437       soffset = Operand::zero();
4438
4439    bool offen = !vaddr.isUndefined();
4440    bool idxen = info.idx.id();
4441
4442    if (offen && idxen)
4443       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4444    else if (idxen)
4445       vaddr = Operand(info.idx);
4446
4447    aco_opcode op = aco_opcode::num_opcodes;
4448    if (info.component_size == 2) {
4449       switch (bytes_needed) {
4450       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4451       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4452       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4453       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4454       default: unreachable("invalid buffer load format size"); break;
4455       }
4456    } else {
4457       assert(info.component_size == 4);
4458       switch (bytes_needed) {
4459       case 4: op = aco_opcode::buffer_load_format_x; break;
4460       case 8: op = aco_opcode::buffer_load_format_xy; break;
4461       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4462       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4463       default: unreachable("invalid buffer load format size"); break;
4464       }
4465    }
4466
4467    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4468    mubuf->operands[0] = Operand(info.resource);
4469    mubuf->operands[1] = vaddr;
4470    mubuf->operands[2] = soffset;
4471    mubuf->offen = offen;
4472    mubuf->idxen = idxen;
4473    mubuf->glc = info.glc;
4474    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4475    mubuf->slc = info.slc;
4476    mubuf->sync = info.sync;
4477    mubuf->offset = const_offset;
4478    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4479    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4480    mubuf->definitions[0] = Definition(val);
4481    bld.insert(std::move(mubuf));
4482
4483    return val;
4484 }
4485
4486 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4487
4488 Temp
4489 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4490                       unsigned align_, unsigned const_offset, Temp dst_hint)
4491 {
4492    unsigned bytes_size = 0;
4493    aco_opcode op;
4494    if (bytes_needed == 1 || align_ % 2u) {
4495       bytes_size = 1;
4496       op = aco_opcode::scratch_load_ubyte;
4497    } else if (bytes_needed == 2 || align_ % 4u) {
4498       bytes_size = 2;
4499       op = aco_opcode::scratch_load_ushort;
4500    } else if (bytes_needed <= 4) {
4501       bytes_size = 4;
4502       op = aco_opcode::scratch_load_dword;
4503    } else if (bytes_needed <= 8) {
4504       bytes_size = 8;
4505       op = aco_opcode::scratch_load_dwordx2;
4506    } else if (bytes_needed <= 12) {
4507       bytes_size = 12;
4508       op = aco_opcode::scratch_load_dwordx3;
4509    } else {
4510       bytes_size = 16;
4511       op = aco_opcode::scratch_load_dwordx4;
4512    }
4513    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4514    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4515    aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
4516    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4517    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4518    flat->sync = info.sync;
4519    flat->offset = const_offset;
4520    flat->definitions[0] = Definition(val);
4521    bld.insert(std::move(flat));
4522
4523    return val;
4524 }
4525
4526 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4527 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4528
4529 Temp
4530 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4531 {
4532    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4533                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4534
4535    if (addr.type() == RegType::vgpr)
4536       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4537                         Operand::c32(-1u), Operand::c32(rsrc_conf));
4538    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4539                      Operand::c32(rsrc_conf));
4540 }
4541
4542 Temp
4543 add64_32(Builder& bld, Temp src0, Temp src1)
4544 {
4545    Temp src00 = bld.tmp(src0.type(), 1);
4546    Temp src01 = bld.tmp(src0.type(), 1);
4547    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4548
4549    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4550       Temp dst0 = bld.tmp(v1);
4551       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4552       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4553       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4554    } else {
4555       Temp carry = bld.tmp(s1);
4556       Temp dst0 =
4557          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4558       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4559       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4560    }
4561 }
4562
4563 void
4564 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4565                      uint32_t* const_offset_inout, Temp* offset_inout)
4566 {
4567    Temp address = *address_inout;
4568    uint64_t const_offset = *const_offset_inout + offset_in;
4569    Temp offset = *offset_inout;
4570
4571    uint64_t max_const_offset_plus_one =
4572       1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4573    if (bld.program->gfx_level >= GFX9)
4574       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4575    else if (bld.program->gfx_level == GFX6)
4576       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4577    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4578    const_offset %= max_const_offset_plus_one;
4579
4580    if (!offset.id()) {
4581       while (unlikely(excess_offset > UINT32_MAX)) {
4582          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4583          excess_offset -= UINT32_MAX;
4584       }
4585       if (excess_offset)
4586          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4587    } else {
4588       /* If we add to "offset", we would transform the indended
4589        * "address + u2u64(offset) + u2u64(const_offset)" into
4590        * "address + u2u64(offset + const_offset)", so add to the address.
4591        * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4592        * but that should be really rare.
4593        */
4594       while (excess_offset) {
4595          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4596          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4597          excess_offset -= src2;
4598       }
4599    }
4600
4601    if (bld.program->gfx_level == GFX6) {
4602       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4603       if (offset.type() != RegType::sgpr) {
4604          address = add64_32(bld, address, offset);
4605          offset = Temp();
4606       }
4607       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4608    } else if (bld.program->gfx_level <= GFX8) {
4609       /* GFX7,8 (FLAT): VGPR address */
4610       if (offset.id()) {
4611          address = add64_32(bld, address, offset);
4612          offset = Temp();
4613       }
4614       address = as_vgpr(bld, address);
4615    } else {
4616       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4617       if (address.type() == RegType::vgpr && offset.id()) {
4618          address = add64_32(bld, address, offset);
4619          offset = Temp();
4620       } else if (address.type() == RegType::sgpr && offset.id()) {
4621          offset = as_vgpr(bld, offset);
4622       }
4623       if (address.type() == RegType::sgpr && !offset.id())
4624          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4625    }
4626
4627    *address_inout = address;
4628    *const_offset_inout = const_offset;
4629    *offset_inout = offset;
4630 }
4631
4632 Temp
4633 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4634                      unsigned align_, unsigned const_offset, Temp dst_hint)
4635 {
4636    Temp addr = info.resource;
4637    if (!addr.id()) {
4638       addr = offset;
4639       offset = Temp();
4640    }
4641    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4642
4643    unsigned bytes_size = 0;
4644    bool use_mubuf = bld.program->gfx_level == GFX6;
4645    bool global = bld.program->gfx_level >= GFX9;
4646    aco_opcode op;
4647    if (bytes_needed == 1 || align_ % 2u) {
4648       bytes_size = 1;
4649       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4650            : global  ? aco_opcode::global_load_ubyte
4651                      : aco_opcode::flat_load_ubyte;
4652    } else if (bytes_needed == 2 || align_ % 4u) {
4653       bytes_size = 2;
4654       op = use_mubuf ? aco_opcode::buffer_load_ushort
4655            : global  ? aco_opcode::global_load_ushort
4656                      : aco_opcode::flat_load_ushort;
4657    } else if (bytes_needed <= 4) {
4658       bytes_size = 4;
4659       op = use_mubuf ? aco_opcode::buffer_load_dword
4660            : global  ? aco_opcode::global_load_dword
4661                      : aco_opcode::flat_load_dword;
4662    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4663       bytes_size = 8;
4664       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4665            : global  ? aco_opcode::global_load_dwordx2
4666                      : aco_opcode::flat_load_dwordx2;
4667    } else if (bytes_needed <= 12 && !use_mubuf) {
4668       bytes_size = 12;
4669       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4670    } else {
4671       bytes_size = 16;
4672       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4673            : global  ? aco_opcode::global_load_dwordx4
4674                      : aco_opcode::flat_load_dwordx4;
4675    }
4676    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4677    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4678    if (use_mubuf) {
4679       aco_ptr<MUBUF_instruction> mubuf{
4680          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4681       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4682       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4683       mubuf->operands[2] = Operand(offset);
4684       mubuf->glc = info.glc;
4685       mubuf->dlc = false;
4686       mubuf->offset = const_offset;
4687       mubuf->addr64 = addr.type() == RegType::vgpr;
4688       mubuf->disable_wqm = false;
4689       mubuf->sync = info.sync;
4690       mubuf->definitions[0] = Definition(val);
4691       bld.insert(std::move(mubuf));
4692    } else {
4693       aco_ptr<FLAT_instruction> flat{
4694          create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4695       if (addr.regClass() == s2) {
4696          assert(global && offset.id() && offset.type() == RegType::vgpr);
4697          flat->operands[0] = Operand(offset);
4698          flat->operands[1] = Operand(addr);
4699       } else {
4700          assert(addr.type() == RegType::vgpr && !offset.id());
4701          flat->operands[0] = Operand(addr);
4702          flat->operands[1] = Operand(s1);
4703       }
4704       flat->glc = info.glc;
4705       flat->dlc =
4706          info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4707       flat->sync = info.sync;
4708       assert(global || !const_offset);
4709       flat->offset = const_offset;
4710       flat->definitions[0] = Definition(val);
4711       bld.insert(std::move(flat));
4712    }
4713
4714    return val;
4715 }
4716
4717 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
4718
4719 Temp
4720 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4721          Temp address, unsigned base_offset, unsigned align)
4722 {
4723    assert(util_is_power_of_two_nonzero(align));
4724
4725    Builder bld(ctx->program, ctx->block);
4726
4727    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4728    info.align_mul = align;
4729    info.align_offset = 0;
4730    info.sync = memory_sync_info(storage_shared);
4731    info.const_offset = base_offset;
4732    emit_load(ctx, bld, info, lds_load_params);
4733
4734    return dst;
4735 }
4736
4737 void
4738 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4739                  Temp src)
4740 {
4741    if (!count)
4742       return;
4743
4744    Builder bld(ctx->program, ctx->block);
4745
4746    /* count == 1 fast path */
4747    if (count == 1) {
4748       if (dst_type == RegType::sgpr)
4749          dst[0] = bld.as_uniform(src);
4750       else
4751          dst[0] = as_vgpr(ctx, src);
4752       return;
4753    }
4754
4755    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4756    unsigned elem_size_bytes =
4757       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4758
4759    ASSERTED bool is_subdword = elem_size_bytes < 4;
4760    assert(!is_subdword || dst_type == RegType::vgpr);
4761
4762    for (unsigned i = 0; i < count; i++)
4763       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4764
4765    std::vector<Temp> temps;
4766    /* use allocated_vec if possible */
4767    auto it = ctx->allocated_vec.find(src.id());
4768    if (it != ctx->allocated_vec.end()) {
4769       if (!it->second[0].id())
4770          goto split;
4771       unsigned elem_size = it->second[0].bytes();
4772       assert(src.bytes() % elem_size == 0);
4773
4774       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4775          if (!it->second[i].id())
4776             goto split;
4777       }
4778       if (elem_size_bytes % elem_size)
4779          goto split;
4780
4781       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4782       elem_size_bytes = elem_size;
4783    }
4784
4785 split:
4786    /* split src if necessary */
4787    if (temps.empty()) {
4788       if (is_subdword && src.type() == RegType::sgpr)
4789          src = as_vgpr(ctx, src);
4790       if (dst_type == RegType::sgpr)
4791          src = bld.as_uniform(src);
4792
4793       unsigned num_elems = src.bytes() / elem_size_bytes;
4794       aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4795          aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4796       split->operands[0] = Operand(src);
4797       for (unsigned i = 0; i < num_elems; i++) {
4798          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4799          split->definitions[i] = Definition(temps.back());
4800       }
4801       bld.insert(std::move(split));
4802    }
4803
4804    unsigned idx = 0;
4805    for (unsigned i = 0; i < count; i++) {
4806       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4807       if (op_count == 1) {
4808          if (dst_type == RegType::sgpr)
4809             dst[i] = bld.as_uniform(temps[idx++]);
4810          else
4811             dst[i] = as_vgpr(ctx, temps[idx++]);
4812          continue;
4813       }
4814
4815       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4816                                                                       Format::PSEUDO, op_count, 1)};
4817       for (unsigned j = 0; j < op_count; j++) {
4818          Temp tmp = temps[idx++];
4819          if (dst_type == RegType::sgpr)
4820             tmp = bld.as_uniform(tmp);
4821          vec->operands[j] = Operand(tmp);
4822       }
4823       vec->definitions[0] = Definition(dst[i]);
4824       bld.insert(std::move(vec));
4825    }
4826    return;
4827 }
4828
4829 bool
4830 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4831 {
4832    unsigned start_elem = ffs(todo_mask) - 1;
4833    bool skip = !(mask & (1 << start_elem));
4834    if (skip)
4835       mask = ~mask & todo_mask;
4836
4837    mask &= todo_mask;
4838
4839    u_bit_scan_consecutive_range(&mask, start, count);
4840
4841    return !skip;
4842 }
4843
4844 void
4845 advance_write_mask(uint32_t* todo_mask, int start, int count)
4846 {
4847    *todo_mask &= ~u_bit_consecutive(0, count) << start;
4848 }
4849
4850 void
4851 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4852           unsigned base_offset, unsigned align)
4853 {
4854    assert(util_is_power_of_two_nonzero(align));
4855    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4856
4857    Builder bld(ctx->program, ctx->block);
4858    bool large_ds_write = ctx->options->gfx_level >= GFX7;
4859    bool usable_write2 = ctx->options->gfx_level >= GFX7;
4860
4861    unsigned write_count = 0;
4862    Temp write_datas[32];
4863    unsigned offsets[32];
4864    unsigned bytes[32];
4865    aco_opcode opcodes[32];
4866
4867    wrmask = util_widen_mask(wrmask, elem_size_bytes);
4868
4869    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
4870    uint32_t todo = u_bit_consecutive(0, data.bytes());
4871
4872    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
4873       todo = MIN2(todo, wrmask);
4874
4875    while (todo) {
4876       int offset, byte;
4877       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4878          offsets[write_count] = offset;
4879          bytes[write_count] = byte;
4880          opcodes[write_count] = aco_opcode::num_opcodes;
4881          write_count++;
4882          advance_write_mask(&todo, offset, byte);
4883          continue;
4884       }
4885
4886       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4887       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4888       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4889       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4890
4891       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4892       aco_opcode op = aco_opcode::num_opcodes;
4893       if (byte >= 16 && aligned16 && large_ds_write) {
4894          op = aco_opcode::ds_write_b128;
4895          byte = 16;
4896       } else if (byte >= 12 && aligned16 && large_ds_write) {
4897          op = aco_opcode::ds_write_b96;
4898          byte = 12;
4899       } else if (byte >= 8 && aligned8) {
4900          op = aco_opcode::ds_write_b64;
4901          byte = 8;
4902       } else if (byte >= 4 && aligned4) {
4903          op = aco_opcode::ds_write_b32;
4904          byte = 4;
4905       } else if (byte >= 2 && aligned2) {
4906          op = aco_opcode::ds_write_b16;
4907          byte = 2;
4908       } else if (byte >= 1) {
4909          op = aco_opcode::ds_write_b8;
4910          byte = 1;
4911       } else {
4912          assert(false);
4913       }
4914
4915       offsets[write_count] = offset;
4916       bytes[write_count] = byte;
4917       opcodes[write_count] = op;
4918       write_count++;
4919       advance_write_mask(&todo, offset, byte);
4920    }
4921
4922    Operand m = load_lds_size_m0(bld);
4923
4924    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4925
4926    for (unsigned i = 0; i < write_count; i++) {
4927       aco_opcode op = opcodes[i];
4928       if (op == aco_opcode::num_opcodes)
4929          continue;
4930
4931       Temp split_data = write_datas[i];
4932
4933       unsigned second = write_count;
4934       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4935          for (second = i + 1; second < write_count; second++) {
4936             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4937                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4938                opcodes[second] = aco_opcode::num_opcodes;
4939                break;
4940             }
4941          }
4942       }
4943
4944       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4945       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4946
4947       unsigned inline_offset = base_offset + offsets[i];
4948       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4949       Temp address_offset = address;
4950       if (inline_offset > max_offset) {
4951          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4952          inline_offset = offsets[i];
4953       }
4954
4955       /* offsets[i] shouldn't be large enough for this to happen */
4956       assert(inline_offset <= max_offset);
4957
4958       Instruction* instr;
4959       if (write2) {
4960          Temp second_data = write_datas[second];
4961          inline_offset /= split_data.bytes();
4962          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4963                         inline_offset + write2_off);
4964       } else {
4965          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4966       }
4967       instr->ds().sync = memory_sync_info(storage_shared);
4968
4969       if (m.isUndefined())
4970          instr->operands.pop_back();
4971    }
4972 }
4973
4974 aco_opcode
4975 get_buffer_store_op(unsigned bytes)
4976 {
4977    switch (bytes) {
4978    case 1: return aco_opcode::buffer_store_byte;
4979    case 2: return aco_opcode::buffer_store_short;
4980    case 4: return aco_opcode::buffer_store_dword;
4981    case 8: return aco_opcode::buffer_store_dwordx2;
4982    case 12: return aco_opcode::buffer_store_dwordx3;
4983    case 16: return aco_opcode::buffer_store_dwordx4;
4984    }
4985    unreachable("Unexpected store size");
4986    return aco_opcode::num_opcodes;
4987 }
4988
4989 void
4990 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4991                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4992                    Temp* write_datas, unsigned* offsets)
4993 {
4994    unsigned write_count_with_skips = 0;
4995    bool skips[16];
4996    unsigned bytes[16];
4997
4998    /* determine how to split the data */
4999    unsigned todo = u_bit_consecutive(0, data.bytes());
5000    while (todo) {
5001       int offset, byte;
5002       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5003       offsets[write_count_with_skips] = offset;
5004       if (skips[write_count_with_skips]) {
5005          bytes[write_count_with_skips] = byte;
5006          advance_write_mask(&todo, offset, byte);
5007          write_count_with_skips++;
5008          continue;
5009       }
5010
5011       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
5012        * larger than swizzle_element_size */
5013       byte = MIN2(byte, swizzle_element_size);
5014       if (byte % 4)
5015          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5016
5017       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5018       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5019          byte = 8;
5020
5021       /* dword or larger stores have to be dword-aligned */
5022       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5023       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5024       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5025       if (!dword_aligned)
5026          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5027
5028       bytes[write_count_with_skips] = byte;
5029       advance_write_mask(&todo, offset, byte);
5030       write_count_with_skips++;
5031    }
5032
5033    /* actually split data */
5034    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5035
5036    /* remove skips */
5037    for (unsigned i = 0; i < write_count_with_skips; i++) {
5038       if (skips[i])
5039          continue;
5040       write_datas[*write_count] = write_datas[i];
5041       offsets[*write_count] = offsets[i];
5042       (*write_count)++;
5043    }
5044 }
5045
5046 Temp
5047 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5048                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5049 {
5050    Builder bld(ctx->program, ctx->block);
5051    unsigned dword_size = elem_size_bytes / 4;
5052
5053    if (!dst.id())
5054       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5055
5056    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5057    aco_ptr<Pseudo_instruction> instr{
5058       create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5059    instr->definitions[0] = Definition(dst);
5060
5061    for (unsigned i = 0; i < cnt; ++i) {
5062       if (arr[i].id()) {
5063          assert(arr[i].size() == dword_size);
5064          allocated_vec[i] = arr[i];
5065          instr->operands[i] = Operand(arr[i]);
5066       } else {
5067          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5068                               Operand::zero(dword_size == 2 ? 8 : 4));
5069          allocated_vec[i] = zero;
5070          instr->operands[i] = Operand(zero);
5071       }
5072    }
5073
5074    bld.insert(std::move(instr));
5075
5076    if (split_cnt)
5077       emit_split_vector(ctx, dst, split_cnt);
5078    else
5079       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5080
5081    return dst;
5082 }
5083
5084 inline unsigned
5085 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5086 {
5087    if (const_offset >= 4096) {
5088       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5089       const_offset %= 4096u;
5090
5091       if (!voffset.id())
5092          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5093       else if (unlikely(voffset.regClass() == s1))
5094          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5095                             Operand::c32(excess_const_offset), Operand(voffset));
5096       else if (likely(voffset.regClass() == v1))
5097          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5098       else
5099          unreachable("Unsupported register class of voffset");
5100    }
5101
5102    return const_offset;
5103 }
5104
5105 void
5106 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5107                         Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc,
5108                         bool slc, bool swizzled)
5109 {
5110    assert(vdata.id());
5111    assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5112    assert(vdata.size() >= 1 && vdata.size() <= 4);
5113
5114    Builder bld(ctx->program, ctx->block);
5115    aco_opcode op = get_buffer_store_op(vdata.bytes());
5116    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
5117
5118    bool offen = voffset.id();
5119    bool idxen = idx.id();
5120
5121    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
5122    glc &= ctx->program->gfx_level < GFX11;
5123
5124    Operand vaddr_op(v1);
5125    if (offen && idxen)
5126       vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
5127    else if (offen)
5128       vaddr_op = Operand(voffset);
5129    else if (idxen)
5130       vaddr_op = Operand(idx);
5131
5132    Builder::Result r =
5133       bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen,
5134                 swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
5135                 /* dlc*/ false, slc);
5136
5137    r->mubuf().sync = sync;
5138 }
5139
5140 void
5141 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5142                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
5143                  bool swizzled, memory_sync_info sync, bool glc, bool slc)
5144 {
5145    Builder bld(ctx->program, ctx->block);
5146    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
5147           elem_size_bytes == 8);
5148    assert(write_mask);
5149    write_mask = util_widen_mask(write_mask, elem_size_bytes);
5150
5151    unsigned write_count = 0;
5152    Temp write_datas[32];
5153    unsigned offsets[32];
5154    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
5155                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
5156                       write_datas, offsets);
5157
5158    for (unsigned i = 0; i < write_count; i++) {
5159       unsigned const_offset = offsets[i] + base_const_offset;
5160       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset,
5161                               sync, glc, slc, swizzled);
5162    }
5163 }
5164
5165 Temp
5166 wave_id_in_threadgroup(isel_context* ctx)
5167 {
5168    Builder bld(ctx->program, ctx->block);
5169    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5170                    get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5171 }
5172
5173 Temp
5174 thread_id_in_threadgroup(isel_context* ctx)
5175 {
5176    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5177
5178    Builder bld(ctx->program, ctx->block);
5179    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5180
5181    if (ctx->program->workgroup_size <= ctx->program->wave_size)
5182       return tid_in_wave;
5183
5184    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5185    Temp num_pre_threads =
5186       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5187                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5188    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5189 }
5190
5191 bool
5192 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5193 {
5194    unsigned write_mask = nir_intrinsic_write_mask(instr);
5195    unsigned component = nir_intrinsic_component(instr);
5196    nir_src offset = *nir_get_io_offset_src(instr);
5197
5198    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5199       return false;
5200
5201    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5202
5203    if (instr->src[0].ssa->bit_size == 64)
5204       write_mask = util_widen_mask(write_mask, 2);
5205
5206    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5207
5208    /* Use semantic location as index. radv already uses it as intrinsic base
5209     * but radeonsi does not. We need to make LS output and TCS input index
5210     * match each other, so need to use semantic location explicitly. Also for
5211     * TCS epilog to index tess factor temps using semantic location directly.
5212     */
5213    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5214    unsigned base = sem.location + sem.dual_source_blend_index;
5215    unsigned idx = base * 4u + component;
5216
5217    for (unsigned i = 0; i < 8; ++i) {
5218       if (write_mask & (1 << i)) {
5219          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5220          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5221       }
5222       idx++;
5223    }
5224
5225    if (ctx->stage == fragment_fs && ctx->program->info.has_epilog) {
5226       unsigned index = base - FRAG_RESULT_DATA0;
5227
5228       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5229          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5230       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5231          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5232       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5233          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5234       }
5235    }
5236
5237    return true;
5238 }
5239
5240 bool
5241 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5242 {
5243    /* Only TCS per-vertex inputs are supported by this function.
5244     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
5245     * is the same.
5246     */
5247    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5248       return false;
5249
5250    nir_src* off_src = nir_get_io_offset_src(instr);
5251    nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5252    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5253    bool can_use_temps =
5254       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5255       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5256
5257    if (!can_use_temps)
5258       return false;
5259
5260    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5261
5262    unsigned idx =
5263       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5264    Temp* src = &ctx->inputs.temps[idx];
5265    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5266
5267    return true;
5268 }
5269
5270 void
5271 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5272 {
5273    /* LS pass output to TCS by temp if they have same in/out patch size. */
5274    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5275                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5276
5277    bool tcs_need_output = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL &&
5278                           ctx->program->info.has_epilog &&
5279                           ctx->program->info.tcs.pass_tessfactors_by_reg;
5280
5281    bool ps_need_output = ctx->stage == fragment_fs;
5282
5283    if (ls_need_output || tcs_need_output || ps_need_output) {
5284       bool stored_to_temps = store_output_to_temps(ctx, instr);
5285       if (!stored_to_temps) {
5286          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5287          abort();
5288       }
5289    } else {
5290       unreachable("Shader stage not implemented");
5291    }
5292 }
5293
5294 bool
5295 in_exec_divergent_or_in_loop(isel_context* ctx)
5296 {
5297    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5298           ctx->cf_info.had_divergent_discard;
5299 }
5300
5301 void
5302 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5303                         Temp prim_mask)
5304 {
5305    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5306    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5307
5308    Builder bld(ctx->program, ctx->block);
5309
5310    if (in_exec_divergent_or_in_loop(ctx)) {
5311       Operand prim_mask_op = bld.m0(prim_mask);
5312       prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5313       Operand coord2_op(coord2);
5314       coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
5315       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5316                  Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op);
5317       return;
5318    }
5319
5320    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5321
5322    Temp res;
5323    if (dst.regClass() == v2b) {
5324       Temp p10 =
5325          bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
5326       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
5327       emit_extract_vector(ctx, res, 0, dst);
5328    } else {
5329       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5330       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5331    }
5332    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5333    set_wqm(ctx, true);
5334 }
5335
5336 void
5337 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5338                   Temp prim_mask)
5339 {
5340    if (ctx->options->gfx_level >= GFX11) {
5341       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask);
5342       return;
5343    }
5344
5345    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5346    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5347
5348    Builder bld(ctx->program, ctx->block);
5349
5350    if (dst.regClass() == v2b) {
5351       if (ctx->program->dev.has_16bank_lds) {
5352          assert(ctx->options->gfx_level <= GFX8);
5353          Builder::Result interp_p1 =
5354             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5355                        bld.m0(prim_mask), idx, component);
5356          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
5357                                 bld.m0(prim_mask), interp_p1, idx, component);
5358          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5359                     interp_p1, idx, component);
5360       } else {
5361          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5362
5363          if (ctx->options->gfx_level == GFX8)
5364             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5365
5366          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5367                                                 bld.m0(prim_mask), idx, component);
5368          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5369                     component);
5370       }
5371    } else {
5372       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5373                                              bld.m0(prim_mask), idx, component);
5374
5375       if (ctx->program->dev.has_16bank_lds)
5376          interp_p1->operands[0].setLateKill(true);
5377
5378       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5379                  idx, component);
5380    }
5381 }
5382
5383 void
5384 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5385                       Temp dst, Temp prim_mask)
5386 {
5387    Builder bld(ctx->program, ctx->block);
5388    if (ctx->options->gfx_level >= GFX11) {
5389       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5390       if (in_exec_divergent_or_in_loop(ctx)) {
5391          Operand prim_mask_op = bld.m0(prim_mask);
5392          prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5393          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5394                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5395                     prim_mask_op);
5396       } else {
5397          Temp p =
5398             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5399          if (dst.regClass() == v2b) {
5400             Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
5401             emit_extract_vector(ctx, res, 0, dst);
5402          } else {
5403             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
5404          }
5405          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5406          set_wqm(ctx, true);
5407       }
5408    } else {
5409       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
5410                  bld.m0(prim_mask), idx, component);
5411    }
5412 }
5413
5414 void
5415 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5416 {
5417    Builder bld(ctx->program, ctx->block);
5418
5419    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5420       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5421    for (unsigned i = 0; i < num_components; i++) {
5422       if (ctx->args->frag_pos[i].used)
5423          vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5424       else
5425          vec->operands[i] = Operand(v1);
5426    }
5427    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5428       assert(num_components == 4);
5429       vec->operands[3] =
5430          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5431    }
5432
5433    for (Operand& op : vec->operands)
5434       op = op.isUndefined() ? Operand::zero() : op;
5435
5436    vec->definitions[0] = Definition(dst);
5437    ctx->block->instructions.emplace_back(std::move(vec));
5438    emit_split_vector(ctx, dst, num_components);
5439    return;
5440 }
5441
5442 void
5443 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5444 {
5445    Builder bld(ctx->program, ctx->block);
5446    Temp cond;
5447
5448    /* VRS Rate X = Ancillary[2:3]
5449     * VRS Rate Y = Ancillary[4:5]
5450     */
5451    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5452                           Operand::c32(2u), Operand::c32(2u));
5453    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5454                           Operand::c32(4u), Operand::c32(2u));
5455
5456    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5457    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5458    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5459                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5460
5461    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5462    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5463    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5464                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5465
5466    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5467 }
5468
5469 void
5470 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5471 {
5472    Temp dst = get_ssa_temp(ctx, &instr->def);
5473    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5474    unsigned idx = nir_intrinsic_base(instr);
5475    unsigned component = nir_intrinsic_component(instr);
5476    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5477
5478    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5479
5480    if (instr->def.num_components == 1) {
5481       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5482    } else {
5483       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5484          aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
5485       for (unsigned i = 0; i < instr->def.num_components; i++) {
5486          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5487          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5488          vec->operands[i] = Operand(tmp);
5489       }
5490       vec->definitions[0] = Definition(dst);
5491       ctx->block->instructions.emplace_back(std::move(vec));
5492    }
5493 }
5494
5495 Temp
5496 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5497                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5498 {
5499    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5500    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5501
5502    if (info.soffset.id()) {
5503       if (soffset.isTemp())
5504          vaddr = bld.copy(bld.def(v1), soffset);
5505       soffset = Operand(info.soffset);
5506    }
5507
5508    if (soffset.isUndefined())
5509       soffset = Operand::zero();
5510
5511    const bool offen = !vaddr.isUndefined();
5512    const bool idxen = info.idx.id();
5513
5514    if (offen && idxen)
5515       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5516    else if (idxen)
5517       vaddr = Operand(info.idx);
5518
5519    /* Determine number of fetched components.
5520     * Note, ACO IR works with GFX6-8 nfmt + dfmt fields, these are later converted for GFX10+.
5521     */
5522    const struct ac_vtx_format_info* vtx_info =
5523       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5524    /* The number of channels in the format determines the memory range. */
5525    const unsigned max_components = vtx_info->num_channels;
5526    /* Calculate maximum number of components loaded according to alignment. */
5527    unsigned max_fetched_components = bytes_needed / info.component_size;
5528    max_fetched_components =
5529       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5530                              alignment, max_fetched_components);
5531    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5532    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5533     * If a larger format is selected, it's still OK to load a smaller amount from it.
5534     */
5535    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5536    unsigned bytes_size = 0;
5537    const unsigned bit_size = info.component_size * 8;
5538    aco_opcode op = aco_opcode::num_opcodes;
5539
5540    if (bytes_needed == 2) {
5541       bytes_size = 2;
5542       op = aco_opcode::tbuffer_load_format_d16_x;
5543    } else if (bytes_needed <= 4) {
5544       bytes_size = 4;
5545       if (bit_size == 16)
5546          op = aco_opcode::tbuffer_load_format_d16_xy;
5547       else
5548          op = aco_opcode::tbuffer_load_format_x;
5549    } else if (bytes_needed <= 6) {
5550       bytes_size = 6;
5551       if (bit_size == 16)
5552          op = aco_opcode::tbuffer_load_format_d16_xyz;
5553       else
5554          op = aco_opcode::tbuffer_load_format_xy;
5555    } else if (bytes_needed <= 8) {
5556       bytes_size = 8;
5557       if (bit_size == 16)
5558          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5559       else
5560          op = aco_opcode::tbuffer_load_format_xy;
5561    } else if (bytes_needed <= 12) {
5562       bytes_size = 12;
5563       op = aco_opcode::tbuffer_load_format_xyz;
5564    } else {
5565       bytes_size = 16;
5566       op = aco_opcode::tbuffer_load_format_xyzw;
5567    }
5568
5569    /* Abort when suitable opcode wasn't found so we don't compile buggy shaders. */
5570    if (op == aco_opcode::num_opcodes) {
5571       aco_err(bld.program, "unsupported bit size for typed buffer load");
5572       abort();
5573    }
5574
5575    aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
5576    mtbuf->operands[0] = Operand(info.resource);
5577    mtbuf->operands[1] = vaddr;
5578    mtbuf->operands[2] = soffset;
5579    mtbuf->offen = offen;
5580    mtbuf->idxen = idxen;
5581    mtbuf->glc = info.glc;
5582    mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
5583    mtbuf->slc = info.slc;
5584    mtbuf->sync = info.sync;
5585    mtbuf->offset = const_offset;
5586    mtbuf->dfmt = fetch_fmt & 0xf;
5587    mtbuf->nfmt = fetch_fmt >> 4;
5588    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5589    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5590    mtbuf->definitions[0] = Definition(val);
5591    bld.insert(std::move(mtbuf));
5592
5593    return val;
5594 }
5595
5596 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5597
5598 void
5599 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5600 {
5601    Builder bld(ctx->program, ctx->block);
5602    Temp dst = get_ssa_temp(ctx, &instr->def);
5603    nir_src offset = *nir_get_io_offset_src(instr);
5604
5605    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5606       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5607
5608    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5609
5610    unsigned idx = nir_intrinsic_base(instr);
5611    unsigned component = nir_intrinsic_component(instr);
5612    unsigned vertex_id = 0; /* P0 */
5613
5614    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5615       vertex_id = nir_src_as_uint(instr->src[0]);
5616
5617    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5618       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask);
5619    } else {
5620       unsigned num_components = instr->def.num_components;
5621       if (instr->def.bit_size == 64)
5622          num_components *= 2;
5623       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5624          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5625       for (unsigned i = 0; i < num_components; i++) {
5626          unsigned chan_component = (component + i) % 4;
5627          unsigned chan_idx = idx + (component + i) / 4;
5628          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5629          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5630                                prim_mask);
5631       }
5632       vec->definitions[0] = Definition(dst);
5633       bld.insert(std::move(vec));
5634    }
5635 }
5636
5637 void
5638 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5639 {
5640    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5641
5642    Builder bld(ctx->program, ctx->block);
5643    Temp dst = get_ssa_temp(ctx, &instr->def);
5644
5645    if (load_input_from_temps(ctx, instr, dst))
5646       return;
5647
5648    unreachable("LDS-based TCS input should have been lowered in NIR.");
5649 }
5650
5651 void
5652 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5653 {
5654    switch (ctx->shader->info.stage) {
5655    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5656    default: unreachable("Unimplemented shader stage");
5657    }
5658 }
5659
5660 void
5661 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5662 {
5663    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5664
5665    Builder bld(ctx->program, ctx->block);
5666    Temp dst = get_ssa_temp(ctx, &instr->def);
5667
5668    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5669    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5670    Operand tes_w = Operand::zero();
5671
5672    if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5673       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5674       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5675       tes_w = Operand(tmp);
5676    }
5677
5678    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5679    emit_split_vector(ctx, tess_coord, 3);
5680 }
5681
5682 void
5683 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5684             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5685             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5686 {
5687    Builder bld(ctx->program, ctx->block);
5688
5689    bool use_smem =
5690       dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5691    if (use_smem)
5692       offset = bld.as_uniform(offset);
5693    else {
5694       /* GFX6-7 are affected by a hw bug that prevents address clamping to
5695        * work correctly when the SGPR offset is used.
5696        */
5697       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5698          offset = as_vgpr(ctx, offset);
5699    }
5700
5701    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5702    info.glc = glc;
5703    info.sync = sync;
5704    info.align_mul = align_mul;
5705    info.align_offset = align_offset;
5706    if (use_smem)
5707       emit_load(ctx, bld, info, smem_load_params);
5708    else
5709       emit_load(ctx, bld, info, mubuf_load_params);
5710 }
5711
5712 void
5713 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5714 {
5715    Temp dst = get_ssa_temp(ctx, &instr->def);
5716    Builder bld(ctx->program, ctx->block);
5717    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5718
5719    unsigned size = instr->def.bit_size / 8;
5720    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5721                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5722 }
5723
5724 void
5725 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5726 {
5727    Builder bld(ctx->program, ctx->block);
5728    Temp dst = get_ssa_temp(ctx, &instr->def);
5729    unsigned offset = nir_intrinsic_base(instr);
5730    unsigned count = instr->def.num_components;
5731    nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5732
5733    if (instr->def.bit_size == 64)
5734       count *= 2;
5735
5736    if (index_cv && instr->def.bit_size >= 32) {
5737       unsigned start = (offset + index_cv->u32) / 4u;
5738       uint64_t mask = BITFIELD64_MASK(count) << start;
5739       if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
5740           start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
5741          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5742          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5743             aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5744          unsigned arg_index =
5745             util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
5746          for (unsigned i = 0; i < count; ++i) {
5747             elems[i] = get_arg(ctx, ctx->args->inline_push_consts[arg_index++]);
5748             vec->operands[i] = Operand{elems[i]};
5749          }
5750          vec->definitions[0] = Definition(dst);
5751          ctx->block->instructions.emplace_back(std::move(vec));
5752          ctx->allocated_vec.emplace(dst.id(), elems);
5753          return;
5754       }
5755    }
5756
5757    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5758    if (offset != 0) // TODO check if index != 0 as well
5759       index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5760                              Operand::c32(offset), index);
5761    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->push_constants));
5762    Temp vec = dst;
5763    bool trim = false;
5764    bool aligned = true;
5765
5766    if (instr->def.bit_size == 8) {
5767       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5768       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5769       if (!aligned)
5770          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5771    } else if (instr->def.bit_size == 16) {
5772       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5773       if (!aligned)
5774          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5775    }
5776
5777    aco_opcode op;
5778
5779    switch (vec.size()) {
5780    case 1: op = aco_opcode::s_load_dword; break;
5781    case 2: op = aco_opcode::s_load_dwordx2; break;
5782    case 3:
5783       vec = bld.tmp(s4);
5784       trim = true;
5785       FALLTHROUGH;
5786    case 4: op = aco_opcode::s_load_dwordx4; break;
5787    case 6:
5788       vec = bld.tmp(s8);
5789       trim = true;
5790       FALLTHROUGH;
5791    case 8: op = aco_opcode::s_load_dwordx8; break;
5792    default: unreachable("unimplemented or forbidden load_push_constant.");
5793    }
5794
5795    bld.smem(op, Definition(vec), ptr, index);
5796
5797    if (!aligned) {
5798       Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5799       byte_align_scalar(ctx, vec, byte_offset, dst);
5800       return;
5801    }
5802
5803    if (trim) {
5804       emit_split_vector(ctx, vec, 4);
5805       RegClass rc = dst.size() == 3 ? s1 : s2;
5806       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5807                  emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5808    }
5809    emit_split_vector(ctx, dst, instr->def.num_components);
5810 }
5811
5812 void
5813 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5814 {
5815    Temp dst = get_ssa_temp(ctx, &instr->def);
5816
5817    Builder bld(ctx->program, ctx->block);
5818
5819    uint32_t desc_type =
5820       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5821       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5822    if (ctx->options->gfx_level >= GFX10) {
5823       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5824                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5825                    S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5826    } else {
5827       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5828                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5829    }
5830
5831    unsigned base = nir_intrinsic_base(instr);
5832    unsigned range = nir_intrinsic_range(instr);
5833
5834    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5835    if (base && offset.type() == RegType::sgpr)
5836       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5837                               Operand::c32(base));
5838    else if (base && offset.type() == RegType::vgpr)
5839       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5840
5841    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5842                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5843                                      Operand::c32(ctx->constant_data_offset)),
5844                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5845                           Operand::c32(desc_type));
5846    unsigned size = instr->def.bit_size / 8;
5847    // TODO: get alignment information for subdword constants
5848    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5849 }
5850
5851 /* Packs multiple Temps of different sizes in to a vector of v1 Temps.
5852  * The byte count of each input Temp must be a multiple of 2.
5853  */
5854 static std::vector<Temp>
5855 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5856 {
5857    Builder bld(ctx->program, ctx->block);
5858    std::vector<Temp> packed;
5859    Temp low = Temp();
5860    for (Temp tmp : unpacked) {
5861       assert(tmp.bytes() % 2 == 0);
5862       unsigned byte_idx = 0;
5863       while (byte_idx < tmp.bytes()) {
5864          if (low != Temp()) {
5865             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5866             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
5867             low = Temp();
5868             packed.push_back(dword);
5869             byte_idx += 2;
5870          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
5871             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5872             byte_idx += 4;
5873          } else {
5874             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5875             byte_idx += 2;
5876          }
5877       }
5878    }
5879    if (low != Temp()) {
5880       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
5881       packed.push_back(dword);
5882    }
5883    return packed;
5884 }
5885
5886 static bool
5887 should_declare_array(ac_image_dim dim)
5888 {
5889    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5890           dim == ac_image_2darraymsaa;
5891 }
5892
5893 static int
5894 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5895 {
5896    switch (dim) {
5897    case GLSL_SAMPLER_DIM_BUF: return 1;
5898    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5899    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5900    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
5901    case GLSL_SAMPLER_DIM_3D:
5902    case GLSL_SAMPLER_DIM_CUBE: return 3;
5903    case GLSL_SAMPLER_DIM_RECT:
5904    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5905    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
5906    default: break;
5907    }
5908    return 0;
5909 }
5910
5911 static MIMG_instruction*
5912 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
5913           Operand vdata = Operand(v1))
5914 {
5915    size_t nsa_size = bld.program->dev.max_nsa_vgprs;
5916    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
5917
5918    const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
5919    if (strict_wqm)
5920       nsa_size = coords.size();
5921
5922    for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
5923       if (!coords[i].id())
5924          continue;
5925
5926       coords[i] = as_vgpr(bld, coords[i]);
5927    }
5928
5929    if (nsa_size < coords.size()) {
5930       Temp coord = coords[nsa_size];
5931       if (coords.size() - nsa_size > 1) {
5932          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5933             aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};
5934
5935          unsigned coord_size = 0;
5936          for (unsigned i = nsa_size; i < coords.size(); i++) {
5937             vec->operands[i - nsa_size] = Operand(coords[i]);
5938             coord_size += coords[i].size();
5939          }
5940
5941          coord = bld.tmp(RegType::vgpr, coord_size);
5942          vec->definitions[0] = Definition(coord);
5943          bld.insert(std::move(vec));
5944       } else {
5945          coord = as_vgpr(bld, coord);
5946       }
5947
5948       coords[nsa_size] = coord;
5949       coords.resize(nsa_size + 1);
5950    }
5951
5952    bool has_dst = dst.id() != 0;
5953
5954    aco_ptr<MIMG_instruction> mimg{
5955       create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
5956    if (has_dst)
5957       mimg->definitions[0] = Definition(dst);
5958    mimg->operands[0] = Operand(rsrc);
5959    mimg->operands[1] = samp;
5960    mimg->operands[2] = vdata;
5961    for (unsigned i = 0; i < coords.size(); i++)
5962       mimg->operands[3 + i] = Operand(coords[i]);
5963    mimg->strict_wqm = strict_wqm;
5964
5965    MIMG_instruction* res = mimg.get();
5966    bld.insert(std::move(mimg));
5967    return res;
5968 }
5969
5970 void
5971 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5972 {
5973    Builder bld(ctx->program, ctx->block);
5974    Temp dst = get_ssa_temp(ctx, &instr->def);
5975    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5976    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5977    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5978    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5979    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5980    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5981
5982    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
5983     * There are five smaller vector groups:
5984     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
5985     * These directly match the NIR intrinsic sources.
5986     */
5987    std::vector<Temp> args = {
5988       node, tmax, origin, dir, inv_dir,
5989    };
5990
5991    if (bld.program->gfx_level == GFX10_3) {
5992       std::vector<Temp> scalar_args;
5993       for (Temp tmp : args) {
5994          for (unsigned i = 0; i < tmp.size(); i++)
5995             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
5996       }
5997       args = std::move(scalar_args);
5998    }
5999
6000    MIMG_instruction* mimg =
6001       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6002    mimg->dim = ac_image_1d;
6003    mimg->dmask = 0xf;
6004    mimg->unrm = true;
6005    mimg->r128 = true;
6006
6007    emit_split_vector(ctx, dst, instr->def.num_components);
6008 }
6009
6010 static std::vector<Temp>
6011 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6012 {
6013
6014    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6015    bool a16 = instr->src[1].ssa->bit_size == 16;
6016    RegClass rc = a16 ? v2b : v1;
6017    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6018    bool is_array = nir_intrinsic_image_array(instr);
6019    ASSERTED bool add_frag_pos =
6020       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6021    assert(!add_frag_pos && "Input attachments should be lowered.");
6022    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6023    bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6024    int count = image_type_to_components_count(dim, is_array);
6025    std::vector<Temp> coords;
6026    Builder bld(ctx->program, ctx->block);
6027
6028    if (gfx9_1d) {
6029       coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
6030       coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
6031       if (is_array)
6032          coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
6033    } else {
6034       for (int i = 0; i < count; i++)
6035          coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
6036    }
6037
6038    bool has_lod = false;
6039    Temp lod;
6040
6041    if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
6042        instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
6043        instr->intrinsic == nir_intrinsic_bindless_image_store) {
6044       int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
6045       assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
6046       has_lod =
6047          !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
6048
6049       if (has_lod)
6050          lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
6051    }
6052
6053    if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
6054       /* The hw can't bind a slice of a 3D image as a 2D image, because it
6055        * ignores BASE_ARRAY if the target is 3D. The workaround is to read
6056        * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
6057        */
6058       assert(ctx->options->gfx_level == GFX9);
6059       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6060       Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6061       /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
6062       Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
6063                                   Operand::c32(13u));
6064
6065       if (has_lod) {
6066          /* If there's a lod parameter it matter if the image is 3d or 2d because
6067           * the hw reads either the fourth or third component as lod. So detect
6068           * 3d images and place the lod at the third component otherwise.
6069           * For non 3D descriptors we effectively add lod twice to coords,
6070           * but the hw will only read the first one, the second is ignored.
6071           */
6072          Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6073          Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6074                               Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6075          Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6076                                    Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6077          first_layer =
6078             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6079       }
6080
6081       if (a16)
6082          coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6083       else
6084          coords.emplace_back(first_layer);
6085    }
6086
6087    if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6088       assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6089       coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6090    }
6091
6092    if (has_lod)
6093       coords.emplace_back(lod);
6094
6095    return emit_pack_v1(ctx, coords);
6096 }
6097
6098 memory_sync_info
6099 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6100 {
6101    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6102    if (semantics & semantic_atomicrmw)
6103       return memory_sync_info(storage, semantics);
6104
6105    unsigned access = nir_intrinsic_access(instr);
6106
6107    if (access & ACCESS_VOLATILE)
6108       semantics |= semantic_volatile;
6109    if (access & ACCESS_CAN_REORDER)
6110       semantics |= semantic_can_reorder | semantic_private;
6111
6112    return memory_sync_info(storage, semantics);
6113 }
6114
6115 Operand
6116 emit_tfe_init(Builder& bld, Temp dst)
6117 {
6118    Temp tmp = bld.tmp(dst.regClass());
6119
6120    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6121       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6122    for (unsigned i = 0; i < dst.size(); i++)
6123       vec->operands[i] = Operand::zero();
6124    vec->definitions[0] = Definition(tmp);
6125    /* Since this is fixed to an instruction's definition register, any CSE will
6126     * just create copies. Copying costs about the same as zero-initialization,
6127     * but these copies can break up clauses.
6128     */
6129    vec->definitions[0].setNoCSE(true);
6130    bld.insert(std::move(vec));
6131
6132    return Operand(tmp);
6133 }
6134
6135 void
6136 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6137 {
6138    Builder bld(ctx->program, ctx->block);
6139    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6140    bool is_array = nir_intrinsic_image_array(instr);
6141    bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6142    Temp dst = get_ssa_temp(ctx, &instr->def);
6143
6144    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6145    unsigned access = nir_intrinsic_access(instr);
6146
6147    unsigned result_size = instr->def.num_components - is_sparse;
6148    unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6149    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6150    if (dim == GLSL_SAMPLER_DIM_BUF)
6151       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6152    unsigned dmask = expand_mask;
6153    if (instr->def.bit_size == 64) {
6154       expand_mask &= 0x9;
6155       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6156       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6157    }
6158    if (is_sparse)
6159       expand_mask |= 1 << result_size;
6160
6161    bool d16 = instr->def.bit_size == 16;
6162    assert(!d16 || !is_sparse);
6163
6164    unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6165
6166    Temp tmp;
6167    if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6168       tmp = dst;
6169    else
6170       tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6171
6172    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6173
6174    if (dim == GLSL_SAMPLER_DIM_BUF) {
6175       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6176
6177       aco_opcode opcode;
6178       if (!d16) {
6179          switch (util_bitcount(dmask)) {
6180          case 1: opcode = aco_opcode::buffer_load_format_x; break;
6181          case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6182          case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6183          case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6184          default: unreachable(">4 channel buffer image load");
6185          }
6186       } else {
6187          switch (util_bitcount(dmask)) {
6188          case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6189          case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6190          case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6191          case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6192          default: unreachable(">4 channel buffer image load");
6193          }
6194       }
6195       aco_ptr<MUBUF_instruction> load{
6196          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6197       load->operands[0] = Operand(resource);
6198       load->operands[1] = Operand(vindex);
6199       load->operands[2] = Operand::c32(0);
6200       load->definitions[0] = Definition(tmp);
6201       load->idxen = true;
6202       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6203       load->dlc =
6204          load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6205       load->sync = sync;
6206       load->tfe = is_sparse;
6207       if (load->tfe)
6208          load->operands[3] = emit_tfe_init(bld, tmp);
6209       ctx->block->instructions.emplace_back(std::move(load));
6210    } else {
6211       std::vector<Temp> coords = get_image_coords(ctx, instr);
6212
6213       aco_opcode opcode;
6214       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6215          opcode = aco_opcode::image_load;
6216       } else {
6217          bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6218          opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6219       }
6220
6221       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6222       MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6223       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6224       load->dlc =
6225          load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6226       load->a16 = instr->src[1].ssa->bit_size == 16;
6227       load->d16 = d16;
6228       load->dmask = dmask;
6229       load->unrm = true;
6230       load->tfe = is_sparse;
6231
6232       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6233          load->dim = is_array ? ac_image_2darray : ac_image_2d;
6234          load->da = is_array;
6235          load->sync = memory_sync_info();
6236       } else {
6237          ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6238          load->dim = sdim;
6239          load->da = should_declare_array(sdim);
6240          load->sync = sync;
6241       }
6242    }
6243
6244    if (is_sparse && instr->def.bit_size == 64) {
6245       /* The result components are 64-bit but the sparse residency code is
6246        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6247        */
6248       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6249                        Operand::zero());
6250    }
6251
6252    expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6253 }
6254
6255 void
6256 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6257 {
6258    Builder bld(ctx->program, ctx->block);
6259    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6260    bool is_array = nir_intrinsic_image_array(instr);
6261    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6262    bool d16 = instr->src[3].ssa->bit_size == 16;
6263
6264    /* only R64_UINT and R64_SINT supported */
6265    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6266       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6267    data = as_vgpr(ctx, data);
6268
6269    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6270
6271    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6272    unsigned access = nir_intrinsic_access(instr);
6273    bool glc = ctx->options->gfx_level == GFX6 ||
6274               ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
6275
6276    if (dim == GLSL_SAMPLER_DIM_BUF) {
6277       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6278       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6279       aco_opcode opcode;
6280       if (!d16) {
6281          switch (num_components) {
6282          case 1: opcode = aco_opcode::buffer_store_format_x; break;
6283          case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6284          case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6285          case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6286          default: unreachable(">4 channel buffer image store");
6287          }
6288       } else {
6289          switch (num_components) {
6290          case 1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6291          case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6292          case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6293          case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6294          default: unreachable(">4 channel buffer image store");
6295          }
6296       }
6297       aco_ptr<MUBUF_instruction> store{
6298          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6299       store->operands[0] = Operand(rsrc);
6300       store->operands[1] = Operand(vindex);
6301       store->operands[2] = Operand::c32(0);
6302       store->operands[3] = Operand(data);
6303       store->idxen = true;
6304       store->glc = glc;
6305       store->dlc = false;
6306       store->disable_wqm = true;
6307       store->sync = sync;
6308       ctx->program->needs_exact = true;
6309       ctx->block->instructions.emplace_back(std::move(store));
6310       return;
6311    }
6312
6313    assert(data.type() == RegType::vgpr);
6314    std::vector<Temp> coords = get_image_coords(ctx, instr);
6315    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6316
6317    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6318    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6319
6320    uint32_t dmask = BITFIELD_MASK(num_components);
6321    /* remove zero/undef elements from data, components which aren't in dmask
6322     * are zeroed anyway
6323     */
6324    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6325       for (uint32_t i = 0; i < instr->num_components; i++) {
6326          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6327          if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
6328              nir_scalar_is_undef(comp))
6329             dmask &= ~BITFIELD_BIT(i);
6330       }
6331
6332       /* dmask cannot be 0, at least one vgpr is always read */
6333       if (dmask == 0)
6334          dmask = 1;
6335
6336       if (dmask != BITFIELD_MASK(num_components)) {
6337          uint32_t dmask_count = util_bitcount(dmask);
6338          RegClass rc = d16 ? v2b : v1;
6339          if (dmask_count == 1) {
6340             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6341          } else {
6342             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6343                aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6344             uint32_t index = 0;
6345             u_foreach_bit (bit, dmask) {
6346                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6347             }
6348             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6349             vec->definitions[0] = Definition(data);
6350             bld.insert(std::move(vec));
6351          }
6352       }
6353    }
6354
6355    MIMG_instruction* store =
6356       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6357    store->glc = glc;
6358    store->dlc = false;
6359    store->a16 = instr->src[1].ssa->bit_size == 16;
6360    store->d16 = d16;
6361    store->dmask = dmask;
6362    store->unrm = true;
6363    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6364    store->dim = sdim;
6365    store->da = should_declare_array(sdim);
6366    store->disable_wqm = true;
6367    store->sync = sync;
6368    ctx->program->needs_exact = true;
6369    return;
6370 }
6371
6372 void
6373 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6374                                  aco_opcode* image_op)
6375 {
6376    switch (op) {
6377    case nir_atomic_op_iadd:
6378       *buf_op = aco_opcode::buffer_atomic_add;
6379       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6380       *image_op = aco_opcode::image_atomic_add;
6381       break;
6382    case nir_atomic_op_umin:
6383       *buf_op = aco_opcode::buffer_atomic_umin;
6384       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6385       *image_op = aco_opcode::image_atomic_umin;
6386       break;
6387    case nir_atomic_op_imin:
6388       *buf_op = aco_opcode::buffer_atomic_smin;
6389       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6390       *image_op = aco_opcode::image_atomic_smin;
6391       break;
6392    case nir_atomic_op_umax:
6393       *buf_op = aco_opcode::buffer_atomic_umax;
6394       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6395       *image_op = aco_opcode::image_atomic_umax;
6396       break;
6397    case nir_atomic_op_imax:
6398       *buf_op = aco_opcode::buffer_atomic_smax;
6399       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6400       *image_op = aco_opcode::image_atomic_smax;
6401       break;
6402    case nir_atomic_op_iand:
6403       *buf_op = aco_opcode::buffer_atomic_and;
6404       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6405       *image_op = aco_opcode::image_atomic_and;
6406       break;
6407    case nir_atomic_op_ior:
6408       *buf_op = aco_opcode::buffer_atomic_or;
6409       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6410       *image_op = aco_opcode::image_atomic_or;
6411       break;
6412    case nir_atomic_op_ixor:
6413       *buf_op = aco_opcode::buffer_atomic_xor;
6414       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6415       *image_op = aco_opcode::image_atomic_xor;
6416       break;
6417    case nir_atomic_op_xchg:
6418       *buf_op = aco_opcode::buffer_atomic_swap;
6419       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6420       *image_op = aco_opcode::image_atomic_swap;
6421       break;
6422    case nir_atomic_op_cmpxchg:
6423       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6424       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6425       *image_op = aco_opcode::image_atomic_cmpswap;
6426       break;
6427    case nir_atomic_op_inc_wrap:
6428       *buf_op = aco_opcode::buffer_atomic_inc;
6429       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6430       *image_op = aco_opcode::image_atomic_inc;
6431       break;
6432    case nir_atomic_op_dec_wrap:
6433       *buf_op = aco_opcode::buffer_atomic_dec;
6434       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6435       *image_op = aco_opcode::image_atomic_dec;
6436       break;
6437    case nir_atomic_op_fadd:
6438       *buf_op = aco_opcode::buffer_atomic_add_f32;
6439       *buf_op64 = aco_opcode::num_opcodes;
6440       *image_op = aco_opcode::num_opcodes;
6441       break;
6442    case nir_atomic_op_fmin:
6443       *buf_op = aco_opcode::buffer_atomic_fmin;
6444       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6445       *image_op = aco_opcode::image_atomic_fmin;
6446       break;
6447    case nir_atomic_op_fmax:
6448       *buf_op = aco_opcode::buffer_atomic_fmax;
6449       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6450       *image_op = aco_opcode::image_atomic_fmax;
6451       break;
6452    default: unreachable("unsupported atomic operation");
6453    }
6454 }
6455
6456 void
6457 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6458 {
6459    bool return_previous = !nir_def_is_unused(&instr->def);
6460    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6461    bool is_array = nir_intrinsic_image_array(instr);
6462    Builder bld(ctx->program, ctx->block);
6463
6464    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6465    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6466
6467    aco_opcode buf_op, buf_op64, image_op;
6468    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6469
6470    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6471    bool is_64bit = data.bytes() == 8;
6472    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6473
6474    if (cmpswap)
6475       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6476                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6477
6478    Temp dst = get_ssa_temp(ctx, &instr->def);
6479    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6480
6481    if (dim == GLSL_SAMPLER_DIM_BUF) {
6482       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6483       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6484       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6485       // implemented.");
6486       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6487          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6488       mubuf->operands[0] = Operand(resource);
6489       mubuf->operands[1] = Operand(vindex);
6490       mubuf->operands[2] = Operand::c32(0);
6491       mubuf->operands[3] = Operand(data);
6492       Definition def =
6493          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6494       if (return_previous)
6495          mubuf->definitions[0] = def;
6496       mubuf->offset = 0;
6497       mubuf->idxen = true;
6498       mubuf->glc = return_previous;
6499       mubuf->dlc = false; /* Not needed for atomics */
6500       mubuf->disable_wqm = true;
6501       mubuf->sync = sync;
6502       ctx->program->needs_exact = true;
6503       ctx->block->instructions.emplace_back(std::move(mubuf));
6504       if (return_previous && cmpswap)
6505          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6506       return;
6507    }
6508
6509    std::vector<Temp> coords = get_image_coords(ctx, instr);
6510    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6511    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6512    MIMG_instruction* mimg =
6513       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6514    mimg->glc = return_previous;
6515    mimg->dlc = false; /* Not needed for atomics */
6516    mimg->dmask = (1 << data.size()) - 1;
6517    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6518    mimg->unrm = true;
6519    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6520    mimg->dim = sdim;
6521    mimg->da = should_declare_array(sdim);
6522    mimg->disable_wqm = true;
6523    mimg->sync = sync;
6524    ctx->program->needs_exact = true;
6525    if (return_previous && cmpswap)
6526       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6527    return;
6528 }
6529
6530 void
6531 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6532 {
6533    Builder bld(ctx->program, ctx->block);
6534    unsigned num_components = instr->num_components;
6535
6536    Temp dst = get_ssa_temp(ctx, &instr->def);
6537    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6538
6539    unsigned access = nir_intrinsic_access(instr);
6540    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6541    unsigned size = instr->def.bit_size / 8;
6542
6543    bool allow_smem = access & ACCESS_CAN_REORDER;
6544
6545    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6546                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6547                get_memory_sync_info(instr, storage_buffer, 0));
6548 }
6549
6550 void
6551 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6552 {
6553    Builder bld(ctx->program, ctx->block);
6554    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6555    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6556    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6557    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6558
6559    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6560
6561    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6562    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6563               ctx->program->gfx_level < GFX11;
6564
6565    unsigned write_count = 0;
6566    Temp write_datas[32];
6567    unsigned offsets[32];
6568    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6569                       write_datas, offsets);
6570
6571    /* GFX6-7 are affected by a hw bug that prevents address clamping to work
6572     * correctly when the SGPR offset is used.
6573     */
6574    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6575       offset = as_vgpr(ctx, offset);
6576
6577    for (unsigned i = 0; i < write_count; i++) {
6578       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6579
6580       aco_ptr<MUBUF_instruction> store{
6581          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6582       store->operands[0] = Operand(rsrc);
6583       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6584       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6585       store->operands[3] = Operand(write_datas[i]);
6586       store->offset = offsets[i];
6587       store->offen = (offset.type() == RegType::vgpr);
6588       store->glc = glc;
6589       store->dlc = false;
6590       store->disable_wqm = true;
6591       store->sync = sync;
6592       ctx->program->needs_exact = true;
6593       ctx->block->instructions.emplace_back(std::move(store));
6594    }
6595 }
6596
6597 void
6598 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6599 {
6600    Builder bld(ctx->program, ctx->block);
6601    bool return_previous = !nir_def_is_unused(&instr->def);
6602    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6603
6604    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6605    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6606
6607    aco_opcode op32, op64, image_op;
6608    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6609
6610    if (cmpswap)
6611       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6612                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6613
6614    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6615    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6616    Temp dst = get_ssa_temp(ctx, &instr->def);
6617
6618    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6619    aco_ptr<MUBUF_instruction> mubuf{
6620       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6621    mubuf->operands[0] = Operand(rsrc);
6622    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6623    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6624    mubuf->operands[3] = Operand(data);
6625    Definition def =
6626       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6627    if (return_previous)
6628       mubuf->definitions[0] = def;
6629    mubuf->offset = 0;
6630    mubuf->offen = (offset.type() == RegType::vgpr);
6631    mubuf->glc = return_previous;
6632    mubuf->dlc = false; /* Not needed for atomics */
6633    mubuf->disable_wqm = true;
6634    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6635    ctx->program->needs_exact = true;
6636    ctx->block->instructions.emplace_back(std::move(mubuf));
6637    if (return_previous && cmpswap)
6638       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6639 }
6640
6641 void
6642 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6643              Temp* offset)
6644 {
6645    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6646    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6647
6648    *const_offset = nir_intrinsic_base(intrin);
6649
6650    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6651    nir_src offset_src = intrin->src[num_src - 1];
6652    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6653       *offset = get_ssa_temp(ctx, offset_src.ssa);
6654    else
6655       *offset = Temp();
6656 }
6657
6658 void
6659 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6660 {
6661    Builder bld(ctx->program, ctx->block);
6662    unsigned num_components = instr->num_components;
6663    unsigned component_size = instr->def.bit_size / 8;
6664
6665    Temp addr, offset;
6666    uint32_t const_offset;
6667    parse_global(ctx, instr, &addr, &const_offset, &offset);
6668
6669    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6670                         component_size};
6671    if (offset.id()) {
6672       info.resource = addr;
6673       info.offset = Operand(offset);
6674    }
6675    info.const_offset = const_offset;
6676    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6677    info.align_mul = nir_intrinsic_align_mul(instr);
6678    info.align_offset = nir_intrinsic_align_offset(instr);
6679    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6680
6681    /* Don't expand global loads when they use MUBUF or SMEM.
6682     * Global loads don't have the bounds checking that buffer loads have that
6683     * makes this safe.
6684     */
6685    unsigned align = nir_intrinsic_align(instr);
6686    bool byte_align_for_smem_mubuf =
6687       can_use_byte_align_for_global_load(num_components, component_size, align, false);
6688
6689    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6690     * it's safe to use SMEM */
6691    bool can_use_smem =
6692       (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6693    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6694        !can_use_smem) {
6695       EmitLoadParameters params = global_load_params;
6696       params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6697       emit_load(ctx, bld, info, params);
6698    } else {
6699       if (info.resource.id())
6700          info.resource = bld.as_uniform(info.resource);
6701       info.offset = Operand(bld.as_uniform(info.offset));
6702       emit_load(ctx, bld, info, smem_load_params);
6703    }
6704 }
6705
6706 void
6707 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6708 {
6709    Builder bld(ctx->program, ctx->block);
6710    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6711    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6712
6713    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6714    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6715    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6716               ctx->program->gfx_level < GFX11;
6717
6718    unsigned write_count = 0;
6719    Temp write_datas[32];
6720    unsigned offsets[32];
6721    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6722                       write_datas, offsets);
6723
6724    Temp addr, offset;
6725    uint32_t const_offset;
6726    parse_global(ctx, instr, &addr, &const_offset, &offset);
6727
6728    for (unsigned i = 0; i < write_count; i++) {
6729       Temp write_address = addr;
6730       uint32_t write_const_offset = const_offset;
6731       Temp write_offset = offset;
6732       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6733
6734       if (ctx->options->gfx_level >= GFX7) {
6735          bool global = ctx->options->gfx_level >= GFX9;
6736          aco_opcode op;
6737          switch (write_datas[i].bytes()) {
6738          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6739          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6740          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6741          case 8:
6742             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6743             break;
6744          case 12:
6745             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6746             break;
6747          case 16:
6748             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6749             break;
6750          default: unreachable("store_global not implemented for this size.");
6751          }
6752
6753          aco_ptr<FLAT_instruction> flat{
6754             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6755          if (write_address.regClass() == s2) {
6756             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6757             flat->operands[0] = Operand(write_offset);
6758             flat->operands[1] = Operand(write_address);
6759          } else {
6760             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6761             flat->operands[0] = Operand(write_address);
6762             flat->operands[1] = Operand(s1);
6763          }
6764          flat->operands[2] = Operand(write_datas[i]);
6765          flat->glc = glc;
6766          flat->dlc = false;
6767          assert(global || !write_const_offset);
6768          flat->offset = write_const_offset;
6769          flat->disable_wqm = true;
6770          flat->sync = sync;
6771          ctx->program->needs_exact = true;
6772          ctx->block->instructions.emplace_back(std::move(flat));
6773       } else {
6774          assert(ctx->options->gfx_level == GFX6);
6775
6776          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6777
6778          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6779
6780          aco_ptr<MUBUF_instruction> mubuf{
6781             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6782          mubuf->operands[0] = Operand(rsrc);
6783          mubuf->operands[1] =
6784             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6785          mubuf->operands[2] = Operand(write_offset);
6786          mubuf->operands[3] = Operand(write_datas[i]);
6787          mubuf->glc = glc;
6788          mubuf->dlc = false;
6789          mubuf->offset = write_const_offset;
6790          mubuf->addr64 = write_address.type() == RegType::vgpr;
6791          mubuf->disable_wqm = true;
6792          mubuf->sync = sync;
6793          ctx->program->needs_exact = true;
6794          ctx->block->instructions.emplace_back(std::move(mubuf));
6795       }
6796    }
6797 }
6798
6799 void
6800 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6801 {
6802    Builder bld(ctx->program, ctx->block);
6803    bool return_previous = !nir_def_is_unused(&instr->def);
6804    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6805
6806    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6807    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6808
6809    if (cmpswap)
6810       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6811                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6812
6813    Temp dst = get_ssa_temp(ctx, &instr->def);
6814
6815    aco_opcode op32, op64;
6816
6817    Temp addr, offset;
6818    uint32_t const_offset;
6819    parse_global(ctx, instr, &addr, &const_offset, &offset);
6820    lower_global_address(bld, 0, &addr, &const_offset, &offset);
6821
6822    if (ctx->options->gfx_level >= GFX7) {
6823       bool global = ctx->options->gfx_level >= GFX9;
6824       switch (nir_op) {
6825       case nir_atomic_op_iadd:
6826          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6827          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6828          break;
6829       case nir_atomic_op_imin:
6830          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6831          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6832          break;
6833       case nir_atomic_op_umin:
6834          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6835          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6836          break;
6837       case nir_atomic_op_imax:
6838          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6839          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6840          break;
6841       case nir_atomic_op_umax:
6842          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6843          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6844          break;
6845       case nir_atomic_op_iand:
6846          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6847          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6848          break;
6849       case nir_atomic_op_ior:
6850          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6851          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6852          break;
6853       case nir_atomic_op_ixor:
6854          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6855          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6856          break;
6857       case nir_atomic_op_xchg:
6858          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6859          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6860          break;
6861       case nir_atomic_op_cmpxchg:
6862          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6863          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6864          break;
6865       case nir_atomic_op_fadd:
6866          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
6867          op64 = aco_opcode::num_opcodes;
6868          break;
6869       case nir_atomic_op_fmin:
6870          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6871          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6872          break;
6873       case nir_atomic_op_fmax:
6874          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6875          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6876          break;
6877       default: unreachable("unsupported atomic operation");
6878       }
6879
6880       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6881       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6882          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6883       if (addr.regClass() == s2) {
6884          assert(global && offset.id() && offset.type() == RegType::vgpr);
6885          flat->operands[0] = Operand(offset);
6886          flat->operands[1] = Operand(addr);
6887       } else {
6888          assert(addr.type() == RegType::vgpr && !offset.id());
6889          flat->operands[0] = Operand(addr);
6890          flat->operands[1] = Operand(s1);
6891       }
6892       flat->operands[2] = Operand(data);
6893       if (return_previous)
6894          flat->definitions[0] = Definition(dst);
6895       flat->glc = return_previous;
6896       flat->dlc = false; /* Not needed for atomics */
6897       assert(global || !const_offset);
6898       flat->offset = const_offset;
6899       flat->disable_wqm = true;
6900       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6901       ctx->program->needs_exact = true;
6902       ctx->block->instructions.emplace_back(std::move(flat));
6903    } else {
6904       assert(ctx->options->gfx_level == GFX6);
6905
6906       UNUSED aco_opcode image_op;
6907       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6908
6909       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6910
6911       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6912
6913       aco_ptr<MUBUF_instruction> mubuf{
6914          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6915       mubuf->operands[0] = Operand(rsrc);
6916       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6917       mubuf->operands[2] = Operand(offset);
6918       mubuf->operands[3] = Operand(data);
6919       Definition def =
6920          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6921       if (return_previous)
6922          mubuf->definitions[0] = def;
6923       mubuf->glc = return_previous;
6924       mubuf->dlc = false;
6925       mubuf->offset = const_offset;
6926       mubuf->addr64 = addr.type() == RegType::vgpr;
6927       mubuf->disable_wqm = true;
6928       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6929       ctx->program->needs_exact = true;
6930       ctx->block->instructions.emplace_back(std::move(mubuf));
6931       if (return_previous && cmpswap)
6932          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6933    }
6934 }
6935
6936 unsigned
6937 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
6938 {
6939    unsigned storage = storage_none;
6940
6941    if (mem_mode & nir_var_shader_out)
6942       storage |= storage_vmem_output;
6943    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
6944       storage |= storage_buffer;
6945    if (mem_mode & nir_var_mem_task_payload)
6946       storage |= storage_task_payload;
6947    if (mem_mode & nir_var_mem_shared)
6948       storage |= storage_shared;
6949    if (mem_mode & nir_var_image)
6950       storage |= storage_image;
6951
6952    return storage;
6953 }
6954
6955 void
6956 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6957 {
6958    Builder bld(ctx->program, ctx->block);
6959
6960    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6961    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6962    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6963                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
6964    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
6965    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
6966
6967    Temp dst = get_ssa_temp(ctx, &intrin->def);
6968    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
6969    Temp v_offset =
6970       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
6971    Temp s_offset =
6972       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
6973    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
6974
6975    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
6976    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
6977
6978    unsigned const_offset = nir_intrinsic_base(intrin);
6979    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
6980    unsigned num_components = intrin->def.num_components;
6981
6982    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6983    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
6984
6985    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
6986    info.idx = idx;
6987    info.glc = glc;
6988    info.slc = slc;
6989    info.soffset = s_offset;
6990    info.const_offset = const_offset;
6991    info.sync = sync;
6992
6993    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
6994       const pipe_format format = nir_intrinsic_format(intrin);
6995       const struct ac_vtx_format_info* vtx_info =
6996          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
6997       const struct util_format_description* f = util_format_description(format);
6998       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
6999       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
7000
7001       /* Avoid splitting:
7002        * - non-array formats because that would result in incorrect code
7003        * - when element size is same as component size (to reduce instruction count)
7004        */
7005       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7006
7007       info.align_mul = align_mul;
7008       info.align_offset = align_offset;
7009       info.format = format;
7010       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7011       info.split_by_component_stride = false;
7012
7013       emit_load(ctx, bld, info, mtbuf_load_params);
7014    } else {
7015       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7016
7017       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7018          assert(!swizzled);
7019
7020          emit_load(ctx, bld, info, mubuf_load_format_params);
7021       } else {
7022          const unsigned swizzle_element_size =
7023             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7024
7025          info.component_stride = swizzle_element_size;
7026          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7027          info.align_mul = MIN2(elem_size_bytes, 4);
7028          info.align_offset = 0;
7029
7030          emit_load(ctx, bld, info, mubuf_load_params);
7031       }
7032    }
7033 }
7034
7035 void
7036 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7037 {
7038    Builder bld(ctx->program, ctx->block);
7039
7040    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7041    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7042    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7043                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7044    bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7045    bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
7046
7047    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7048    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7049    Temp v_offset =
7050       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
7051    Temp s_offset =
7052       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7053    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7054
7055    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7056    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7057
7058    unsigned const_offset = nir_intrinsic_base(intrin);
7059    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7060    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7061
7062    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7063    /* GS outputs are only written once. */
7064    const bool written_once =
7065       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7066    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7067                          written_once ? semantic_can_reorder : semantic_none);
7068
7069    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset,
7070                     elem_size_bytes, write_mask, swizzled, sync, glc, slc);
7071 }
7072
7073 void
7074 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7075 {
7076    Builder bld(ctx->program, ctx->block);
7077    Temp dst = get_ssa_temp(ctx, &instr->def);
7078    Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7079    Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7080
7081    /* If base address is 32bit, convert to 64bit with the high 32bit part. */
7082    if (base.bytes() == 4) {
7083       base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7084                         Operand::c32(ctx->options->address32_hi));
7085    }
7086
7087    aco_opcode opcode = aco_opcode::s_load_dword;
7088    unsigned size = 1;
7089
7090    assert(dst.bytes() <= 64);
7091
7092    if (dst.bytes() > 32) {
7093       opcode = aco_opcode::s_load_dwordx16;
7094       size = 16;
7095    } else if (dst.bytes() > 16) {
7096       opcode = aco_opcode::s_load_dwordx8;
7097       size = 8;
7098    } else if (dst.bytes() > 8) {
7099       opcode = aco_opcode::s_load_dwordx4;
7100       size = 4;
7101    } else if (dst.bytes() > 4) {
7102       opcode = aco_opcode::s_load_dwordx2;
7103       size = 2;
7104    }
7105
7106    if (dst.size() != size) {
7107       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7108                  bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7109    } else {
7110       bld.smem(opcode, Definition(dst), base, offset);
7111    }
7112    emit_split_vector(ctx, dst, instr->def.num_components);
7113 }
7114
7115 sync_scope
7116 translate_nir_scope(mesa_scope scope)
7117 {
7118    switch (scope) {
7119    case SCOPE_NONE:
7120    case SCOPE_INVOCATION: return scope_invocation;
7121    case SCOPE_SUBGROUP: return scope_subgroup;
7122    case SCOPE_WORKGROUP: return scope_workgroup;
7123    case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7124    case SCOPE_DEVICE: return scope_device;
7125    case SCOPE_SHADER_CALL: return scope_invocation;
7126    }
7127    unreachable("invalid scope");
7128 }
7129
7130 void
7131 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7132 {
7133    Builder bld(ctx->program, ctx->block);
7134
7135    unsigned storage_allowed = storage_buffer | storage_image;
7136    unsigned semantics = 0;
7137    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7138    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7139
7140    /* We use shared storage for the following:
7141     * - compute shaders expose it in their API
7142     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7143     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7144     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7145     */
7146    bool shared_storage_used =
7147       ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7148       ctx->stage.hw == AC_HW_HULL_SHADER ||
7149       (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7150       ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7151
7152    if (shared_storage_used)
7153       storage_allowed |= storage_shared;
7154
7155    /* Task payload: Task Shader output, Mesh Shader input */
7156    if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7157       storage_allowed |= storage_task_payload;
7158
7159    /* Allow VMEM output for all stages that can have outputs. */
7160    if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7161        ctx->stage.has(SWStage::TS))
7162       storage_allowed |= storage_vmem_output;
7163
7164    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7165     * They are allowed in CS, TCS, and in any NGG shader.
7166     */
7167    ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7168                                            ctx->stage.hw == AC_HW_HULL_SHADER ||
7169                                            ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7170
7171    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7172    unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7173    storage &= storage_allowed;
7174
7175    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7176    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7177       semantics |= semantic_acquire | semantic_release;
7178    if (nir_semantics & NIR_MEMORY_RELEASE)
7179       semantics |= semantic_acquire | semantic_release;
7180
7181    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7182    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7183
7184    bld.barrier(aco_opcode::p_barrier,
7185                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7186                exec_scope);
7187 }
7188
7189 void
7190 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7191 {
7192    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7193    Temp dst = get_ssa_temp(ctx, &instr->def);
7194    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7195    Builder bld(ctx->program, ctx->block);
7196
7197    unsigned elem_size_bytes = instr->def.bit_size / 8;
7198    unsigned num_components = instr->def.num_components;
7199    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7200    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7201 }
7202
7203 void
7204 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7205 {
7206    unsigned writemask = nir_intrinsic_write_mask(instr);
7207    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7208    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7209    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7210
7211    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7212    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7213 }
7214
7215 void
7216 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7217 {
7218    unsigned offset = nir_intrinsic_base(instr);
7219    Builder bld(ctx->program, ctx->block);
7220    Operand m = load_lds_size_m0(bld);
7221    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7222    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7223
7224    unsigned num_operands = 3;
7225    aco_opcode op32, op64, op32_rtn, op64_rtn;
7226    switch (nir_intrinsic_atomic_op(instr)) {
7227    case nir_atomic_op_iadd:
7228       op32 = aco_opcode::ds_add_u32;
7229       op64 = aco_opcode::ds_add_u64;
7230       op32_rtn = aco_opcode::ds_add_rtn_u32;
7231       op64_rtn = aco_opcode::ds_add_rtn_u64;
7232       break;
7233    case nir_atomic_op_imin:
7234       op32 = aco_opcode::ds_min_i32;
7235       op64 = aco_opcode::ds_min_i64;
7236       op32_rtn = aco_opcode::ds_min_rtn_i32;
7237       op64_rtn = aco_opcode::ds_min_rtn_i64;
7238       break;
7239    case nir_atomic_op_umin:
7240       op32 = aco_opcode::ds_min_u32;
7241       op64 = aco_opcode::ds_min_u64;
7242       op32_rtn = aco_opcode::ds_min_rtn_u32;
7243       op64_rtn = aco_opcode::ds_min_rtn_u64;
7244       break;
7245    case nir_atomic_op_imax:
7246       op32 = aco_opcode::ds_max_i32;
7247       op64 = aco_opcode::ds_max_i64;
7248       op32_rtn = aco_opcode::ds_max_rtn_i32;
7249       op64_rtn = aco_opcode::ds_max_rtn_i64;
7250       break;
7251    case nir_atomic_op_umax:
7252       op32 = aco_opcode::ds_max_u32;
7253       op64 = aco_opcode::ds_max_u64;
7254       op32_rtn = aco_opcode::ds_max_rtn_u32;
7255       op64_rtn = aco_opcode::ds_max_rtn_u64;
7256       break;
7257    case nir_atomic_op_iand:
7258       op32 = aco_opcode::ds_and_b32;
7259       op64 = aco_opcode::ds_and_b64;
7260       op32_rtn = aco_opcode::ds_and_rtn_b32;
7261       op64_rtn = aco_opcode::ds_and_rtn_b64;
7262       break;
7263    case nir_atomic_op_ior:
7264       op32 = aco_opcode::ds_or_b32;
7265       op64 = aco_opcode::ds_or_b64;
7266       op32_rtn = aco_opcode::ds_or_rtn_b32;
7267       op64_rtn = aco_opcode::ds_or_rtn_b64;
7268       break;
7269    case nir_atomic_op_ixor:
7270       op32 = aco_opcode::ds_xor_b32;
7271       op64 = aco_opcode::ds_xor_b64;
7272       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7273       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7274       break;
7275    case nir_atomic_op_xchg:
7276       op32 = aco_opcode::ds_write_b32;
7277       op64 = aco_opcode::ds_write_b64;
7278       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7279       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7280       break;
7281    case nir_atomic_op_cmpxchg:
7282       op32 = aco_opcode::ds_cmpst_b32;
7283       op64 = aco_opcode::ds_cmpst_b64;
7284       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7285       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7286       num_operands = 4;
7287       break;
7288    case nir_atomic_op_fadd:
7289       op32 = aco_opcode::ds_add_f32;
7290       op32_rtn = aco_opcode::ds_add_rtn_f32;
7291       op64 = aco_opcode::num_opcodes;
7292       op64_rtn = aco_opcode::num_opcodes;
7293       break;
7294    case nir_atomic_op_fmin:
7295       op32 = aco_opcode::ds_min_f32;
7296       op32_rtn = aco_opcode::ds_min_rtn_f32;
7297       op64 = aco_opcode::ds_min_f64;
7298       op64_rtn = aco_opcode::ds_min_rtn_f64;
7299       break;
7300    case nir_atomic_op_fmax:
7301       op32 = aco_opcode::ds_max_f32;
7302       op32_rtn = aco_opcode::ds_max_rtn_f32;
7303       op64 = aco_opcode::ds_max_f64;
7304       op64_rtn = aco_opcode::ds_max_rtn_f64;
7305       break;
7306    default: unreachable("Unhandled shared atomic intrinsic");
7307    }
7308
7309    bool return_previous = !nir_def_is_unused(&instr->def);
7310
7311    aco_opcode op;
7312    if (data.size() == 1) {
7313       assert(instr->def.bit_size == 32);
7314       op = return_previous ? op32_rtn : op32;
7315    } else {
7316       assert(instr->def.bit_size == 64);
7317       op = return_previous ? op64_rtn : op64;
7318    }
7319
7320    if (offset > 65535) {
7321       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7322       offset = 0;
7323    }
7324
7325    aco_ptr<DS_instruction> ds;
7326    ds.reset(
7327       create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7328    ds->operands[0] = Operand(address);
7329    ds->operands[1] = Operand(data);
7330    if (num_operands == 4) {
7331       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7332       ds->operands[2] = Operand(data2);
7333       if (bld.program->gfx_level >= GFX11)
7334          std::swap(ds->operands[1], ds->operands[2]);
7335    }
7336    ds->operands[num_operands - 1] = m;
7337    ds->offset0 = offset;
7338    if (return_previous)
7339       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7340    ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7341
7342    if (m.isUndefined())
7343       ds->operands.pop_back();
7344
7345    ctx->block->instructions.emplace_back(std::move(ds));
7346 }
7347
7348 void
7349 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7350 {
7351    bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7352    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7353    Builder bld(ctx->program, ctx->block);
7354
7355    assert(bld.program->gfx_level >= GFX7);
7356
7357    bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7358    uint8_t offset0 = nir_intrinsic_offset0(instr);
7359    uint8_t offset1 = nir_intrinsic_offset1(instr);
7360    bool st64 = nir_intrinsic_st64(instr);
7361
7362    Operand m = load_lds_size_m0(bld);
7363    Instruction* ds;
7364    if (is_store) {
7365       aco_opcode op = st64
7366                          ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7367                          : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7368       Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7369       RegClass comp_rc = is64bit ? v2 : v1;
7370       Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7371       Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7372       ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7373    } else {
7374       Temp dst = get_ssa_temp(ctx, &instr->def);
7375       Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7376       aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7377                            : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7378       ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7379    }
7380    ds->ds().sync = memory_sync_info(storage_shared);
7381    if (m.isUndefined())
7382       ds->operands.pop_back();
7383
7384    if (!is_store) {
7385       Temp dst = get_ssa_temp(ctx, &instr->def);
7386       if (dst.type() == RegType::sgpr) {
7387          emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7388          Temp comp[4];
7389          /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7390          for (unsigned i = 0; i < dst.size(); i++)
7391             comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7392          if (is64bit) {
7393             Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7394             Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7395             ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7396             ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7397             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7398             ctx->allocated_vec[dst.id()] = {comp0, comp1};
7399          } else {
7400             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7401          }
7402       }
7403
7404       emit_split_vector(ctx, dst, 2);
7405    }
7406 }
7407
7408 Temp
7409 get_scratch_resource(isel_context* ctx)
7410 {
7411    Builder bld(ctx->program, ctx->block);
7412    Temp scratch_addr = ctx->program->private_segment_buffer;
7413    if (!scratch_addr.bytes()) {
7414       Temp addr_lo =
7415          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7416       Temp addr_hi =
7417          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7418       scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7419    } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7420       scratch_addr =
7421          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7422    }
7423
7424    uint32_t rsrc_conf =
7425       S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7426
7427    if (ctx->program->gfx_level >= GFX10) {
7428       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7429                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
7430                    S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
7431    } else if (ctx->program->gfx_level <=
7432               GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7433       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7434                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7435    }
7436
7437    /* older generations need element size = 4 bytes. element size removed in GFX9 */
7438    if (ctx->program->gfx_level <= GFX8)
7439       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7440
7441    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7442                      Operand::c32(rsrc_conf));
7443 }
7444
7445 void
7446 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7447 {
7448    Builder bld(ctx->program, ctx->block);
7449    Temp dst = get_ssa_temp(ctx, &instr->def);
7450
7451    LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7452    info.align_mul = nir_intrinsic_align_mul(instr);
7453    info.align_offset = nir_intrinsic_align_offset(instr);
7454    info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7455    info.sync = memory_sync_info(storage_scratch, semantic_private);
7456    if (ctx->program->gfx_level >= GFX9) {
7457       if (nir_src_is_const(instr->src[0])) {
7458          uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7459          info.offset =
7460             bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7461          info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7462       } else {
7463          info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7464       }
7465       EmitLoadParameters params = scratch_flat_load_params;
7466       params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7467       emit_load(ctx, bld, info, params);
7468    } else {
7469       info.resource = get_scratch_resource(ctx);
7470       info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7471       info.soffset = ctx->program->scratch_offset;
7472       emit_load(ctx, bld, info, scratch_mubuf_load_params);
7473    }
7474 }
7475
7476 void
7477 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7478 {
7479    Builder bld(ctx->program, ctx->block);
7480    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7481    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7482
7483    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7484    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7485
7486    unsigned write_count = 0;
7487    Temp write_datas[32];
7488    unsigned offsets[32];
7489    unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7490    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7491                       &write_count, write_datas, offsets);
7492
7493    if (ctx->program->gfx_level >= GFX9) {
7494       uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7495       offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7496       uint32_t base_const_offset =
7497          nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7498
7499       for (unsigned i = 0; i < write_count; i++) {
7500          aco_opcode op;
7501          switch (write_datas[i].bytes()) {
7502          case 1: op = aco_opcode::scratch_store_byte; break;
7503          case 2: op = aco_opcode::scratch_store_short; break;
7504          case 4: op = aco_opcode::scratch_store_dword; break;
7505          case 8: op = aco_opcode::scratch_store_dwordx2; break;
7506          case 12: op = aco_opcode::scratch_store_dwordx3; break;
7507          case 16: op = aco_opcode::scratch_store_dwordx4; break;
7508          default: unreachable("Unexpected store size");
7509          }
7510
7511          uint32_t const_offset = base_const_offset + offsets[i];
7512          assert(const_offset < max || offset.id() == 0);
7513
7514          Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7515          Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7516          if (offset.id() == 0)
7517             saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7518
7519          bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7520                      memory_sync_info(storage_scratch, semantic_private));
7521       }
7522    } else {
7523       Temp rsrc = get_scratch_resource(ctx);
7524       offset = as_vgpr(ctx, offset);
7525       for (unsigned i = 0; i < write_count; i++) {
7526          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7527          Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7528                                         write_datas[i], offsets[i], true, true);
7529          mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7530       }
7531    }
7532 }
7533
7534 void
7535 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src, Temp dst)
7536 {
7537    Builder bld(ctx->program, ctx->block);
7538    assert(dst.regClass() == bld.lm);
7539
7540    if (cluster_size == 1) {
7541       bld.copy(Definition(dst), src);
7542    }
7543    if (op == nir_op_iand && cluster_size == 4) {
7544       /* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
7545       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7546       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
7547       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc),
7548                bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7549    } else if (op == nir_op_ior && cluster_size == 4) {
7550       /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7551       bld.sop1(
7552          Builder::s_wqm, Definition(dst), bld.def(s1, scc),
7553          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7554    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7555       /* subgroupAnd(val) -> (~val & exec) == 0 */
7556       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7557       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
7558                .def(1)
7559                .getTemp();
7560       Temp cond = bool_to_vector_condition(ctx, tmp);
7561       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7562    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7563       /* subgroupOr(val) -> (val & exec) != 0 */
7564       Temp tmp =
7565          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7566             .def(1)
7567             .getTemp();
7568       bool_to_vector_condition(ctx, tmp, dst);
7569    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7570       /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7571       Temp tmp =
7572          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7573       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7574       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7575                .def(1)
7576                .getTemp();
7577       bool_to_vector_condition(ctx, tmp, dst);
7578    } else {
7579       /* subgroupClustered{And,Or,Xor}(val, n):
7580        *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7581        *   cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)
7582        * subgroupClusteredAnd():
7583        *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7584        * subgroupClusteredOr():
7585        *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7586        * subgroupClusteredXor():
7587        *   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7588        */
7589       Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7590       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7591                                      Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7592
7593       Temp tmp;
7594       if (op == nir_op_iand)
7595          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7596                         Operand(exec, bld.lm));
7597       else
7598          tmp =
7599             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7600
7601       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7602
7603       if (ctx->program->gfx_level <= GFX7)
7604          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7605       else if (ctx->program->wave_size == 64)
7606          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7607       else
7608          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7609       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7610       if (cluster_mask != 0xffffffff)
7611          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7612
7613       if (op == nir_op_iand) {
7614          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::c32(cluster_mask), tmp);
7615       } else if (op == nir_op_ior) {
7616          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
7617       } else if (op == nir_op_ixor) {
7618          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7619                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7620          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
7621       }
7622    }
7623 }
7624
7625 void
7626 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
7627 {
7628    Builder bld(ctx->program, ctx->block);
7629    assert(src.regClass() == bld.lm);
7630
7631    /* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0
7632     * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7633     * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7634     */
7635    if (op == nir_op_iand)
7636       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7637
7638    Temp tmp =
7639       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7640
7641    Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7642
7643    if (op == nir_op_iand)
7644       bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), mbcnt);
7645    else if (op == nir_op_ior)
7646       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), mbcnt);
7647    else if (op == nir_op_ixor)
7648       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(),
7649                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7650 }
7651
7652 void
7653 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src, Temp dst)
7654 {
7655    Builder bld(ctx->program, ctx->block);
7656
7657    /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7658     * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7659     * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7660     */
7661    Temp tmp = bld.tmp(bld.lm);
7662    emit_boolean_exclusive_scan(ctx, op, src, tmp);
7663    if (op == nir_op_iand)
7664       bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, src);
7665    else if (op == nir_op_ior)
7666       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), tmp, src);
7667    else if (op == nir_op_ixor)
7668       bld.sop2(Builder::s_xor, Definition(dst), bld.def(s1, scc), tmp, src);
7669 }
7670
7671 ReduceOp
7672 get_reduce_op(nir_op op, unsigned bit_size)
7673 {
7674    switch (op) {
7675 #define CASEI(name)                                                                                \
7676    case nir_op_##name:                                                                             \
7677       return (bit_size == 32)   ? name##32                                                         \
7678              : (bit_size == 16) ? name##16                                                         \
7679              : (bit_size == 8)  ? name##8                                                          \
7680                                 : name##64;
7681 #define CASEF(name)                                                                                \
7682    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7683       CASEI(iadd)
7684       CASEI(imul)
7685       CASEI(imin)
7686       CASEI(umin)
7687       CASEI(imax)
7688       CASEI(umax)
7689       CASEI(iand)
7690       CASEI(ior)
7691       CASEI(ixor)
7692       CASEF(fadd)
7693       CASEF(fmul)
7694       CASEF(fmin)
7695       CASEF(fmax)
7696    default: unreachable("unknown reduction op");
7697 #undef CASEI
7698 #undef CASEF
7699    }
7700 }
7701
7702 void
7703 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7704 {
7705    Builder bld(ctx->program, ctx->block);
7706    Definition dst(get_ssa_temp(ctx, &instr->def));
7707    assert(dst.regClass().type() != RegType::vgpr);
7708    if (src.regClass().type() == RegType::vgpr)
7709       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7710    else
7711       bld.copy(dst, src);
7712 }
7713
7714 void
7715 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7716 {
7717    Builder bld(ctx->program, ctx->block);
7718    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7719
7720    if (op == nir_op_fadd) {
7721       src_tmp = as_vgpr(ctx, src_tmp);
7722       Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7723                                       : dst.getTemp();
7724
7725       if (src.ssa->bit_size == 16) {
7726          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7727          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7728       } else {
7729          assert(src.ssa->bit_size == 32);
7730          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7731          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7732       }
7733
7734       if (tmp != dst.getTemp())
7735          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7736
7737       return;
7738    }
7739
7740    if (dst.regClass() == s1)
7741       src_tmp = bld.as_uniform(src_tmp);
7742
7743    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7744       count =
7745          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7746    else if (op == nir_op_ixor)
7747       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7748
7749    assert(dst.getTemp().type() == count.type());
7750
7751    if (nir_src_is_const(src)) {
7752       if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7753          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7754       else if (nir_src_as_uint(src) == 1)
7755          bld.copy(dst, count);
7756       else if (nir_src_as_uint(src) == 0)
7757          bld.copy(dst, Operand::zero(dst.bytes()));
7758       else if (count.type() == RegType::vgpr)
7759          bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7760       else
7761          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7762    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7763       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7764    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7765       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7766    } else if (dst.getTemp().type() == RegType::vgpr) {
7767       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7768    } else {
7769       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7770    }
7771 }
7772
7773 bool
7774 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7775 {
7776    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7777    if (op == nir_op_imul || op == nir_op_fmul)
7778       return false;
7779
7780    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7781       Builder bld(ctx->program, ctx->block);
7782       Definition dst(get_ssa_temp(ctx, &instr->def));
7783       unsigned bit_size = instr->src[0].ssa->bit_size;
7784       if (bit_size > 32)
7785          return false;
7786
7787       Temp thread_count =
7788          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7789       set_wqm(ctx, nir_intrinsic_include_helpers(instr));
7790
7791       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7792    } else {
7793       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7794    }
7795
7796    return true;
7797 }
7798
7799 bool
7800 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7801 {
7802    Builder bld(ctx->program, ctx->block);
7803    Definition dst(get_ssa_temp(ctx, &instr->def));
7804    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7805    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7806
7807    if (op == nir_op_imul || op == nir_op_fmul)
7808       return false;
7809
7810    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7811       if (instr->src[0].ssa->bit_size > 32)
7812          return false;
7813
7814       Temp packed_tid;
7815       if (inc)
7816          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7817       else
7818          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7819       set_wqm(ctx);
7820
7821       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7822       return true;
7823    }
7824
7825    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7826           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7827
7828    if (inc) {
7829       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7830       return true;
7831    }
7832
7833    /* Copy the source and write the reduction operation identity to the first lane. */
7834    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7835    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7836    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7837    if (dst.bytes() == 8) {
7838       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7839       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7840       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7841       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7842
7843       lo =
7844          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7845       hi =
7846          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7847       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7848    } else {
7849       uint32_t identity = get_reduction_identity(reduce_op, 0);
7850       bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7851                     as_vgpr(ctx, src));
7852    }
7853
7854    set_wqm(ctx);
7855    return true;
7856 }
7857
7858 Temp
7859 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7860                      Definition dst, Temp src)
7861 {
7862    assert(src.bytes() <= 8);
7863    assert(src.type() == RegType::vgpr);
7864
7865    Builder bld(ctx->program, ctx->block);
7866
7867    unsigned num_defs = 0;
7868    Definition defs[5];
7869    defs[num_defs++] = dst;
7870    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7871
7872    /* scalar identity temporary */
7873    bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7874                      aco_op != aco_opcode::p_reduce;
7875    if (aco_op == aco_opcode::p_exclusive_scan) {
7876       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7877                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7878                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7879                      op == fmul64);
7880    }
7881    if (need_sitmp)
7882       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7883
7884    /* scc clobber */
7885    defs[num_defs++] = bld.def(s1, scc);
7886
7887    /* vcc clobber */
7888    bool clobber_vcc = false;
7889    if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7890       clobber_vcc = true;
7891    if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7892       clobber_vcc = true;
7893    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7894       clobber_vcc = true;
7895
7896    if (clobber_vcc)
7897       defs[num_defs++] = bld.def(bld.lm, vcc);
7898
7899    Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7900       aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7901    reduce->operands[0] = Operand(src);
7902    /* setup_reduce_temp will update these undef operands if needed */
7903    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7904    reduce->operands[2] = Operand(v1.as_linear());
7905    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7906
7907    reduce->reduce_op = op;
7908    reduce->cluster_size = cluster_size;
7909    bld.insert(std::move(reduce));
7910
7911    return dst.getTemp();
7912 }
7913
7914 Temp
7915 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
7916 {
7917    Builder bld(ctx->program, ctx->block);
7918
7919    Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
7920                                     bld.def(dst.regClass()), src);
7921
7922    switch (op) {
7923    case iadd8:
7924    case iadd16:
7925    case iadd32: return bld.vsub32(dst, scan, src);
7926    case ixor64:
7927    case iadd64: {
7928       Temp src00 = bld.tmp(v1);
7929       Temp src01 = bld.tmp(v1);
7930       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
7931       Temp src10 = bld.tmp(v1);
7932       Temp src11 = bld.tmp(v1);
7933       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
7934
7935       Temp lower = bld.tmp(v1);
7936       Temp upper = bld.tmp(v1);
7937       if (op == iadd64) {
7938          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
7939          bld.vsub32(Definition(upper), src01, src11, false, borrow);
7940       } else {
7941          bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
7942          bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
7943       }
7944       return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
7945    }
7946    case ixor8:
7947    case ixor16:
7948    case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
7949    default: unreachable("Unsupported op");
7950    }
7951 }
7952
7953 void
7954 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7955 {
7956    Builder bld(ctx->program, ctx->block);
7957    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7958    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7959
7960    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7961    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7962    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7963    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7964
7965    /* Build DD X/Y */
7966    if (ctx->program->gfx_level >= GFX8) {
7967       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7968       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7969       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7970       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7971       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7972       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7973    } else {
7974       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7975       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7976       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7977       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7978       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
7979
7980       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7981       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7982       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
7983       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7984       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7985    }
7986
7987    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7988    aco_opcode mad =
7989       ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7990    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7991    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7992    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7993    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7994    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
7995    set_wqm(ctx, true);
7996    return;
7997 }
7998
7999 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8000 Temp lanecount_to_mask(isel_context* ctx, Temp count);
8001 void pops_await_overlapped_waves(isel_context* ctx);
8002
8003 Temp
8004 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
8005 {
8006    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
8007    if (intrin == nir_intrinsic_load_barycentric_pixel ||
8008        intrin == nir_intrinsic_load_barycentric_at_offset) {
8009       return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
8010    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
8011       return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
8012    } else {
8013       assert(intrin == nir_intrinsic_load_barycentric_sample);
8014       return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
8015    }
8016 }
8017
8018 void
8019 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
8020                          unsigned wave_done, unsigned* offset0, unsigned* offset1)
8021 {
8022    unsigned ordered_count_index = index_operand & 0x3f;
8023    unsigned count_dword = (index_operand >> 24) & 0xf;
8024
8025    assert(ctx->options->gfx_level >= GFX10);
8026    assert(count_dword >= 1 && count_dword <= 4);
8027
8028    *offset0 = ordered_count_index << 2;
8029    *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
8030
8031    if (ctx->options->gfx_level < GFX11)
8032       *offset1 |= 3 /* GS shader type */ << 2;
8033 }
8034
/* Description of one MRT color export as passed to the fragment shader export
 * helpers (see create_fs_dual_src_export_gfx11()). */
8035 struct aco_export_mrt {
8036    Operand out[4]; /* one operand per channel; copied into the export pseudo's operands */
8037    unsigned enabled_channels; /* channel bitmask; its popcount sizes the dual-source result */
8038    unsigned target; /* NOTE(review): presumably the export target index - confirm at the users */
8039    bool compr; /* NOTE(review): presumably marks a compressed export - confirm at the users */
8040 };
8041
8042 static void
8043 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8044                                 const struct aco_export_mrt* mrt1)
8045 {
8046    Builder bld(ctx->program, ctx->block);
8047
8048    aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
8049       aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8050    for (unsigned i = 0; i < 4; i++) {
8051       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8052       exp->operands[i].setLateKill(true);
8053       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8054       exp->operands[i + 4].setLateKill(true);
8055    }
8056
8057    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8058    exp->definitions[0] = bld.def(type); /* mrt0 */
8059    exp->definitions[1] = bld.def(type); /* mrt1 */
8060    exp->definitions[2] = bld.def(bld.lm);
8061    exp->definitions[3] = bld.def(bld.lm);
8062    exp->definitions[4] = bld.def(bld.lm, vcc);
8063    exp->definitions[5] = bld.def(s1, scc);
8064    ctx->block->instructions.emplace_back(std::move(exp));
8065
8066    ctx->program->has_color_exports = true;
8067 }
8068
8069 void
8070 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8071 {
8072    Builder bld(ctx->program, ctx->block);
8073    switch (instr->intrinsic) {
8074    case nir_intrinsic_load_barycentric_sample:
8075    case nir_intrinsic_load_barycentric_pixel:
8076    case nir_intrinsic_load_barycentric_centroid: {
8077       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8078       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8079       assert(bary.size() == 2);
8080       Temp dst = get_ssa_temp(ctx, &instr->def);
8081       bld.copy(Definition(dst), bary);
8082       emit_split_vector(ctx, dst, 2);
8083       break;
8084    }
8085    case nir_intrinsic_load_barycentric_model: {
8086       Temp model = get_arg(ctx, ctx->args->pull_model);
8087       assert(model.size() == 3);
8088       Temp dst = get_ssa_temp(ctx, &instr->def);
8089       bld.copy(Definition(dst), model);
8090       emit_split_vector(ctx, dst, 3);
8091       break;
8092    }
8093    case nir_intrinsic_load_barycentric_at_offset: {
8094       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8095       RegClass rc = RegClass(offset.type(), 1);
8096       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8097       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8098       Temp bary = get_interp_param(ctx, instr->intrinsic,
8099                                    (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8100       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8101       break;
8102    }
8103    case nir_intrinsic_load_front_face: {
8104       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8105                Operand::zero(), get_arg(ctx, ctx->args->front_face));
8106       break;
8107    }
8108    case nir_intrinsic_load_view_index: {
8109       Temp dst = get_ssa_temp(ctx, &instr->def);
8110       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8111       break;
8112    }
8113    case nir_intrinsic_load_frag_coord: {
8114       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8115       break;
8116    }
8117    case nir_intrinsic_load_frag_shading_rate:
8118       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8119       break;
8120    case nir_intrinsic_load_sample_pos: {
8121       Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8122       Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8123       bld.pseudo(
8124          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8125          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8126          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8127       break;
8128    }
8129    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8130    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8131    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8132    case nir_intrinsic_load_input:
8133    case nir_intrinsic_load_input_vertex:
8134       if (ctx->program->stage == fragment_fs)
8135          visit_load_fs_input(ctx, instr);
8136       else
8137          isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8138       break;
8139    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8140    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8141    case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8142    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8143    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8144    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8145    case nir_intrinsic_shared_atomic:
8146    case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8147    case nir_intrinsic_load_shared2_amd:
8148    case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8149    case nir_intrinsic_bindless_image_load:
8150    case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8151    case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8152    case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8153    case nir_intrinsic_bindless_image_atomic:
8154    case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8155    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8156    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8157    case nir_intrinsic_load_typed_buffer_amd:
8158    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8159    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8160    case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8161    case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8162    case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8163    case nir_intrinsic_global_atomic_amd:
8164    case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8165    case nir_intrinsic_ssbo_atomic:
8166    case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8167    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8168    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8169    case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8170    case nir_intrinsic_load_num_workgroups: {
8171       Temp dst = get_ssa_temp(ctx, &instr->def);
8172       if (ctx->options->load_grid_size_from_user_sgpr) {
8173          bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8174       } else {
8175          Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8176          assert(addr.regClass() == s2);
8177          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8178                     bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8179                     bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8180       }
8181       emit_split_vector(ctx, dst, 3);
8182       break;
8183    }
8184    case nir_intrinsic_load_ray_launch_size: {
8185       Temp dst = get_ssa_temp(ctx, &instr->def);
8186       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_size)));
8187       emit_split_vector(ctx, dst, 3);
8188       break;
8189    }
8190    case nir_intrinsic_load_ray_launch_id: {
8191       Temp dst = get_ssa_temp(ctx, &instr->def);
8192       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_id)));
8193       emit_split_vector(ctx, dst, 3);
8194       break;
8195    }
8196    case nir_intrinsic_load_ray_launch_size_addr_amd: {
8197       Temp dst = get_ssa_temp(ctx, &instr->def);
8198       Temp addr = get_arg(ctx, ctx->args->rt.launch_size_addr);
8199       assert(addr.regClass() == s2);
8200       bld.copy(Definition(dst), Operand(addr));
8201       break;
8202    }
8203    case nir_intrinsic_load_local_invocation_id: {
8204       Temp dst = get_ssa_temp(ctx, &instr->def);
8205       if (ctx->options->gfx_level >= GFX11) {
8206          Temp local_ids[3];
8207
8208          /* Thread IDs are packed in VGPR0, 10 bits per component. */
8209          for (uint32_t i = 0; i < 3; i++) {
8210             if (i == 0 && ctx->shader->info.workgroup_size[1] == 1 &&
8211                 ctx->shader->info.workgroup_size[2] == 1 &&
8212                 !ctx->shader->info.workgroup_size_variable) {
8213                local_ids[i] = get_arg(ctx, ctx->args->local_invocation_ids);
8214             } else if (i == 2 || (i == 1 && ctx->shader->info.workgroup_size[2] == 1 &&
8215                                   !ctx->shader->info.workgroup_size_variable)) {
8216                local_ids[i] =
8217                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(i * 10u),
8218                            get_arg(ctx, ctx->args->local_invocation_ids));
8219             } else {
8220                local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8221                                        get_arg(ctx, ctx->args->local_invocation_ids),
8222                                        Operand::c32(i * 10u), Operand::c32(10u));
8223             }
8224          }
8225
8226          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8227                     local_ids[2]);
8228       } else {
8229          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8230       }
8231       emit_split_vector(ctx, dst, 3);
8232       break;
8233    }
8234    case nir_intrinsic_load_workgroup_id: {
8235       Temp dst = get_ssa_temp(ctx, &instr->def);
8236       if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8237          const struct ac_arg* ids = ctx->args->workgroup_ids;
8238          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8239                     ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
8240                     ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
8241                     ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
8242          emit_split_vector(ctx, dst, 3);
8243       } else {
8244          isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8245       }
8246       break;
8247    }
8248    case nir_intrinsic_load_local_invocation_index: {
8249       if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8250          if (ctx->options->gfx_level >= GFX11) {
8251             /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
8252             Temp wave_id =
8253                bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8254                         get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8255
8256             Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8257                                  Operand::c32(ctx->program->wave_size));
8258             emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8259          } else {
8260             bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8261                      get_arg(ctx, ctx->args->vs_rel_patch_id));
8262          }
8263          break;
8264       } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8265                  ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8266          bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8267          break;
8268       } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8269          emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8270          break;
8271       }
8272
8273       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8274
8275       /* The tg_size bits [6:11] contain the subgroup id,
8276        * we need this multiplied by the wave size, and then OR the thread id to it.
8277        */
8278       if (ctx->program->wave_size == 64) {
8279          /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8280           * feed that to v_or */
8281          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8282                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8283          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8284       } else {
8285          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8286          Temp tg_num =
8287             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8288                      get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8289          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8290                   Operand::c32(0x5u), id);
8291       }
8292       break;
8293    }
8294    case nir_intrinsic_load_subgroup_invocation: {
8295       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8296       break;
8297    }
8298    case nir_intrinsic_ballot: {
           /* Builds a mask with one bit per invocation: set when the invocation is in exec
            * and its source value is true / non-zero. */
8299       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8300       Temp dst = get_ssa_temp(ctx, &instr->def);
8301
           /* Bring the source into lane-mask form: 1-bit sources must already be a lane mask,
            * 32-/64-bit VGPR sources are compared against zero. Any other combination is
            * reported through isel_err(). */
8302       if (instr->src[0].ssa->bit_size == 1) {
8303          assert(src.regClass() == bld.lm);
8304       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8305          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8306       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8307          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8308       } else {
8309          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8310       }
8311
8312       /* Make sure that all inactive lanes return zero.
8313        * Value-numbering might remove the comparison above */
           /* When dst has the lane-mask size the s_and writes it directly; otherwise the result
            * goes to a temporary that is widened below. */
8314       Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8315       src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8316       if (dst.size() != bld.lm.size()) {
8317          /* Wave32 with ballot size set to 64 */
              /* The upper half of the wider destination is filled with zero. */
8318          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8319       }
8320
8321       set_wqm(ctx);
8322       break;
8323    }
8324    case nir_intrinsic_inverse_ballot: {
           /* Reinterprets a uniform integer as a lane mask. The source is first made uniform;
            * the destination must have exactly the lane-mask size. */
8325       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8326       Temp dst = get_ssa_temp(ctx, &instr->def);
8327
8328       assert(dst.size() == bld.lm.size());
           /* Adapt the source width to the lane mask: keep only element 0 when the source is
            * wider, append a zero dword when it is narrower, plain copy when sizes match. */
8329       if (src.size() > dst.size()) {
8330          emit_extract_vector(ctx, src, 0, dst);
8331       } else if (src.size() < dst.size()) {
8332          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8333       } else {
8334          bld.copy(Definition(dst), src);
8335       }
8336       break;
8337    }
8338    case nir_intrinsic_shuffle:
8339    case nir_intrinsic_read_invocation: {
           /* Reads the value of src[0] from the invocation selected by src[1].
            * A non-divergent source is delegated to emit_uniform_subgroup(); otherwise the code
            * dispatches on the register class of the source and of the lane index. */
8340       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8341       if (!nir_src_is_divergent(instr->src[0])) {
8342          emit_uniform_subgroup(ctx, instr, src);
8343       } else {
8344          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
              /* read_invocation always uses a uniform lane index; shuffle only does so when
               * its index source is non-divergent. */
8345          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8346              !nir_src_is_divergent(instr->src[1]))
8347             tid = bld.as_uniform(tid);
8348          Temp dst = get_ssa_temp(ctx, &instr->def);
8349
              /* Booleans stay in lane-mask form; everything else is moved to VGPRs. */
8350          if (instr->def.bit_size != 1)
8351             src = as_vgpr(ctx, src);
8352
8353          if (src.regClass() == v1b || src.regClass() == v2b) {
                 /* Sub-dword values: permute, then keep only the low bytes (VGPR dst) or
                  * convert the result to a uniform value (SGPR dst).
                  * NOTE(review): the temp created by bld.tmp(v1) is overwritten on the next
                  * line without being used. */
8354             Temp tmp = bld.tmp(v1);
8355             tmp = emit_bpermute(ctx, bld, tid, src);
8356             if (dst.type() == RegType::vgpr)
8357                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8358                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8359             else
8360                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8361          } else if (src.regClass() == v1) {
8362             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8363             bld.copy(Definition(dst), tmp);
8364          } else if (src.regClass() == v2) {
                 /* 64-bit values: permute each dword separately and recombine. */
8365             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8366             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8367             lo = emit_bpermute(ctx, bld, tid, lo);
8368             hi = emit_bpermute(ctx, bld, tid, hi);
8369             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8370             emit_split_vector(ctx, dst, 2);
8371          } else if (instr->def.bit_size == 1 && tid.regClass() == s1) {
                 /* Boolean with a scalar lane index: test bit `tid` of the mask into SCC and
                  * turn that scalar condition back into a lane mask. */
8372             assert(src.regClass() == bld.lm);
8373             Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8374             bool_to_vector_condition(ctx, tmp, dst);
8375          } else if (instr->def.bit_size == 1 && tid.regClass() == v1) {
                 /* Boolean with a per-lane index: each lane shifts the mask right by its own
                  * index, isolates bit 0 and compares it against zero. GFX7 and older use the
                  * non-reversed 64-bit shift, newer chips the "rev" form sized by the wave. */
8376             assert(src.regClass() == bld.lm);
8377             Temp tmp;
8378             if (ctx->program->gfx_level <= GFX7)
8379                tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8380             else if (ctx->program->wave_size == 64)
8381                tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8382             else
8383                tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8384             tmp = emit_extract_vector(ctx, tmp, 0, v1);
8385             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8386             bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), tmp);
8387          } else {
8388             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8389          }
8390          set_wqm(ctx);
8391       }
8392       break;
8393    }
8394    case nir_intrinsic_load_sample_id: {
           /* The sample id is a bit-field of the `ancillary` shader argument, extracted with
            * v_bfe_u32 using the constants 8 and 4 (offset and width, in that operand order). */
8395       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8396                get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8397       break;
8398    }
8399    case nir_intrinsic_read_first_invocation: {
           /* Broadcasts the source value of the first active invocation. Dispatches on the
            * source register class. */
8400       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8401       Temp dst = get_ssa_temp(ctx, &instr->def);
8402       if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
              /* Up to one dword in a VGPR: a single v_readfirstlane_b32. */
8403          bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
8404       } else if (src.regClass() == v2) {
              /* 64-bit VGPR value: read each dword separately and recombine. */
8405          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8406          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8407          lo = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo);
8408          hi = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi);
8409          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8410          emit_split_vector(ctx, dst, 2);
8411       } else if (instr->def.bit_size == 1) {
              /* Boolean lane mask: s_ff1_i32 on exec yields a lane index, s_bitcmp1 tests that
               * bit of the source into SCC, and the scalar condition becomes a lane mask. */
8412          assert(src.regClass() == bld.lm);
8413          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8414                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8415          bool_to_vector_condition(ctx, tmp, dst);
8416       } else {
              /* Every remaining register class is passed through with a plain copy
               * (presumably values that are already uniform - TODO confirm). */
8417          bld.copy(Definition(dst), src);
8418       }
8419       set_wqm(ctx);
8420       break;
8421    }
8422    case nir_intrinsic_vote_all: {
           /* True when no invocation in exec has a false source. Both source and
            * destination are lane masks. */
8423       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8424       Temp dst = get_ssa_temp(ctx, &instr->def);
8425       assert(src.regClass() == bld.lm);
8426       assert(dst.regClass() == bld.lm);
8427
           /* Compute (~src & exec) and keep only the second definition of the s_and (its SCC
            * output), i.e. whether any lane in exec had a false source. */
8428       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8429       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8430                .def(1)
8431                .getTemp();
           /* Turn that scalar condition into a lane mask and invert it to get the result. */
8432       Temp cond = bool_to_vector_condition(ctx, tmp);
8433       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8434       set_wqm(ctx);
8435       break;
8436    }
8437    case nir_intrinsic_vote_any: {
           /* Both source and destination are lane masks. The source mask is collapsed into
            * a scalar condition by bool_to_scalar_condition() and expanded back into a lane mask
            * by bool_to_vector_condition(). */
8438       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8439       Temp dst = get_ssa_temp(ctx, &instr->def);
8440       assert(src.regClass() == bld.lm);
8441       assert(dst.regClass() == bld.lm);
8442
8443       Temp tmp = bool_to_scalar_condition(ctx, src);
8444       bool_to_vector_condition(ctx, tmp, dst);
8445       set_wqm(ctx);
8446       break;
8447    }
8448    case nir_intrinsic_reduce:
8449    case nir_intrinsic_inclusive_scan:
8450    case nir_intrinsic_exclusive_scan: {
           /* Subgroup reduction and inclusive/exclusive scan. Tries, in order: a fast path for
            * uniform sources, the boolean (lane-mask) implementations, a trivial copy for
            * cluster size 1, and finally the generic reduction pseudo-instructions. */
8451       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8452       Temp dst = get_ssa_temp(ctx, &instr->def);
8453       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
           /* Only reduce carries a cluster size. A size of 0 is replaced by the wave size;
            * the value is clamped to the wave size and rounded up to a power of two. */
8454       unsigned cluster_size =
8455          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8456       cluster_size = util_next_power_of_two(
8457          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
           /* Passed to set_wqm() at the end; only a reduce can request helper invocations. */
8458       bool create_helpers =
8459          instr->intrinsic == nir_intrinsic_reduce && nir_intrinsic_include_helpers(instr);
8460
           /* Fast path: non-boolean, non-divergent source reduced/scanned over the whole wave.
            * If the helper returns false, fall through to the generic code below. */
8461       if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8462           instr->def.bit_size != 1) {
8463          /* We use divergence analysis to assign the regclass, so check if it's
8464           * working as expected */
8465          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8466          if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8467             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8468          assert(instr->def.divergent == expected_divergent);
8469
8470          if (instr->intrinsic == nir_intrinsic_reduce) {
8471             if (emit_uniform_reduce(ctx, instr))
8472                break;
8473          } else if (emit_uniform_scan(ctx, instr)) {
8474             break;
8475          }
8476       }
8477
8478       if (instr->def.bit_size == 1) {
              /* On 1-bit values mul/min behave like AND, add like XOR and max like OR, so the
               * operation is canonicalised to one of those three. */
8479          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8480             op = nir_op_iand;
8481          else if (op == nir_op_iadd)
8482             op = nir_op_ixor;
8483          else if (op == nir_op_umax || op == nir_op_imax)
8484             op = nir_op_ior;
8485          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8486
8487          switch (instr->intrinsic) {
8488          case nir_intrinsic_reduce: emit_boolean_reduce(ctx, op, cluster_size, src, dst); break;
8489          case nir_intrinsic_exclusive_scan: emit_boolean_exclusive_scan(ctx, op, src, dst); break;
8490          case nir_intrinsic_inclusive_scan: emit_boolean_inclusive_scan(ctx, op, src, dst); break;
8491          default: assert(false);
8492          }
8493       } else if (cluster_size == 1) {
              /* A cluster of one invocation: the result is the source itself. */
8494          bld.copy(Definition(dst), src);
8495       } else {
8496          unsigned bit_size = instr->src[0].ssa->bit_size;
8497
              /* The generic path takes its source as a VGPR of bit_size / 8 bytes. */
8498          src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8499
8500          ReduceOp reduce_op = get_reduce_op(op, bit_size);
8501
8502          aco_opcode aco_op;
8503          switch (instr->intrinsic) {
8504          case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8505          case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8506          case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8507          default: unreachable("unknown reduce intrinsic");
8508          }
8509
8510          /* Avoid whole wave shift. */
              /* Applies only to exclusive iadd/ixor scans with a VGPR destination, which are
               * routed through inclusive_scan_to_exclusive() instead of p_exclusive_scan. */
8511          const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8512                                                   (op == nir_op_iadd || op == nir_op_ixor) &&
8513                                                   dst.type() == RegType::vgpr;
8514          if (use_inclusive_for_exclusive)
8515             inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8516          else
8517             emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8518       }
8519       set_wqm(ctx, create_helpers);
8520       break;
8521    }
8522    case nir_intrinsic_quad_broadcast:
8523    case nir_intrinsic_quad_swap_horizontal:
8524    case nir_intrinsic_quad_swap_vertical:
8525    case nir_intrinsic_quad_swap_diagonal:
8526    case nir_intrinsic_quad_swizzle_amd: {
           /* Quad (group of four invocations) data exchange. Each intrinsic is expressed as a
            * quad permutation control word that is applied either via DPP (GFX8+) or via
            * ds_swizzle_b32 (older chips). Boolean quad_broadcast has a SALU-only path. */
8527       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8528
           /* A non-divergent result is delegated to emit_uniform_subgroup(). */
8529       if (!instr->def.divergent) {
8530          emit_uniform_subgroup(ctx, instr, src);
8531          break;
8532       }
8533
8534       /* Quad broadcast lane. */
8535       unsigned lane = 0;
8536       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8537       bool bool_use_valu = instr->def.bit_size == 1;
8538
8539       uint16_t dpp_ctrl = 0;
8540
           /* allow_fi is forwarded to the DPP moves below; only quad_swizzle_amd can clear it
            * (when the intrinsic's fetch_inactive index is false). */
8541       bool allow_fi = true;
8542       switch (instr->intrinsic) {
8543       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8544       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8545       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8546       case nir_intrinsic_quad_swizzle_amd:
8547          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8548          allow_fi &= nir_intrinsic_fetch_inactive(instr);
8549          break;
8550       case nir_intrinsic_quad_broadcast:
              /* The lane index is read as a constant; this dereferences the constant value
               * unconditionally, so src[1] is assumed to be constant here. */
8551          lane = nir_src_as_const_value(instr->src[1])->u32;
8552          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8553          bool_use_valu = false;
8554          break;
8555       default: break;
8556       }
8557
8558       Temp dst = get_ssa_temp(ctx, &instr->def);
8559
8560       /* Setup source. */
           /* VALU booleans are expanded to 0 / -1 per lane in a VGPR; non-boolean values are
            * moved to VGPRs; a boolean quad_broadcast keeps its lane mask. */
8561       if (bool_use_valu)
8562          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8563                             Operand::c32(-1), src);
8564       else if (instr->def.bit_size != 1)
8565          src = as_vgpr(ctx, src);
8566
8567       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8568          /* Special case for quad broadcast using SALU only. */
8569          assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8570
              /* 0x11111111 << lane has exactly one bit set in every group of four bits, at
               * position `lane`. For a 64-bit lane mask the constant is duplicated into
               * both dwords. */
8571          uint32_t half_mask = 0x11111111u << lane;
8572          Operand mask_tmp = bld.lm.bytes() == 4
8573                                ? Operand::c32(half_mask)
8574                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8575                                             Operand::c32(half_mask), Operand::c32(half_mask));
8576
              /* Restrict the source to exec, keep only the selected lane of each quad, then let
               * s_wqm produce the final mask (expected to spread each set bit over its quad). */
8577          src =
8578             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8579          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8580          bld.sop1(Builder::s_wqm, Definition(dst), src);
8581       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
              /* Values of at most one dword. Sub-dword results and VALU booleans are produced
               * in a full-dword temporary first and narrowed / compared afterwards. */
8582          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8583          Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8584
8585          if (ctx->program->gfx_level >= GFX8)
8586             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8587          else
8588             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8589
8590          if (excess_bytes)
8591             bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8592                        bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8593          if (bool_use_valu)
8594             bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8595       } else if (instr->def.bit_size == 64) {
              /* 64-bit values: apply the same permutation to each dword and recombine. */
8596          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8597          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8598
8599          if (ctx->program->gfx_level >= GFX8) {
8600             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
8601                               allow_fi);
8602             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
8603                               allow_fi);
8604          } else {
8605             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8606             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8607          }
8608
8609          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8610          emit_split_vector(ctx, dst, 2);
8611       } else {
8612          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8613       }
8614
8615       /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8616       set_wqm(ctx, true);
8617       break;
8618    }
8619    case nir_intrinsic_masked_swizzle_amd: {
           /* Applies emit_masked_swizzle() with the intrinsic's swizzle mask and
            * fetch_inactive flag. A non-divergent result is delegated to
            * emit_uniform_subgroup(); otherwise the code dispatches on the destination class. */
8620       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8621       if (!instr->def.divergent) {
8622          emit_uniform_subgroup(ctx, instr, src);
8623          break;
8624       }
8625       Temp dst = get_ssa_temp(ctx, &instr->def);
8626       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8627       bool allow_fi = nir_intrinsic_fetch_inactive(instr);
8628
8629       if (instr->def.bit_size != 1)
8630          src = as_vgpr(ctx, src);
8631
8632       if (instr->def.bit_size == 1) {
              /* Booleans: expand the lane mask to 0 / -1 per lane, swizzle that dword, and
               * compare against zero to get a lane mask again. */
8633          assert(src.regClass() == bld.lm);
8634          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8635                             Operand::c32(-1), src);
8636          src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8637          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
8638       } else if (dst.regClass() == v1b) {
              /* Sub-dword destinations take element 0 of the swizzled value. */
8639          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8640          emit_extract_vector(ctx, tmp, 0, dst);
8641       } else if (dst.regClass() == v2b) {
8642          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8643          emit_extract_vector(ctx, tmp, 0, dst);
8644       } else if (dst.regClass() == v1) {
8645          bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
8646       } else if (dst.regClass() == v2) {
              /* 64-bit values: swizzle each dword separately and recombine. */
8647          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8648          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8649          lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
8650          hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
8651          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8652          emit_split_vector(ctx, dst, 2);
8653       } else {
8654          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8655       }
8656       set_wqm(ctx);
8657       break;
8658    }
8659    case nir_intrinsic_write_invocation_amd: {
8660       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8661       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8662       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8663       Temp dst = get_ssa_temp(ctx, &instr->def);
8664       if (dst.regClass() == v1) {
8665          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8666          bld.writelane(Definition(dst), val, lane, src);
8667       } else if (dst.regClass() == v2) {
8668          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8669          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8670          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8671          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8672          Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_hi);
8673          Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
8674          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8675          emit_split_vector(ctx, dst, 2);
8676       } else {
8677          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8678       }
8679       break;
8680    }
8681    case nir_intrinsic_mbcnt_amd: {
           /* emit_mbcnt() with an explicit mask (src[0]) and an explicit addend (src[1], moved
            * to a VGPR), instead of its default operands. */
8682       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8683       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8684       Temp dst = get_ssa_temp(ctx, &instr->def);
8685       /* Fit 64-bit mask for wave32 */
           /* Keeps element 0 of the mask, sized like the lane mask, in the source's own
            * register type. */
8686       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8687       emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
8688       set_wqm(ctx);
8689       break;
8690    }
8691    case nir_intrinsic_lane_permute_16_amd: {
           /* Lowered to v_permlane16_b32; asserted to be used on GFX10 or newer only. */
8692       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8693       Temp dst = get_ssa_temp(ctx, &instr->def);
8694       assert(ctx->program->gfx_level >= GFX10);
8695
8696       if (src.regClass() == s1) {
              /* A scalar source is simply copied to the destination. */
8697          bld.copy(Definition(dst), src);
8698       } else if (dst.regClass() == v1 && src.regClass() == v1) {
              /* src[1] and src[2] are made uniform before being passed as the two selector
               * operands. */
8699          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8700                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8701                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8702       } else {
8703          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8704       }
8705       break;
8706    }
8707    case nir_intrinsic_load_helper_invocation:
8708    case nir_intrinsic_is_helper_invocation: {
8709       /* load_helper() after demote() get lowered to is_helper().
8710        * Otherwise, these two behave the same. */
           /* Emitted as the p_is_helper pseudo-instruction with exec as its operand; the
            * program is flagged as needing the exact exec mask. */
8711       Temp dst = get_ssa_temp(ctx, &instr->def);
8712       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8713       ctx->program->needs_exact = true;
8714       break;
8715    }
8716    case nir_intrinsic_demote:
8717    case nir_intrinsic_demote_if: {
           /* Emits p_demote_to_helper. The unconditional form passes the constant -1 as the
            * condition; demote_if passes (src & exec). */
8718       Operand cond = Operand::c32(-1u);
8719       if (instr->intrinsic == nir_intrinsic_demote_if) {
8720          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8721          assert(src.regClass() == bld.lm);
8722          cond =
8723             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8724       }
8725
8726       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8727
           /* Inside a loop or a divergent if, record that exec may be empty after this. */
8728       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8729          ctx->cf_info.exec_potentially_empty_discard = true;
8730
8731       ctx->block->kind |= block_kind_uses_discard;
8732       ctx->program->needs_exact = true;
8733       break;
8734    }
8735    case nir_intrinsic_terminate:
8736    case nir_intrinsic_terminate_if:
8737    case nir_intrinsic_discard:
8738    case nir_intrinsic_discard_if: {
           /* Emits p_discard_if. The unconditional forms pass the constant -1 as the
            * condition; the *_if forms pass (src & exec). */
8739       Operand cond = Operand::c32(-1u);
8740       if (instr->intrinsic == nir_intrinsic_discard_if ||
8741           instr->intrinsic == nir_intrinsic_terminate_if) {
8742          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8743          assert(src.regClass() == bld.lm);
8744          cond =
8745             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8746
              /* A divergent condition makes the discard itself divergent. */
8747          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
8748       }
8749
8750       bld.pseudo(aco_opcode::p_discard_if, cond);
8751
           /* Inside a loop or a divergent if, record that exec may be empty after this. */
8752       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8753          ctx->cf_info.exec_potentially_empty_discard = true;
8754       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8755       ctx->block->kind |= block_kind_uses_discard;
8756       ctx->program->needs_exact = true;
8757       break;
8758    }
8759    case nir_intrinsic_first_invocation: {
           /* s_ff1_i32 applied to exec, written straight to the destination
            * (presumably the index of the lowest set exec bit - confirm against the ISA doc). */
8760       bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
8761                Operand(exec, bld.lm));
8762       set_wqm(ctx);
8763       break;
8764    }
8765    case nir_intrinsic_last_invocation: {
           /* Computes (wave_size - 1) - s_flbit_i32(exec). Presumably s_flbit counts from the
            * most significant bit, which makes the result the index of the highest set exec
            * bit - confirm against the ISA doc. */
8766       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8767       bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8768                Operand::c32(ctx->program->wave_size - 1u), flbit);
8769       set_wqm(ctx);
8770       break;
8771    }
8772    case nir_intrinsic_elect: {
           /* Emitted as the p_elect pseudo-instruction, which is expanded by a later pass. */
8773       /* p_elect is lowered in aco_insert_exec_mask.
8774        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8775        * two p_elect with different exec masks as the same.
8776        */
8777       bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
8778                  Operand(exec, bld.lm));
8779       set_wqm(ctx);
8780       break;
8781    }
8782    case nir_intrinsic_shader_clock: {
           /* Reads a clock value into a two-dword destination. The instruction used depends on
            * the intrinsic's memory scope and the GPU generation. */
8783       Temp dst = get_ssa_temp(ctx, &instr->def);
8784       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8785           ctx->options->gfx_level >= GFX10_3) {
8786          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
              /* The register read yields one dword; the upper dword of dst is set to zero. */
8787          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8788          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8789       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8790                  ctx->options->gfx_level >= GFX11) {
              /* Device scope on GFX11+: request the realtime counter via s_sendmsg_rtn_b64. */
8791          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8792                   Operand::c32(sendmsg_rtn_get_realtime));
8793       } else {
              /* Everything else: s_memrealtime for device scope, s_memtime otherwise, issued
               * with volatile semantics. */
8794          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8795                                 ? aco_opcode::s_memrealtime
8796                                 : aco_opcode::s_memtime;
8797          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8798       }
8799       emit_split_vector(ctx, dst, 2);
8800       break;
8801    }
8802    case nir_intrinsic_load_vertex_id_zero_base: {
           /* Plain copy of the `vertex_id` shader argument. */
8803       Temp dst = get_ssa_temp(ctx, &instr->def);
8804       bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
8805       break;
8806    }
8807    case nir_intrinsic_load_first_vertex: {
           /* Plain copy of the `base_vertex` shader argument. */
8808       Temp dst = get_ssa_temp(ctx, &instr->def);
8809       bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
8810       break;
8811    }
8812    case nir_intrinsic_load_base_instance: {
           /* Plain copy of the `start_instance` shader argument. */
8813       Temp dst = get_ssa_temp(ctx, &instr->def);
8814       bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
8815       break;
8816    }
8817    case nir_intrinsic_load_instance_id: {
           /* Plain copy of the `instance_id` shader argument. */
8818       Temp dst = get_ssa_temp(ctx, &instr->def);
8819       bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
8820       break;
8821    }
8822    case nir_intrinsic_load_draw_id: {
           /* Plain copy of the `draw_id` shader argument. */
8823       Temp dst = get_ssa_temp(ctx, &instr->def);
8824       bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
8825       break;
8826    }
8827    case nir_intrinsic_load_invocation_id: {
           /* Only geometry and tessellation-control shaders are supported; the source
            * argument and its decoding depend on the stage. */
8828       Temp dst = get_ssa_temp(ctx, &instr->def);
8829
8830       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
              /* GFX10+ masks the argument with 127; older chips use it unmodified. */
8831          if (ctx->options->gfx_level >= GFX10)
8832             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8833                          get_arg(ctx, ctx->args->gs_invocation_id));
8834          else
8835             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
8836       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
              /* Bit-field of `tcs_rel_ids`, extracted with v_bfe_u32 using the constants 8
               * and 5 (offset and width, in that operand order). */
8837          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
8838                   Operand::c32(8u), Operand::c32(5u));
8839       } else {
8840          unreachable("Unsupported stage for load_invocation_id");
8841       }
8842
8843       break;
8844    }
8845    case nir_intrinsic_load_primitive_id: {
           /* Copies the primitive/patch id from the shader argument that belongs to the
            * current stage. */
8846       Temp dst = get_ssa_temp(ctx, &instr->def);
8847
8848       switch (ctx->shader->info.stage) {
8849       case MESA_SHADER_GEOMETRY:
8850          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8851          break;
8852       case MESA_SHADER_TESS_CTRL:
8853          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
8854          break;
8855       case MESA_SHADER_TESS_EVAL:
8856          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
8857          break;
8858       default:
              /* Remaining stages: NGG without a software GS reads gs_prim_id, a vertex shader
               * reads vs_prim_id; anything else is not implemented. */
8859          if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
8860             /* In case of NGG, the GS threads always have the primitive ID
8861              * even if there is no SW GS. */
8862             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8863             break;
8864          } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
8865             bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
8866             break;
8867          }
8868          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8869       }
8870
8871       break;
8872    }
8873    case nir_intrinsic_sendmsg_amd: {
           /* Emits s_sendmsg. The immediate comes from the intrinsic's base index; src[0] is
            * made uniform and passed through m0. */
8874       unsigned imm = nir_intrinsic_base(instr);
8875       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8876       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), -1, imm);
8877       break;
8878    }
8879    case nir_intrinsic_load_gs_wave_id_amd: {
           /* The wave id comes from `merged_wave_info` when that argument is in use, otherwise
            * from the dedicated `gs_wave_id` argument. Having neither is a programming error. */
8880       Temp dst = get_ssa_temp(ctx, &instr->def);
8881       if (ctx->args->merged_wave_info.used)
              /* NOTE(review): the p_extract constants (2, 8, 0) presumably mean element index 2,
               * 8 bits wide, zero-extended - confirm against the p_extract definition. */
8882          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
8883                     get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
8884                     Operand::zero());
8885       else if (ctx->args->gs_wave_id.used)
8886          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
8887       else
8888          unreachable("Shader doesn't have GS wave ID.");
8889       break;
8890    }
8891    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
           /* The uniform count in src[0] is converted by lanecount_to_mask() and the result is
            * copied into the destination. */
8892       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8893       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
8894       break;
8895    }
8896    case nir_intrinsic_gds_atomic_add_amd: {
           /* Emits ds_add_u32 without a result. src[0] is the value to add, src[1] the address
            * and src[2] the value loaded into m0 (made uniform first). Address and value are
            * moved to VGPRs; both offsets are 0. The trailing `true` is presumably the GDS
            * flag of Builder::ds - confirm against the builder signature. */
8897       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8898       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8899       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8900       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8901       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8902              true);
8903       break;
8904    }
8905    case nir_intrinsic_load_sbt_base_amd: {
           /* Copies the `rt.sbt_descriptors` shader argument, which is asserted to be a
            * two-dword SGPR value. */
8906       Temp dst = get_ssa_temp(ctx, &instr->def);
8907       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
8908       assert(addr.regClass() == s2);
8909       bld.copy(Definition(dst), Operand(addr));
8910       break;
8911    }
8912    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; /* handled entirely by its own visitor */
8913    case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
           /* Plain copy of the `rt.dynamic_callable_stack_base` shader argument. */
8914       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8915                get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
8916       break;
8917    case nir_intrinsic_load_resume_shader_address_amd: {
           /* Emitted as the p_resume_shader_address pseudo-instruction, parameterised by the
            * intrinsic's call index; it also defines SCC. */
8918       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
8919                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
8920       break;
8921    }
8922    case nir_intrinsic_overwrite_vs_arguments_amd: {
           /* Emits no instruction: replaces the cached temps of the `vertex_id` (src[0]) and
            * `instance_id` (src[1]) arguments in ctx->arg_temps. */
8923       ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8924       ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8925       break;
8926    }
8927    case nir_intrinsic_overwrite_tes_arguments_amd: {
           /* Emits no instruction: replaces the cached temps of four TES arguments in
            * ctx->arg_temps. Note the source order: src[0] = tes_u, src[1] = tes_v,
            * src[2] = tes_patch_id, src[3] = tes_rel_patch_id. */
8928       ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8929       ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8930       ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
8931       ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
8932       break;
8933    }
8934    case nir_intrinsic_load_scalar_arg_amd:
8935    case nir_intrinsic_load_vector_arg_amd: {
           /* Copies the shader argument selected by the intrinsic's base index. The asserts
            * check that the index is in range, that a temp exists for it, and that its register
            * type matches the intrinsic flavour (SGPR for scalar, VGPR for vector). */
8936       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
8937       Temp dst = get_ssa_temp(ctx, &instr->def);
8938       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
8939       assert(src.id());
8940       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
8941                                                                                   : RegType::vgpr));
8942       bld.copy(Definition(dst), src);
           /* The destination is additionally split into one element per dword. */
8943       emit_split_vector(ctx, dst, dst.size());
8944       break;
8945    }
8946    case nir_intrinsic_ordered_xfb_counter_add_amd: {
8947       Temp dst = get_ssa_temp(ctx, &instr->def);
8948       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
8949       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
8950
8951       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8952       unsigned offset0, offset1;
8953       Instruction* ds_instr;
8954       Operand m;
8955
8956       /* Lock a GDS mutex. */
8957       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
8958       m = bld.m0(bld.as_uniform(ordered_id));
8959       ds_instr =
8960          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8961       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8962
8963       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
8964          aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
8965       unsigned write_mask = nir_intrinsic_write_mask(instr);
8966
8967       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
8968
8969       for (unsigned i = 0; i < instr->num_components; i++) {
8970          if (write_mask & (1 << i)) {
8971             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8972
8973             if (use_gds_registers) {
8974                ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(),
8975                                  chan_counter, i * 4, 0u, true);
8976             } else {
8977                m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
8978
8979                ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
8980                                  i * 4, 0u, true);
8981             }
8982             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8983
8984             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
8985          } else {
8986             vec->operands[i] = Operand::zero();
8987          }
8988       }
8989
8990       vec->definitions[0] = Definition(dst);
8991       ctx->block->instructions.emplace_back(std::move(vec));
8992
8993       /* Unlock a GDS mutex. */
8994       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
8995       m = bld.m0(bld.as_uniform(ordered_id));
8996       ds_instr =
8997          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8998       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8999
9000       emit_split_vector(ctx, dst, instr->num_components);
9001       break;
9002    }
9003    case nir_intrinsic_xfb_counter_sub_amd: {
9004       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
9005
9006       unsigned write_mask = nir_intrinsic_write_mask(instr);
9007       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
9008       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9009
9010       u_foreach_bit (i, write_mask) {
9011          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9012          Instruction* ds_instr;
9013
9014          if (use_gds_registers) {
9015             ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9016                               i * 4, 0u, true);
9017          } else {
9018             Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
9019
9020             ds_instr = bld.ds(aco_opcode::ds_sub_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
9021                               i * 4, 0u, true);
9022          }
9023          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9024       }
9025       break;
9026    }
9027    case nir_intrinsic_export_amd: {
9028       unsigned flags = nir_intrinsic_flags(instr);
9029       unsigned target = nir_intrinsic_base(instr);
9030       unsigned write_mask = nir_intrinsic_write_mask(instr);
9031
9032       /* Mark vertex export block. */
9033       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9034          ctx->block->kind |= block_kind_export_end;
9035
9036       if (target < V_008DFC_SQ_EXP_MRTZ)
9037          ctx->program->has_color_exports = true;
9038
9039       aco_ptr<Export_instruction> exp{
9040          create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9041
9042       exp->dest = target;
9043       exp->enabled_mask = write_mask;
9044       exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
9045
9046       /* ACO may reorder position/mrt export instructions, then mark done for last
9047        * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
9048        * exports here and leave it to ACO.
9049        */
9050       if (target == V_008DFC_SQ_EXP_PRIM)
9051          exp->done = flags & AC_EXP_FLAG_DONE;
9052       else
9053          exp->done = false;
9054
9055       /* ACO may reorder mrt export instructions, then mark valid mask for last
9056        * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
9057        * exports here and leave it to ACO.
9058        */
9059       if (target > V_008DFC_SQ_EXP_NULL)
9060          exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9061       else
9062          exp->valid_mask = false;
9063
9064       /* Compressed export uses two bits for a channel. */
9065       uint32_t channel_mask =
9066          exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
9067
9068       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9069       for (unsigned i = 0; i < 4; i++) {
9070          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9071                                ? Operand(emit_extract_vector(ctx, value, i, v1))
9072                                : Operand(v1);
9073       }
9074
9075       ctx->block->instructions.emplace_back(std::move(exp));
9076       break;
9077    }
9078    case nir_intrinsic_export_dual_src_blend_amd: {
9079       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9080       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9081       unsigned write_mask = nir_intrinsic_write_mask(instr);
9082
9083       struct aco_export_mrt mrt0, mrt1;
9084       for (unsigned i = 0; i < 4; i++) {
9085          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9086                                                     : Operand(v1);
9087
9088          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9089                                                     : Operand(v1);
9090       }
9091       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9092
9093       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9094
9095       ctx->block->kind |= block_kind_export_end;
9096       break;
9097    }
9098    case nir_intrinsic_strict_wqm_coord_amd: {
9099       Temp dst = get_ssa_temp(ctx, &instr->def);
9100       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9101       Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
9102       unsigned begin_size = nir_intrinsic_base(instr);
9103
9104       unsigned num_src = 1;
9105       auto it = ctx->allocated_vec.find(src.id());
9106       if (it != ctx->allocated_vec.end())
9107          num_src = src.bytes() / it->second[0].bytes();
9108
9109       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9110          aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
9111
9112       if (begin_size)
9113          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9114       for (unsigned i = 0; i < num_src; i++) {
9115          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9116          vec->operands[i + !!begin_size] = Operand(comp);
9117       }
9118
9119       vec->definitions[0] = Definition(tmp);
9120       ctx->block->instructions.emplace_back(std::move(vec));
9121
9122       bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
9123       break;
9124    }
9125    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9126       Temp dst = get_ssa_temp(ctx, &instr->def);
9127       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9128                Operand::c32(aco_symbol_lds_ngg_scratch_base));
9129       break;
9130    }
9131    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9132       Temp dst = get_ssa_temp(ctx, &instr->def);
9133       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9134                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9135       break;
9136    }
9137    case nir_intrinsic_store_scalar_arg_amd: {
9138       ctx->arg_temps[nir_intrinsic_base(instr)] =
9139          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9140       break;
9141    }
9142    case nir_intrinsic_store_vector_arg_amd: {
9143       ctx->arg_temps[nir_intrinsic_base(instr)] =
9144          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9145       break;
9146    }
9147    case nir_intrinsic_begin_invocation_interlock: {
9148       pops_await_overlapped_waves(ctx);
9149       break;
9150    }
9151    case nir_intrinsic_end_invocation_interlock: {
9152       if (ctx->options->gfx_level < GFX11)
9153          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9154       break;
9155    }
9156    default:
9157       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9158       abort();
9159
9160       break;
9161    }
9162 }
9163
9164 void
9165 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9166 {
9167    if (vec->parent_instr->type != nir_instr_type_alu)
9168       return;
9169    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9170    if (vec_instr->op != nir_op_vec(vec->num_components))
9171       return;
9172
9173    for (unsigned i = 0; i < vec->num_components; i++) {
9174       cv[i] =
9175          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9176    }
9177 }
9178
9179 void
9180 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9181 {
9182    assert(instr->op != nir_texop_samples_identical);
9183
9184    Builder bld(ctx->program, ctx->block);
9185    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9186         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9187         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9188    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9189                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9190                            coord = Temp(), wqm_coord = Temp();
9191    std::vector<Temp> coords;
9192    std::vector<Temp> derivs;
9193    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9194
9195    for (unsigned i = 0; i < instr->num_srcs; i++) {
9196       switch (instr->src[i].src_type) {
9197       case nir_tex_src_texture_handle:
9198          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9199          break;
9200       case nir_tex_src_sampler_handle:
9201          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9202          break;
9203       default: break;
9204       }
9205    }
9206
9207    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9208                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9209    bool tg4_integer_cube_workaround =
9210       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9211
9212    bool a16 = false, g16 = false;
9213
9214    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9215    if (coord_idx > 0)
9216       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9217
9218    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9219    if (ddx_idx > 0)
9220       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9221
9222    for (unsigned i = 0; i < instr->num_srcs; i++) {
9223       switch (instr->src[i].src_type) {
9224       case nir_tex_src_coord: {
9225          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9226          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9227          break;
9228       }
9229       case nir_tex_src_backend1: {
9230          assert(instr->src[i].src.ssa->bit_size == 32);
9231          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9232          has_wqm_coord = true;
9233          break;
9234       }
9235       case nir_tex_src_bias:
9236          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9237          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9238          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9239          has_bias = true;
9240          break;
9241       case nir_tex_src_lod: {
9242          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9243             level_zero = true;
9244          } else {
9245             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9246             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9247             has_lod = true;
9248          }
9249          break;
9250       }
9251       case nir_tex_src_min_lod:
9252          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9253          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9254          has_clamped_lod = true;
9255          break;
9256       case nir_tex_src_comparator:
9257          if (instr->is_shadow) {
9258             assert(instr->src[i].src.ssa->bit_size == 32);
9259             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9260             has_compare = true;
9261          }
9262          break;
9263       case nir_tex_src_offset:
9264       case nir_tex_src_backend2:
9265          assert(instr->src[i].src.ssa->bit_size == 32);
9266          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9267          get_const_vec(instr->src[i].src.ssa, const_offset);
9268          has_offset = true;
9269          break;
9270       case nir_tex_src_ddx:
9271          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9272          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9273          has_ddx = true;
9274          break;
9275       case nir_tex_src_ddy:
9276          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9277          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9278          has_ddy = true;
9279          break;
9280       case nir_tex_src_ms_index:
9281          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9282          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9283          has_sample_index = true;
9284          break;
9285       case nir_tex_src_texture_offset:
9286       case nir_tex_src_sampler_offset:
9287       default: break;
9288       }
9289    }
9290
9291    if (has_wqm_coord) {
9292       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9293              instr->op == nir_texop_lod);
9294       assert(wqm_coord.regClass().is_linear_vgpr());
9295       assert(!a16 && !g16);
9296    }
9297
9298    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9299       level_zero = true;
9300
9301    if (has_offset) {
9302       assert(instr->op != nir_texop_txf);
9303
9304       aco_ptr<Instruction> tmp_instr;
9305       Temp acc, pack = Temp();
9306
9307       uint32_t pack_const = 0;
9308       for (unsigned i = 0; i < offset.size(); i++) {
9309          if (!const_offset[i])
9310             continue;
9311          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9312       }
9313
9314       if (offset.type() == RegType::sgpr) {
9315          for (unsigned i = 0; i < offset.size(); i++) {
9316             if (const_offset[i])
9317                continue;
9318
9319             acc = emit_extract_vector(ctx, offset, i, s1);
9320             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9321                            Operand::c32(0x3Fu));
9322
9323             if (i) {
9324                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9325                               Operand::c32(8u * i));
9326             }
9327
9328             if (pack == Temp()) {
9329                pack = acc;
9330             } else {
9331                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9332             }
9333          }
9334
9335          if (pack_const && pack != Temp())
9336             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9337                             Operand::c32(pack_const), pack);
9338       } else {
9339          for (unsigned i = 0; i < offset.size(); i++) {
9340             if (const_offset[i])
9341                continue;
9342
9343             acc = emit_extract_vector(ctx, offset, i, v1);
9344             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9345
9346             if (i) {
9347                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9348             }
9349
9350             if (pack == Temp()) {
9351                pack = acc;
9352             } else {
9353                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9354             }
9355          }
9356
9357          if (pack_const && pack != Temp())
9358             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9359       }
9360       if (pack == Temp())
9361          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9362       else
9363          offset = pack;
9364    }
9365
9366    std::vector<Temp> unpacked_coord;
9367    if (coord != Temp())
9368       unpacked_coord.push_back(coord);
9369    if (has_sample_index)
9370       unpacked_coord.push_back(sample_index);
9371    if (has_lod)
9372       unpacked_coord.push_back(lod);
9373    if (has_clamped_lod)
9374       unpacked_coord.push_back(clamped_lod);
9375
9376    coords = emit_pack_v1(ctx, unpacked_coord);
9377
9378    /* pack derivatives */
9379    if (has_ddx || has_ddy) {
9380       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9381       std::array<Temp, 2> ddxddy = {ddx, ddy};
9382       for (Temp tmp : ddxddy) {
9383          if (tmp == Temp())
9384             continue;
9385          std::vector<Temp> unpacked = {tmp};
9386          for (Temp derv : emit_pack_v1(ctx, unpacked))
9387             derivs.push_back(derv);
9388       }
9389       has_derivs = true;
9390    }
9391
9392    unsigned dim = 0;
9393    bool da = false;
9394    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9395       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9396       da = should_declare_array((ac_image_dim)dim);
9397    }
9398
9399    /* Build tex instruction */
9400    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9401    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9402       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9403    if (instr->is_sparse)
9404       dmask = MAX2(dmask, 1) | 0x10;
9405    bool d16 = instr->def.bit_size == 16;
9406    Temp dst = get_ssa_temp(ctx, &instr->def);
9407    Temp tmp_dst = dst;
9408
9409    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9410    if (instr->op == nir_texop_tg4) {
9411       assert(instr->def.num_components == (4 + instr->is_sparse));
9412       if (instr->is_shadow)
9413          dmask = 1;
9414       else
9415          dmask = 1 << instr->component;
9416       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9417          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9418    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9419       tmp_dst = bld.tmp(v1);
9420    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9421       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9422       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9423    }
9424
9425    Temp tg4_compare_cube_wa64 = Temp();
9426
9427    if (tg4_integer_workarounds) {
9428       Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9429       Temp size = bld.tmp(v2);
9430       MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9431                                         Operand(s4), std::vector<Temp>{tg4_lod});
9432       tex->dim = dim;
9433       tex->dmask = 0x3;
9434       tex->da = da;
9435       emit_split_vector(ctx, size, size.size());
9436
9437       Temp half_texel[2];
9438       for (unsigned i = 0; i < 2; i++) {
9439          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9440          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9441          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9442          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9443                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9444       }
9445
9446       if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9447          /* In vulkan, whether the sampler uses unnormalized
9448           * coordinates or not is a dynamic property of the
9449           * sampler. Hence, to figure out whether or not we
9450           * need to divide by the texture size, we need to test
9451           * the sampler at runtime. This tests the bit set by
9452           * radv_init_sampler().
9453           */
9454          unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9455          Temp not_needed =
9456             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9457
9458          not_needed = bool_to_vector_condition(ctx, not_needed);
9459          half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9460                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9461          half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9462                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9463       }
9464
9465       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9466                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9467
9468       if (tg4_integer_cube_workaround) {
9469          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9470          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9471          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9472             aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9473          split->operands[0] = Operand(resource);
9474          for (unsigned i = 0; i < resource.size(); i++) {
9475             desc[i] = bld.tmp(s1);
9476             split->definitions[i] = Definition(desc[i]);
9477          }
9478          ctx->block->instructions.emplace_back(std::move(split));
9479
9480          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9481                               Operand::c32(20u | (6u << 16)));
9482          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9483                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9484
9485          Temp nfmt;
9486          if (instr->dest_type & nir_type_uint) {
9487             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9488                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9489                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9490          } else {
9491             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9492                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9493                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9494          }
9495          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9496          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9497
9498          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9499                          Operand::c32(26u));
9500
9501          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9502                             Operand::c32(C_008F14_NUM_FORMAT));
9503          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9504
9505          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9506             aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9507          for (unsigned i = 0; i < resource.size(); i++)
9508             vec->operands[i] = Operand(desc[i]);
9509          resource = bld.tmp(resource.regClass());
9510          vec->definitions[0] = Definition(resource);
9511          ctx->block->instructions.emplace_back(std::move(vec));
9512
9513          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9514                                   tg4_compare_cube_wa64);
9515          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9516                                   tg4_compare_cube_wa64);
9517       }
9518       coords[0] = new_coords[0];
9519       coords[1] = new_coords[1];
9520    }
9521
9522    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9523       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9524       // ac_build_buffer_load_format_gfx9_safe()
9525
9526       assert(coords.size() == 1);
9527       aco_opcode op;
9528       if (d16) {
9529          switch (util_last_bit(dmask & 0xf)) {
9530          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9531          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9532          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9533          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9534          default: unreachable("Tex instruction loads more than 4 components.");
9535          }
9536       } else {
9537          switch (util_last_bit(dmask & 0xf)) {
9538          case 1: op = aco_opcode::buffer_load_format_x; break;
9539          case 2: op = aco_opcode::buffer_load_format_xy; break;
9540          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9541          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9542          default: unreachable("Tex instruction loads more than 4 components.");
9543          }
9544       }
9545
9546       aco_ptr<MUBUF_instruction> mubuf{
9547          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9548       mubuf->operands[0] = Operand(resource);
9549       mubuf->operands[1] = Operand(coords[0]);
9550       mubuf->operands[2] = Operand::c32(0);
9551       mubuf->definitions[0] = Definition(tmp_dst);
9552       mubuf->idxen = true;
9553       mubuf->tfe = instr->is_sparse;
9554       if (mubuf->tfe)
9555          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9556       ctx->block->instructions.emplace_back(std::move(mubuf));
9557
9558       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9559       return;
9560    }
9561
9562    /* gather MIMG address components */
9563    std::vector<Temp> args;
9564    if (has_wqm_coord) {
9565       args.emplace_back(wqm_coord);
9566       if (!(ctx->block->kind & block_kind_top_level))
9567          ctx->unended_linear_vgprs.push_back(wqm_coord);
9568    }
9569    if (has_offset)
9570       args.emplace_back(offset);
9571    if (has_bias)
9572       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9573    if (has_compare)
9574       args.emplace_back(compare);
9575    if (has_derivs)
9576       args.insert(args.end(), derivs.begin(), derivs.end());
9577
9578    args.insert(args.end(), coords.begin(), coords.end());
9579
9580    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9581        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9582       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9583                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9584                          ? aco_opcode::image_load
9585                          : aco_opcode::image_load_mip;
9586       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9587       MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
9588       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9589          tex->dim = da ? ac_image_2darray : ac_image_2d;
9590       else
9591          tex->dim = dim;
9592       tex->dmask = dmask & 0xf;
9593       tex->unrm = true;
9594       tex->da = da;
9595       tex->tfe = instr->is_sparse;
9596       tex->d16 = d16;
9597       tex->a16 = a16;
9598
9599       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9600          /* Use 0x76543210 if the image doesn't have FMASK. */
9601          assert(dmask == 1 && dst.bytes() == 4);
9602          assert(dst.id() != tmp_dst.id());
9603
9604          if (dst.regClass() == s1) {
9605             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9606                                         emit_extract_vector(ctx, resource, 1, s1));
9607             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9608                      Operand::c32(0x76543210), bld.scc(is_not_null));
9609          } else {
9610             Temp is_not_null = bld.tmp(bld.lm);
9611             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9612                          emit_extract_vector(ctx, resource, 1, s1));
9613             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9614                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9615          }
9616       } else {
9617          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9618       }
9619       return;
9620    }
9621
9622    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9623
9624    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9625    aco_opcode opcode = aco_opcode::image_sample;
9626    if (has_offset) { /* image_sample_*_o */
9627       if (has_clamped_lod) {
9628          if (has_compare) {
9629             opcode = aco_opcode::image_sample_c_cl_o;
9630             if (separate_g16)
9631                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9632             else if (has_derivs)
9633                opcode = aco_opcode::image_sample_c_d_cl_o;
9634             if (has_bias)
9635                opcode = aco_opcode::image_sample_c_b_cl_o;
9636          } else {
9637             opcode = aco_opcode::image_sample_cl_o;
9638             if (separate_g16)
9639                opcode = aco_opcode::image_sample_d_cl_o_g16;
9640             else if (has_derivs)
9641                opcode = aco_opcode::image_sample_d_cl_o;
9642             if (has_bias)
9643                opcode = aco_opcode::image_sample_b_cl_o;
9644          }
9645       } else if (has_compare) {
9646          opcode = aco_opcode::image_sample_c_o;
9647          if (separate_g16)
9648             opcode = aco_opcode::image_sample_c_d_o_g16;
9649          else if (has_derivs)
9650             opcode = aco_opcode::image_sample_c_d_o;
9651          if (has_bias)
9652             opcode = aco_opcode::image_sample_c_b_o;
9653          if (level_zero)
9654             opcode = aco_opcode::image_sample_c_lz_o;
9655          if (has_lod)
9656             opcode = aco_opcode::image_sample_c_l_o;
9657       } else {
9658          opcode = aco_opcode::image_sample_o;
9659          if (separate_g16)
9660             opcode = aco_opcode::image_sample_d_o_g16;
9661          else if (has_derivs)
9662             opcode = aco_opcode::image_sample_d_o;
9663          if (has_bias)
9664             opcode = aco_opcode::image_sample_b_o;
9665          if (level_zero)
9666             opcode = aco_opcode::image_sample_lz_o;
9667          if (has_lod)
9668             opcode = aco_opcode::image_sample_l_o;
9669       }
9670    } else if (has_clamped_lod) { /* image_sample_*_cl */
9671       if (has_compare) {
9672          opcode = aco_opcode::image_sample_c_cl;
9673          if (separate_g16)
9674             opcode = aco_opcode::image_sample_c_d_cl_g16;
9675          else if (has_derivs)
9676             opcode = aco_opcode::image_sample_c_d_cl;
9677          if (has_bias)
9678             opcode = aco_opcode::image_sample_c_b_cl;
9679       } else {
9680          opcode = aco_opcode::image_sample_cl;
9681          if (separate_g16)
9682             opcode = aco_opcode::image_sample_d_cl_g16;
9683          else if (has_derivs)
9684             opcode = aco_opcode::image_sample_d_cl;
9685          if (has_bias)
9686             opcode = aco_opcode::image_sample_b_cl;
9687       }
9688    } else { /* no offset */
9689       if (has_compare) {
9690          opcode = aco_opcode::image_sample_c;
9691          if (separate_g16)
9692             opcode = aco_opcode::image_sample_c_d_g16;
9693          else if (has_derivs)
9694             opcode = aco_opcode::image_sample_c_d;
9695          if (has_bias)
9696             opcode = aco_opcode::image_sample_c_b;
9697          if (level_zero)
9698             opcode = aco_opcode::image_sample_c_lz;
9699          if (has_lod)
9700             opcode = aco_opcode::image_sample_c_l;
9701       } else {
9702          opcode = aco_opcode::image_sample;
9703          if (separate_g16)
9704             opcode = aco_opcode::image_sample_d_g16;
9705          else if (has_derivs)
9706             opcode = aco_opcode::image_sample_d;
9707          if (has_bias)
9708             opcode = aco_opcode::image_sample_b;
9709          if (level_zero)
9710             opcode = aco_opcode::image_sample_lz;
9711          if (has_lod)
9712             opcode = aco_opcode::image_sample_l;
9713       }
9714    }
9715
9716    if (instr->op == nir_texop_tg4) {
9717       /* GFX11 supports implicit LOD, but the extension is unsupported. */
9718       assert(level_zero || ctx->options->gfx_level < GFX11);
9719
9720       if (has_offset) { /* image_gather4_*_o */
9721          if (has_compare) {
9722             opcode = aco_opcode::image_gather4_c_o;
9723             if (level_zero)
9724                opcode = aco_opcode::image_gather4_c_lz_o;
9725             if (has_lod)
9726                opcode = aco_opcode::image_gather4_c_l_o;
9727             if (has_bias)
9728                opcode = aco_opcode::image_gather4_c_b_o;
9729          } else {
9730             opcode = aco_opcode::image_gather4_o;
9731             if (level_zero)
9732                opcode = aco_opcode::image_gather4_lz_o;
9733             if (has_lod)
9734                opcode = aco_opcode::image_gather4_l_o;
9735             if (has_bias)
9736                opcode = aco_opcode::image_gather4_b_o;
9737          }
9738       } else {
9739          if (has_compare) {
9740             opcode = aco_opcode::image_gather4_c;
9741             if (level_zero)
9742                opcode = aco_opcode::image_gather4_c_lz;
9743             if (has_lod)
9744                opcode = aco_opcode::image_gather4_c_l;
9745             if (has_bias)
9746                opcode = aco_opcode::image_gather4_c_b;
9747          } else {
9748             opcode = aco_opcode::image_gather4;
9749             if (level_zero)
9750                opcode = aco_opcode::image_gather4_lz;
9751             if (has_lod)
9752                opcode = aco_opcode::image_gather4_l;
9753             if (has_bias)
9754                opcode = aco_opcode::image_gather4_b;
9755          }
9756       }
9757    } else if (instr->op == nir_texop_lod) {
9758       opcode = aco_opcode::image_get_lod;
9759    }
9760
9761    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9762                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9763                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9764
9765    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9766    MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
9767    tex->dim = dim;
9768    tex->dmask = dmask & 0xf;
9769    tex->da = da;
9770    tex->tfe = instr->is_sparse;
9771    tex->d16 = d16;
9772    tex->a16 = a16;
9773    if (implicit_derivs)
9774       set_wqm(ctx, true);
9775
9776    if (tg4_integer_cube_workaround) {
9777       assert(tmp_dst.id() != dst.id());
9778       assert(tmp_dst.size() == dst.size());
9779
9780       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9781       Temp val[4];
9782       for (unsigned i = 0; i < 4; i++) {
9783          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9784          Temp cvt_val;
9785          if (instr->dest_type & nir_type_uint)
9786             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9787          else
9788             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9789          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9790                            tg4_compare_cube_wa64);
9791       }
9792
9793       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9794       if (instr->is_sparse)
9795          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9796                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9797       else
9798          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9799                               val[3]);
9800    }
9801    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9802    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9803 }
9804
9805 Operand
9806 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
9807 {
9808    Temp tmp = get_ssa_temp(ctx, ssa);
9809    if (ssa->parent_instr->type == nir_instr_type_undef) {
9810       return Operand(rc);
9811    } else if (logical && ssa->bit_size == 1 &&
9812               ssa->parent_instr->type == nir_instr_type_load_const) {
9813       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9814       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9815    } else {
9816       return Operand(tmp);
9817    }
9818 }
9819
/* Emit the ACO phi for a NIR phi instruction.
 *
 * The phi is logical (p_phi) if the destination is not linear, the NIR def is
 * divergent or the current block is a merge block; otherwise a p_linear_phi is
 * used. Operands are matched against the corresponding predecessor list of the
 * current block; predecessors without a matching NIR source receive an undefined
 * operand. The phi is inserted at the beginning of a block's instruction list.
 */
void
visit_phi(isel_context* ctx, nir_phi_instr* instr)
{
   aco_ptr<Pseudo_instruction> phi;
   Temp dst = get_ssa_temp(ctx, &instr->def);
   assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);

   bool logical = !dst.is_linear() || instr->def.divergent;
   logical |= (ctx->block->kind & block_kind_merge) != 0;
   aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;

   /* we want a sorted list of sources, since the predecessor list is also sorted */
   std::map<unsigned, nir_def*> phi_src;
   nir_foreach_phi_src (src, instr)
      phi_src[src->pred->index] = src->src.ssa;

   std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
   unsigned num_operands = 0;
   /* Scratch operand list on the stack. NOTE(review): the "+ 1" presumably leaves room
    * for the extra operand appended below for linear loop header phis. */
   Operand* const operands = (Operand*)alloca(
      (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
   unsigned num_defined = 0; /* number of operands that are not undefined */
   unsigned cur_pred_idx = 0; /* next entry of preds that still needs an operand */
   for (std::pair<unsigned, nir_def*> src : phi_src) {
      if (cur_pred_idx < preds.size()) {
         /* handle missing preds (IF merges with discard/break) and extra preds
          * (loop exit with discard) */
         unsigned block = ctx->cf_info.nir_to_aco[src.first];
         unsigned skipped = 0;
         while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
            skipped++;
         if (cur_pred_idx + skipped < preds.size()) {
            /* every predecessor skipped over has no NIR source: fill with undef */
            for (unsigned i = 0; i < skipped; i++)
               operands[num_operands++] = Operand(dst.regClass());
            cur_pred_idx += skipped;
         } else {
            /* the source's block is not among the remaining predecessors: drop it */
            continue;
         }
      }
      /* Handle missing predecessors at the end. This shouldn't happen with loop
       * headers and we can't ignore these sources for loop header phis. */
      if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
         continue;
      cur_pred_idx++;
      Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
      operands[num_operands++] = op;
      num_defined += !op.isUndefined();
   }
   /* handle block_kind_continue_or_break at loop exit blocks */
   while (cur_pred_idx++ < preds.size())
      operands[num_operands++] = Operand(dst.regClass());

   /* If the loop ends with a break, still add a linear continue edge in case
    * that break is divergent or continue_or_break is used. We'll either remove
    * this operand later in visit_loop() if it's not necessary or replace the
    * undef with something correct. */
   if (!logical && ctx->block->kind & block_kind_loop_header) {
      nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
      nir_block* last = nir_loop_last_block(loop);
      if (last->successors[0] != instr->instr.block)
         operands[num_operands++] = Operand(RegClass());
   }

   /* we can use a linear phi in some cases if one src is undef */
   if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
      phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
                                                       num_operands, 1));

      /* Locate the invert block through the linear CFG: it is the first linear
       * predecessor of the second linear predecessor of this merge block. */
      Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
      Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
      assert(invert->kind & block_kind_invert);

      unsigned then_block = invert->linear_preds[0];

      /* Find the single defined operand. The phi goes into the invert block if that
       * operand's logical predecessor is then_block, else into the current block. */
      Block* insert_block = NULL;
      for (unsigned i = 0; i < num_operands; i++) {
         Operand op = operands[i];
         if (op.isUndefined())
            continue;
         insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
         phi->operands[0] = op;
         break;
      }
      assert(insert_block); /* num_defined == 1, so the loop above found a defined operand */
      phi->operands[1] = Operand(dst.regClass());
      phi->definitions[0] = Definition(dst);
      insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
      return;
   }

   /* general case: one operand per collected entry, inserted at the top of this block */
   phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
   for (unsigned i = 0; i < num_operands; i++)
      phi->operands[i] = operands[i];
   phi->definitions[0] = Definition(dst);
   ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
}
9915
9916 void
9917 visit_undef(isel_context* ctx, nir_undef_instr* instr)
9918 {
9919    Temp dst = get_ssa_temp(ctx, &instr->def);
9920
9921    assert(dst.type() == RegType::sgpr);
9922
9923    if (dst.size() == 1) {
9924       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9925    } else {
9926       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9927          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9928       for (unsigned i = 0; i < dst.size(); i++)
9929          vec->operands[i] = Operand::zero();
9930       vec->definitions[0] = Definition(dst);
9931       ctx->block->instructions.emplace_back(std::move(vec));
9932    }
9933 }
9934
9935 void
9936 begin_loop(isel_context* ctx, loop_context* lc)
9937 {
9938    // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9939    append_logical_end(ctx->block);
9940    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9941    Builder bld(ctx->program, ctx->block);
9942    bld.branch(aco_opcode::p_branch, bld.def(s2));
9943    unsigned loop_preheader_idx = ctx->block->index;
9944
9945    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9946
9947    ctx->program->next_loop_depth++;
9948
9949    Block* loop_header = ctx->program->create_and_insert_block();
9950    loop_header->kind |= block_kind_loop_header;
9951    add_edge(loop_preheader_idx, loop_header);
9952    ctx->block = loop_header;
9953
9954    append_logical_start(ctx->block);
9955
9956    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9957    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9958    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9959    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9960    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9961 }
9962
/* Finish the loop opened by begin_loop().
 *
 * If the loop body does not already end in a branch, the current block is closed
 * and connected back to the loop header (through helper blocks and a
 * continue_or_break construct when exec may be empty). Afterwards the loop exit
 * block stored in `lc` is inserted into the program and becomes the current block,
 * and the parent loop / parent if state saved in `lc` is restored.
 */
void
end_loop(isel_context* ctx, loop_context* lc)
{
   // TODO: what if a loop ends with a unconditional or uniformly branched continue
   //       and this branch is never taken?
   if (!ctx->cf_info.has_branch) {
      unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
      Builder bld(ctx->program, ctx->block);
      append_logical_end(ctx->block);

      if (ctx->cf_info.exec_potentially_empty_discard ||
          ctx->cf_info.exec_potentially_empty_break) {
         /* Discards can result in code running with an empty exec mask.
          * This would result in divergent breaks not ever being taken. As a
          * workaround, break the loop when the loop mask is empty instead of
          * always continuing. */
         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
         /* Keep the index: the blocks created below are why ctx->block is
          * re-fetched by index at the end of this branch. */
         unsigned block_idx = ctx->block->index;

         /* create helper blocks to avoid critical edges */
         Block* break_block = ctx->program->create_and_insert_block();
         break_block->kind = block_kind_uniform;
         bld.reset(break_block);
         bld.branch(aco_opcode::p_branch, bld.def(s2));
         add_linear_edge(block_idx, break_block);
         add_linear_edge(break_block->index, &lc->loop_exit);

         Block* continue_block = ctx->program->create_and_insert_block();
         continue_block->kind = block_kind_uniform;
         bld.reset(continue_block);
         bld.branch(aco_opcode::p_branch, bld.def(s2));
         add_linear_edge(block_idx, continue_block);
         add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);

         /* the logical back edge is only added if no divergent branch was recorded */
         if (!ctx->cf_info.parent_loop.has_divergent_branch)
            add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
         ctx->block = &ctx->program->blocks[block_idx];
      } else {
         /* plain continue: logical + linear edge, or only a linear edge if a
          * divergent branch was recorded for this loop */
         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
         if (!ctx->cf_info.parent_loop.has_divergent_branch)
            add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
         else
            add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
      }

      /* terminate the block that ends the loop body */
      bld.reset(ctx->block);
      bld.branch(aco_opcode::p_branch, bld.def(s2));
   }

   ctx->cf_info.has_branch = false;
   ctx->program->next_loop_depth--;

   // TODO: if the loop has not a single exit, we must add one
   /* emit loop successor block */
   ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
   append_logical_start(ctx->block);

   /* The region below is compiled out (#if 0); it is kept verbatim for reference. */
#if 0
   // TODO: check if it is beneficial to not branch on continues
   /* trim linear phis in loop header */
   for (auto&& instr : loop_entry->instructions) {
      if (instr->opcode == aco_opcode::p_linear_phi) {
         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
         new_phi->definitions[0] = instr->definitions[0];
         for (unsigned i = 0; i < new_phi->operands.size(); i++)
            new_phi->operands[i] = instr->operands[i];
         /* check that the remaining operands are all the same */
         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
            assert(instr->operands[i].tempId() == instr->operands.back().tempId());
         instr.swap(new_phi);
      } else if (instr->opcode == aco_opcode::p_phi) {
         continue;
      } else {
         break;
      }
   }
#endif

   /* restore the state that begin_loop() saved in lc */
   ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
   ctx->cf_info.parent_loop.exit = lc->exit_old;
   ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
   ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
   ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
   /* outside of any loop and any divergent if, the discard flag is cleared */
   if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = false;
}
10049
10050 void
10051 emit_loop_jump(isel_context* ctx, bool is_break)
10052 {
10053    Builder bld(ctx->program, ctx->block);
10054    Block* logical_target;
10055    append_logical_end(ctx->block);
10056    unsigned idx = ctx->block->index;
10057
10058    if (is_break) {
10059       logical_target = ctx->cf_info.parent_loop.exit;
10060       add_logical_edge(idx, logical_target);
10061       ctx->block->kind |= block_kind_break;
10062
10063       if (!ctx->cf_info.parent_if.is_divergent &&
10064           !ctx->cf_info.parent_loop.has_divergent_continue) {
10065          /* uniform break - directly jump out of the loop */
10066          ctx->block->kind |= block_kind_uniform;
10067          ctx->cf_info.has_branch = true;
10068          bld.branch(aco_opcode::p_branch, bld.def(s2));
10069          add_linear_edge(idx, logical_target);
10070          return;
10071       }
10072       ctx->cf_info.parent_loop.has_divergent_branch = true;
10073    } else {
10074       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10075       add_logical_edge(idx, logical_target);
10076       ctx->block->kind |= block_kind_continue;
10077
10078       if (!ctx->cf_info.parent_if.is_divergent) {
10079          /* uniform continue - directly jump to the loop header */
10080          ctx->block->kind |= block_kind_uniform;
10081          ctx->cf_info.has_branch = true;
10082          bld.branch(aco_opcode::p_branch, bld.def(s2));
10083          add_linear_edge(idx, logical_target);
10084          return;
10085       }
10086
10087       /* for potential uniform breaks after this continue,
10088          we must ensure that they are handled correctly */
10089       ctx->cf_info.parent_loop.has_divergent_continue = true;
10090       ctx->cf_info.parent_loop.has_divergent_branch = true;
10091    }
10092
10093    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10094       ctx->cf_info.exec_potentially_empty_break = true;
10095       ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10096    }
10097
10098    /* remove critical edges from linear CFG */
10099    bld.branch(aco_opcode::p_branch, bld.def(s2));
10100    Block* break_block = ctx->program->create_and_insert_block();
10101    break_block->kind |= block_kind_uniform;
10102    add_linear_edge(idx, break_block);
10103    /* the loop_header pointer might be invalidated by this point */
10104    if (!is_break)
10105       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10106    add_linear_edge(break_block->index, logical_target);
10107    bld.reset(break_block);
10108    bld.branch(aco_opcode::p_branch, bld.def(s2));
10109
10110    Block* continue_block = ctx->program->create_and_insert_block();
10111    add_linear_edge(idx, continue_block);
10112    append_logical_start(continue_block);
10113    ctx->block = continue_block;
10114 }
10115
10116 void
10117 emit_loop_break(isel_context* ctx)
10118 {
10119    emit_loop_jump(ctx, true);
10120 }
10121
10122 void
10123 emit_loop_continue(isel_context* ctx)
10124 {
10125    emit_loop_jump(ctx, false);
10126 }
10127
10128 void
10129 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10130 {
10131    /* visit_block() would usually do this but divergent jumps updates ctx->block */
10132    ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10133
10134    switch (instr->type) {
10135    case nir_jump_break: emit_loop_break(ctx); break;
10136    case nir_jump_continue: emit_loop_continue(ctx); break;
10137    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10138    }
10139 }
10140
10141 void
10142 visit_block(isel_context* ctx, nir_block* block)
10143 {
10144    if (ctx->block->kind & block_kind_top_level) {
10145       Builder bld(ctx->program, ctx->block);
10146       for (Temp tmp : ctx->unended_linear_vgprs)
10147          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10148       ctx->unended_linear_vgprs.clear();
10149    }
10150
10151    ctx->block->instructions.reserve(ctx->block->instructions.size() +
10152                                     exec_list_length(&block->instr_list) * 2);
10153    nir_foreach_instr (instr, block) {
10154       switch (instr->type) {
10155       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10156       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10157       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10158       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10159       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10160       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10161       case nir_instr_type_deref: break;
10162       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10163       default: isel_err(instr, "Unknown NIR instr type");
10164       }
10165    }
10166
10167    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10168       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10169 }
10170
10171 static Operand
10172 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10173                      aco_ptr<Instruction>& header_phi, Operand* vals)
10174 {
10175    vals[0] = Operand(header_phi->definitions[0].getTemp());
10176    RegClass rc = vals[0].regClass();
10177
10178    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10179
10180    unsigned next_pred = 1;
10181
10182    for (unsigned idx = first + 1; idx <= last; idx++) {
10183       Block& block = ctx->program->blocks[idx];
10184       if (block.loop_nest_depth != loop_nest_depth) {
10185          vals[idx - first] = vals[idx - 1 - first];
10186          continue;
10187       }
10188
10189       if ((block.kind & block_kind_continue) && block.index != last) {
10190          vals[idx - first] = header_phi->operands[next_pred];
10191          next_pred++;
10192          continue;
10193       }
10194
10195       bool all_same = true;
10196       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10197          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10198
10199       Operand val;
10200       if (all_same) {
10201          val = vals[block.linear_preds[0] - first];
10202       } else {
10203          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10204             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10205          for (unsigned i = 0; i < block.linear_preds.size(); i++)
10206             phi->operands[i] = vals[block.linear_preds[i] - first];
10207          val = Operand(ctx->program->allocateTmp(rc));
10208          phi->definitions[0] = Definition(val.getTemp());
10209          block.instructions.emplace(block.instructions.begin(), std::move(phi));
10210       }
10211       vals[idx - first] = val;
10212    }
10213
10214    return vals[last - first];
10215 }
10216
10217 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10218 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10219 static void end_uniform_if(isel_context* ctx, if_context* ic);
10220
/* Select instructions for a NIR loop.
 *
 * Opens the loop with begin_loop(), visits the loop body, fixes up the phis in the
 * loop header (removing or replacing the operand reserved for the back edge),
 * emits a dummy break if the block following the loop has no predecessors and
 * finally closes the loop with end_loop(). Loops with a continue construct are
 * not supported (asserted).
 */
static void
visit_loop(isel_context* ctx, nir_loop* loop)
{
   assert(!nir_loop_has_continue_construct(loop));
   loop_context lc;
   begin_loop(ctx, &lc);

   bool unreachable = visit_cf_list(ctx, &loop->body);

   unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;

   /* Fixup phis in loop header from unreachable blocks.
    * has_branch/has_divergent_branch also indicates if the loop ends with a
    * break/continue instruction, but we don't emit those if unreachable=true */
   if (unreachable) {
      assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
      bool linear = ctx->cf_info.has_branch;
      bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if ((logical && instr->opcode == aco_opcode::p_phi) ||
             (linear && instr->opcode == aco_opcode::p_linear_phi)) {
            /* the last operand should be the one that needs to be removed */
            instr->operands.pop_back();
         } else if (!is_phi(instr)) {
            /* stop at the first non-phi instruction */
            break;
         }
      }
   }

   /* Fixup linear phis in loop header from expecting a continue. Both this fixup
    * and the previous one shouldn't both happen at once because a break in the
    * merge block would get CSE'd */
   if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
      /* scratch space for create_continue_phis(): one operand per block from the
       * loop header up to the current block (a single unused slot if has_branch) */
      unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
      Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if (instr->opcode == aco_opcode::p_linear_phi) {
            if (ctx->cf_info.has_branch)
               instr->operands.pop_back();
            else
               instr->operands.back() =
                  create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
         } else if (!is_phi(instr)) {
            /* stop at the first non-phi instruction */
            break;
         }
      }
   }

   /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
    * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
    */
   if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
      Builder bld(ctx->program, ctx->block);
      /* the condition is a constant zero copied into scc */
      Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
      if_context ic;
      begin_uniform_if_then(ctx, &ic, cond);
      emit_loop_break(ctx);
      begin_uniform_if_else(ctx, &ic);
      end_uniform_if(ctx, &ic);
   }

   end_loop(ctx, &lc);
}
10284
10285 static void
10286 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10287                         nir_selection_control sel_ctrl = nir_selection_control_none)
10288 {
10289    ic->cond = cond;
10290
10291    append_logical_end(ctx->block);
10292    ctx->block->kind |= block_kind_branch;
10293
10294    /* branch to linear then block */
10295    assert(cond.regClass() == ctx->program->lane_mask);
10296    aco_ptr<Pseudo_branch_instruction> branch;
10297    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10298                                                               Format::PSEUDO_BRANCH, 1, 1));
10299    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10300    branch->operands[0] = Operand(cond);
10301    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10302                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10303    ctx->block->instructions.push_back(std::move(branch));
10304
10305    ic->BB_if_idx = ctx->block->index;
10306    ic->BB_invert = Block();
10307    /* Invert blocks are intentionally not marked as top level because they
10308     * are not part of the logical cfg. */
10309    ic->BB_invert.kind |= block_kind_invert;
10310    ic->BB_endif = Block();
10311    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10312
10313    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10314    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10315    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10316    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10317    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10318    ctx->cf_info.parent_if.is_divergent = true;
10319
10320    /* divergent branches use cbranch_execz */
10321    ctx->cf_info.exec_potentially_empty_discard = false;
10322    ctx->cf_info.exec_potentially_empty_break = false;
10323    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10324
10325    /** emit logical then block */
10326    ctx->program->next_divergent_if_logical_depth++;
10327    Block* BB_then_logical = ctx->program->create_and_insert_block();
10328    add_edge(ic->BB_if_idx, BB_then_logical);
10329    ctx->block = BB_then_logical;
10330    append_logical_start(BB_then_logical);
10331 }
10332
10333 static void
10334 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10335                         nir_selection_control sel_ctrl = nir_selection_control_none)
10336 {
         /* Finishes the logical then-block of a divergent if, emits the linear
          * then-block and the invert block, and starts the logical else-block,
          * which becomes ctx->block. sel_ctrl is only used to set the
          * selection_control_remove flag on the invert block's branch. */
10337    Block* BB_then_logical = ctx->block;
10338    append_logical_end(BB_then_logical);
10339    /* branch from logical then block to invert block */
10340    aco_ptr<Pseudo_branch_instruction> branch;
10341    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10342                                                               Format::PSEUDO_BRANCH, 0, 1));
10343    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10344    BB_then_logical->instructions.emplace_back(std::move(branch));
10345    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
         /* the logical edge to the endif block is skipped when a divergent branch
          * was recorded for the parent loop while emitting the then side */
10346    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10347       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10348    BB_then_logical->kind |= block_kind_uniform;
10349    assert(!ctx->cf_info.has_branch);
         /* keep the then-side flag in ic and clear it before the else side is emitted */
10350    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10351    ctx->cf_info.parent_loop.has_divergent_branch = false;
10352    ctx->program->next_divergent_if_logical_depth--;
10353
10354    /** emit linear then block */
10355    Block* BB_then_linear = ctx->program->create_and_insert_block();
10356    BB_then_linear->kind |= block_kind_uniform;
10357    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10358    /* branch from linear then block to invert block */
10359    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10360                                                               Format::PSEUDO_BRANCH, 0, 1));
10361    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10362    BB_then_linear->instructions.emplace_back(std::move(branch));
10363    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10364
10365    /** emit invert merge block */
10366    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10367    ic->invert_idx = ctx->block->index;
10368
10369    /* branch to linear else block (skip else) */
10370    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10371                                                               Format::PSEUDO_BRANCH, 0, 1));
10372    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
         /* flagged for the flatten and divergent_always_taken selection controls */
10373    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10374                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10375    ctx->block->instructions.push_back(std::move(branch));
10376
         /* fold the then-side exec_potentially_empty_* state into the values saved in ic */
10377    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10378    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10379    ic->exec_potentially_empty_break_depth_old = std::min(
10380       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10381    /* divergent branches use cbranch_execz */
10382    ctx->cf_info.exec_potentially_empty_discard = false;
10383    ctx->cf_info.exec_potentially_empty_break = false;
10384    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10385
         /* remember the then-side had_divergent_discard and start the else side
          * from the value stored in ic->had_divergent_discard_old */
10386    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10387    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10388
10389    /** emit logical else block */
10390    ctx->program->next_divergent_if_logical_depth++;
10391    Block* BB_else_logical = ctx->program->create_and_insert_block();
10392    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10393    add_linear_edge(ic->invert_idx, BB_else_logical);
10394    ctx->block = BB_else_logical;
10395    append_logical_start(BB_else_logical);
10396 }
10397
10398 static void
10399 end_divergent_if(isel_context* ctx, if_context* ic)
10400 {
         /* Finishes the logical else-block of a divergent if, emits the linear
          * else-block, inserts ic->BB_endif as the new ctx->block and merges the
          * control-flow state saved in ic back into ctx->cf_info. */
10401    Block* BB_else_logical = ctx->block;
10402    append_logical_end(BB_else_logical);
10403
10404    /* branch from logical else block to endif block */
10405    aco_ptr<Pseudo_branch_instruction> branch;
10406    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10407                                                               Format::PSEUDO_BRANCH, 0, 1));
10408    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10409    BB_else_logical->instructions.emplace_back(std::move(branch));
10410    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10411    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10412       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10413    BB_else_logical->kind |= block_kind_uniform;
10414    ctx->program->next_divergent_if_logical_depth--;
10415
10416    assert(!ctx->cf_info.has_branch);
         /* the flag stays set only if it was set for the then side as well */
10417    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10418
10419    /** emit linear else block */
10420    Block* BB_else_linear = ctx->program->create_and_insert_block();
10421    BB_else_linear->kind |= block_kind_uniform;
10422    add_linear_edge(ic->invert_idx, BB_else_linear);
10423
10424    /* branch from linear else block to endif block */
10425    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10426                                                               Format::PSEUDO_BRANCH, 0, 1));
10427    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10428    BB_else_linear->instructions.emplace_back(std::move(branch));
10429    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10430
10431    /** emit endif merge block */
10432    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10433    append_logical_start(ctx->block);
10434
         /* restore the saved is_divergent flag and combine the else-side
          * exec_potentially_empty_* state with the values accumulated in ic */
10435    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10436    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10437    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10438    ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10439       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
         /* the break state is reset once the merge block sits at the recorded loop
          * depth and the enclosing if is not divergent */
10440    if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10441        !ctx->cf_info.parent_if.is_divergent) {
10442       ctx->cf_info.exec_potentially_empty_break = false;
10443       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10444    }
10445    /* uniform control flow never has an empty exec-mask */
10446    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10447       ctx->cf_info.exec_potentially_empty_discard = false;
10448       ctx->cf_info.exec_potentially_empty_break = false;
10449       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10450    }
         /* a divergent discard recorded on either side is kept */
10451    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10452 }
10453
10454 static void
10455 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10456 {
         /* Ends the current block with a p_cbranch_z on cond, resets ic->BB_endif
          * and starts the then-block, which becomes ctx->block.
          * cond must be an s1 temporary; its operand is fixed to scc. */
10457    assert(cond.regClass() == s1);
10458
10459    append_logical_end(ctx->block);
10460    ctx->block->kind |= block_kind_uniform;
10461
10462    aco_ptr<Pseudo_branch_instruction> branch;
10463    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10464    branch.reset(
10465       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10466    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10467    branch->operands[0] = Operand(cond);
10468    branch->operands[0].setFixed(scc);
10469    ctx->block->instructions.emplace_back(std::move(branch));
10470
10471    ic->BB_if_idx = ctx->block->index;
10472    ic->BB_endif = Block();
         /* the endif block inherits only the top_level bit of the current block's kind */
10473    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10474
         /* both flags start out cleared for the then side */
10475    ctx->cf_info.has_branch = false;
10476    ctx->cf_info.parent_loop.has_divergent_branch = false;
10477
10478    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10479
10480    /** emit then block */
10481    ctx->program->next_uniform_if_depth++;
10482    Block* BB_then = ctx->program->create_and_insert_block();
10483    add_edge(ic->BB_if_idx, BB_then);
10484    append_logical_start(BB_then);
10485    ctx->block = BB_then;
10486 }
10487
10488 static void
10489 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10490 {
         /* Finishes the then-block of a uniform if and starts the else-block, which
          * becomes ctx->block. The then-side has_branch / has_divergent_branch /
          * had_divergent_discard values are stored in ic. */
10491    Block* BB_then = ctx->block;
10492
10493    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10494    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10495
         /* only terminate the then-block here if has_branch was not set for it */
10496    if (!ic->uniform_has_then_branch) {
10497       append_logical_end(BB_then);
10498       /* branch from then block to endif block */
10499       aco_ptr<Pseudo_branch_instruction> branch;
10500       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10501                                                                  Format::PSEUDO_BRANCH, 0, 1));
10502       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10503       BB_then->instructions.emplace_back(std::move(branch));
10504       add_linear_edge(BB_then->index, &ic->BB_endif);
10505       if (!ic->then_branch_divergent)
10506          add_logical_edge(BB_then->index, &ic->BB_endif);
10507       BB_then->kind |= block_kind_uniform;
10508    }
10509
         /* both flags start out cleared for the else side */
10510    ctx->cf_info.has_branch = false;
10511    ctx->cf_info.parent_loop.has_divergent_branch = false;
10512
         /* remember the then-side had_divergent_discard and start the else side
          * from the value stored in ic->had_divergent_discard_old */
10513    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10514    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10515
10516    /** emit else block */
10517    Block* BB_else = ctx->program->create_and_insert_block();
10518    add_edge(ic->BB_if_idx, BB_else);
10519    append_logical_start(BB_else);
10520    ctx->block = BB_else;
10521 }
10522
10523 static void
10524 end_uniform_if(isel_context* ctx, if_context* ic)
10525 {
         /* Finishes the else-block of a uniform if and, unless has_branch ends up
          * set for both sides, inserts ic->BB_endif as the new ctx->block. */
10526    Block* BB_else = ctx->block;
10527
         /* only terminate the else-block here if has_branch was not set for it */
10528    if (!ctx->cf_info.has_branch) {
10529       append_logical_end(BB_else);
10530       /* branch from else block to endif block */
10531       aco_ptr<Pseudo_branch_instruction> branch;
10532       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10533                                                                  Format::PSEUDO_BRANCH, 0, 1));
10534       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10535       BB_else->instructions.emplace_back(std::move(branch));
10536       add_linear_edge(BB_else->index, &ic->BB_endif);
10537       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10538          add_logical_edge(BB_else->index, &ic->BB_endif);
10539       BB_else->kind |= block_kind_uniform;
10540    }
10541
         /* the branch flags stay set only if they were set for the then side too;
          * a divergent discard recorded on either side is kept */
10542    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10543    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10544    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10545
10546    /** emit endif merge block */
10547    ctx->program->next_uniform_if_depth--;
10548    if (!ctx->cf_info.has_branch) {
10549       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10550       append_logical_start(ctx->block);
10551    }
10552 }
10553
10554 static bool
10555 visit_if(isel_context* ctx, nir_if* if_stmt)
10556 {
10557    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10558    Builder bld(ctx->program, ctx->block);
10559    aco_ptr<Pseudo_branch_instruction> branch;
10560    if_context ic;
10561
10562    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10563       /**
10564        * Uniform conditionals are represented in the following way*) :
10565        *
10566        * The linear and logical CFG:
10567        *                        BB_IF
10568        *                        /    \
10569        *       BB_THEN (logical)      BB_ELSE (logical)
10570        *                        \    /
10571        *                        BB_ENDIF
10572        *
10573        * *) Exceptions may be due to break and continue statements within loops
10574        *    If a break/continue happens within uniform control flow, it branches
10575        *    to the loop exit/entry block. Otherwise, it branches to the next
10576        *    merge block.
10577        **/
10578
10579       assert(cond.regClass() == ctx->program->lane_mask);
10580       cond = bool_to_scalar_condition(ctx, cond);
10581
10582       begin_uniform_if_then(ctx, &ic, cond);
10583       visit_cf_list(ctx, &if_stmt->then_list);
10584
10585       begin_uniform_if_else(ctx, &ic);
10586       visit_cf_list(ctx, &if_stmt->else_list);
10587
10588       end_uniform_if(ctx, &ic);
10589    } else { /* non-uniform condition */
10590       /**
10591        * To maintain a logical and linear CFG without critical edges,
10592        * non-uniform conditionals are represented in the following way*) :
10593        *
10594        * The linear CFG:
10595        *                        BB_IF
10596        *                        /    \
10597        *       BB_THEN (logical)      BB_THEN (linear)
10598        *                        \    /
10599        *                        BB_INVERT (linear)
10600        *                        /    \
10601        *       BB_ELSE (logical)      BB_ELSE (linear)
10602        *                        \    /
10603        *                        BB_ENDIF
10604        *
10605        * The logical CFG:
10606        *                        BB_IF
10607        *                        /    \
10608        *       BB_THEN (logical)      BB_ELSE (logical)
10609        *                        \    /
10610        *                        BB_ENDIF
10611        *
10612        * *) Exceptions may be due to break and continue statements within loops
10613        **/
10614
10615       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10616       visit_cf_list(ctx, &if_stmt->then_list);
10617
10618       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10619       visit_cf_list(ctx, &if_stmt->else_list);
10620
10621       end_divergent_if(ctx, &ic);
10622    }
10623
10624    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10625 }
10626
10627 static bool
10628 visit_cf_list(isel_context* ctx, struct exec_list* list)
10629 {
10630    foreach_list_typed (nir_cf_node, node, node, list) {
10631       switch (node->type) {
10632       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10633       case nir_cf_node_if:
10634          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10635             return true;
10636          break;
10637       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10638       default: unreachable("unimplemented cf list type");
10639       }
10640    }
10641    return false;
10642 }
10643
10644 struct mrt_color_export {
         /* Description of one color output as consumed by export_fs_mrt_color(). */

         /* added to V_008DFC_SQ_EXP_MRT to form the export target */
10645    int slot;
         /* bit i set means channel i is written */
10646    unsigned write_mask;
         /* one operand per channel */
10647    Operand values[4];
         /* compared against the V_028714_SPI_SHADER_* values */
10648    uint8_t col_format;
10649
10650    /* Fields below are only used for PS epilogs. */
         /* is_int8 / is_int10 select the clamp range for the UINT16/SINT16 formats */
10651    bool is_int8;
10652    bool is_int10;
         /* when set, NaN channels of 32-bit inputs are replaced by zero */
10653    bool enable_mrt_output_nan_fixup;
10654 };
10655
10656 static void
10657 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10658 {
10659    Builder bld(ctx->program, ctx->block);
10660
10661    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10662            mrt->enabled_channels, mrt->target, mrt->compr);
10663
10664    ctx->program->has_color_exports = true;
10665 }
10666
10667 static bool
10668 export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out,
10669                     struct aco_export_mrt* mrt)
10670 {
         /* Prepares the color output described by `out` for its col_format and
          * stores the resulting operands, target, enabled channels and compr flag
          * in `mrt`. Conversion, clamp and packing instructions are created through
          * bld. Returns false without writing `mrt` for V_028714_SPI_SHADER_ZERO
          * or any col_format not handled below, true otherwise. */
10671    Builder bld(ctx->program, ctx->block);
10672    Operand values[4];
10673
         /* work on a local copy, `out` itself is never modified */
10674    for (unsigned i = 0; i < 4; ++i) {
10675       values[i] = out->values[i];
10676    }
10677
10678    unsigned target;
10679    unsigned enabled_channels = 0;
         /* num_opcodes acts as the "no packing opcode selected" marker */
10680    aco_opcode compr_op = aco_opcode::num_opcodes;
10681    bool compr = false;
         /* 16-bit input is detected from the register class of the first channel only */
10682    bool is_16bit = values[0].regClass() == v2b;
10683
10684    target = V_008DFC_SQ_EXP_MRT + out->slot;
10685
10686    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10687    if (out->enable_mrt_output_nan_fixup && !is_16bit &&
10688        (out->col_format == V_028714_SPI_SHADER_32_R ||
10689         out->col_format == V_028714_SPI_SHADER_32_GR ||
10690         out->col_format == V_028714_SPI_SHADER_32_AR ||
10691         out->col_format == V_028714_SPI_SHADER_32_ABGR ||
10692         out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10693       u_foreach_bit (i, out->write_mask) {
               /* x == x is false only for NaN */
10694          Temp is_not_nan =
10695             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10696          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10697                               is_not_nan);
10698       }
10699    }
10700
10701    switch (out->col_format) {
10702    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10703
10704    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10705
10706    case V_028714_SPI_SHADER_32_AR:
10707       if (ctx->options->gfx_level >= GFX10) {
10708          /* Special case: on GFX10, the outputs are different for 32_AR */
10709          enabled_channels = 0x3;
10710          values[1] = values[3];
10711          values[3] = Operand(v1);
10712       } else {
10713          enabled_channels = 0x9;
10714       }
10715       break;
10716
10717    case V_028714_SPI_SHADER_FP16_ABGR:
            /* i selects a channel pair: channels (0,1) end up in values[0] and
             * channels (2,3) in values[1] */
10718       for (int i = 0; i < 2; i++) {
10719          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10720          if (enabled) {
10721             enabled_channels |= 0x3 << (i * 2);
10722             if (is_16bit) {
10723                values[i] =
10724                   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10725                              values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10726                              values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10727             } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10728                values[i] =
10729                   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10730                            values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10731                            values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10732             } else {
                     /* an undefined half of the pair is replaced by the other half */
10733                values[i] =
10734                   bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10735                            values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10736                            values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10737             }
10738          } else {
10739             values[i] = Operand(v1);
10740          }
10741       }
10742       values[2] = Operand(v1);
10743       values[3] = Operand(v1);
10744       compr = true;
10745       break;
10746
10747    case V_028714_SPI_SHADER_UNORM16_ABGR:
10748       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10749          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10750       } else {
10751          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10752       }
10753       break;
10754
10755    case V_028714_SPI_SHADER_SNORM16_ABGR:
10756       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10757          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10758       } else {
10759          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10760       }
10761       break;
10762
10763    case V_028714_SPI_SHADER_UINT16_ABGR:
10764       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10765       if (out->is_int8 || out->is_int10) {
10766          /* clamp */
10767          uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0;
10768
10769          u_foreach_bit (i, out->write_mask) {
                  /* channel 3 of an int10 output gets its own limit */
10770             uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb;
10771
10772             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10773          }
10774       } else if (is_16bit) {
               /* convert_int() is called with 16, 32 and false here */
10775          u_foreach_bit (i, out->write_mask) {
10776             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10777             values[i] = Operand(tmp);
10778          }
10779       }
10780       break;
10781
10782    case V_028714_SPI_SHADER_SINT16_ABGR:
10783       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10784       if (out->is_int8 || out->is_int10) {
10785          /* clamp */
10786          uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0;
10787          uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0;
10788
10789          u_foreach_bit (i, out->write_mask) {
                  /* channel 3 of an int10 output gets its own limits */
10790             uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb;
10791             uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb;
10792
10793             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10794             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10795          }
10796       } else if (is_16bit) {
               /* convert_int() is called with 16, 32 and true here */
10797          u_foreach_bit (i, out->write_mask) {
10798             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10799             values[i] = Operand(tmp);
10800          }
10801       }
10802       break;
10803
10804    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10805
10806    case V_028714_SPI_SHADER_ZERO:
10807    default: return false;
10808    }
10809
         /* formats that selected a packing opcode above: pack every enabled
          * channel pair with it */
10810    if (compr_op != aco_opcode::num_opcodes) {
10811       for (int i = 0; i < 2; i++) {
10812          /* check if at least one of the values to be compressed is enabled */
10813          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10814          if (enabled) {
10815             enabled_channels |= 0x3 << (i * 2);
10816             values[i] = bld.vop3(
10817                compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10818                values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10819          } else {
10820             values[i] = Operand(v1);
10821          }
10822       }
10823       values[2] = Operand(v1);
10824       values[3] = Operand(v1);
10825       compr = true;
10826    } else if (!compr) {
            /* unpacked formats: channels outside enabled_channels become Operand(v1) */
10827       for (int i = 0; i < 4; i++)
10828          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10829    }
10830
10831    if (ctx->program->gfx_level >= GFX11) {
10832       /* GFX11 doesn't use COMPR for exports, but the channel mask should be
10833        * 0x3 instead.
10834        */
10835       enabled_channels = compr ? 0x3 : enabled_channels;
10836       compr = false;
10837    }
10838
10839    for (unsigned i = 0; i < 4; i++)
10840       mrt->out[i] = values[i];
10841    mrt->target = target;
10842    mrt->enabled_channels = enabled_channels;
10843    mrt->compr = compr;
10844
10845    return true;
10846 }
10847
10848 static void
10849 create_fs_null_export(isel_context* ctx)
10850 {
10851    /* FS must always have exports.
10852     * So when there are none, we need to add a null export.
10853     */
10854
10855    Builder bld(ctx->program, ctx->block);
10856    /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
10857    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
10858    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
10859            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
10860
10861    ctx->program->has_color_exports = true;
10862 }
10863
10864 static void
10865 create_fs_jump_to_epilog(isel_context* ctx)
10866 {
10867    Builder bld(ctx->program, ctx->block);
10868    std::vector<Operand> color_exports;
10869    PhysReg exports_start(256); /* VGPR 0 */
10870
10871    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
10872       unsigned color_index = slot - FRAG_RESULT_DATA0;
10873       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
10874       unsigned write_mask = ctx->outputs.mask[slot];
10875
10876       if (!write_mask)
10877          continue;
10878
10879       PhysReg color_start(exports_start.reg() + color_index * 4);
10880
10881       for (unsigned i = 0; i < 4; i++) {
10882          if (!(write_mask & BITFIELD_BIT(i))) {
10883             color_exports.emplace_back(Operand(v1));
10884             continue;
10885          }
10886
10887          PhysReg chan_reg = color_start.advance(i * 4u);
10888          Operand chan(ctx->outputs.temps[slot * 4u + i]);
10889
10890          if (color_type == ACO_TYPE_FLOAT16) {
10891             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
10892          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
10893             bool sign_ext = color_type == ACO_TYPE_INT16;
10894             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
10895             chan = Operand(tmp);
10896          }
10897
10898          chan.setFixed(chan_reg);
10899          color_exports.emplace_back(chan);
10900       }
10901    }
10902
10903    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc));
10904
10905    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
10906       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + color_exports.size(), 0)};
10907    jump->operands[0] = Operand(continue_pc);
10908    for (unsigned i = 0; i < color_exports.size(); i++) {
10909       jump->operands[i + 1] = color_exports[i];
10910    }
10911    ctx->block->instructions.emplace_back(std::move(jump));
10912 }
10913
10914 PhysReg
10915 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
10916 {
10917    assert(arg.used);
10918    enum ac_arg_regfile file = args->args[arg.arg_index].file;
10919    unsigned reg = args->args[arg.arg_index].offset;
10920    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
10921 }
10922
10923 static Operand
10924 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
10925 {
10926    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
10927 }
10928
10929 static Temp
10930 get_tcs_out_current_patch_data_offset(isel_context* ctx)
10931 {
10932    Builder bld(ctx->program, ctx->block);
10933
10934    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 4u;
10935    const unsigned pervertex_output_patch_size =
10936       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10937    const unsigned output_patch_stride =
10938       pervertex_output_patch_size + ctx->program->info.tcs.num_linked_patch_outputs * 4u;
10939
10940    Temp tcs_rel_ids = get_arg(ctx, ctx->args->tcs_rel_ids);
10941    Temp rel_patch_id =
10942       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand::c32(0u), Operand::c32(8u));
10943    Temp patch_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, output_patch_stride, false);
10944
10945    Temp tcs_offchip_layout = get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout);
10946
10947    Temp patch_control_points = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
10948                                         tcs_offchip_layout, Operand::c32(0x3f));
10949
10950    Temp num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10951                                tcs_offchip_layout, Operand::c32(0x60006));
10952
10953    Temp lshs_vertex_stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10954                                       tcs_offchip_layout, Operand::c32(0x8000c));
10955
10956    Temp input_patch_size =
10957       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), patch_control_points, lshs_vertex_stride);
10958
10959    Temp output_patch0_offset =
10960       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, input_patch_size);
10961
10962    Temp output_patch_offset =
10963       bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10964                      Operand::c32(pervertex_output_patch_size), output_patch0_offset);
10965
10966    return bld.nuw().vadd32(bld.def(v1), patch_offset, output_patch_offset);
10967 }
10968
10969 static Temp
10970 get_patch_base(isel_context* ctx)
10971 {
10972    Builder bld(ctx->program, ctx->block);
10973
10974    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 16u;
10975    const unsigned pervertex_output_patch_size =
10976       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10977
10978    Temp num_patches =
10979       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10980                get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout), Operand::c32(0x60006));
10981
10982    return bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches,
10983                    Operand::c32(pervertex_output_patch_size));
10984 }
10985
10986 static void
10987 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
10988 {
10989    struct ac_arg arg;
10990    arg.used = true;
10991
10992    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
10993       regs.emplace_back(get_arg_for_end(ctx, arg));
10994 }
10995
10996 static void
10997 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
10998 {
10999    aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
11000       aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
11001
11002    for (unsigned i = 0; i < regs.size(); i++)
11003       end->operands[i] = regs[i];
11004
11005    ctx->block->instructions.emplace_back(std::move(end));
11006 }
11007
11008 static void
11009 create_tcs_jump_to_epilog(isel_context* ctx)
11010 {
11011    Builder bld(ctx->program, ctx->block);
11012
11013    PhysReg vgpr_start(256); /* VGPR 0 */
11014    PhysReg sgpr_start(0);   /* SGPR 0 */
11015
11016    /* SGPRs */
11017    Operand ring_offsets = Operand(get_arg(ctx, ctx->args->ring_offsets));
11018    ring_offsets.setFixed(sgpr_start);
11019
11020    Operand tess_offchip_offset = Operand(get_arg(ctx, ctx->args->tess_offchip_offset));
11021    tess_offchip_offset.setFixed(sgpr_start.advance(8u));
11022
11023    Operand tcs_factor_offset = Operand(get_arg(ctx, ctx->args->tcs_factor_offset));
11024    tcs_factor_offset.setFixed(sgpr_start.advance(12u));
11025
11026    Operand tcs_offchip_layout = Operand(get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11027    tcs_offchip_layout.setFixed(sgpr_start.advance(16u));
11028
11029    Operand patch_base = Operand(get_patch_base(ctx));
11030    patch_base.setFixed(sgpr_start.advance(20u));
11031
11032    /* VGPRs */
11033    Operand tcs_out_current_patch_data_offset = Operand(get_tcs_out_current_patch_data_offset(ctx));
11034    tcs_out_current_patch_data_offset.setFixed(vgpr_start);
11035
11036    Operand invocation_id =
11037       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11038                Operand::c32(8u), Operand::c32(5u));
11039    invocation_id.setFixed(vgpr_start.advance(4u));
11040
11041    Operand rel_patch_id =
11042       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11043                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11044    rel_patch_id.setFixed(vgpr_start.advance(8u));
11045
11046    Temp continue_pc =
11047       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.tcs.epilog_pc));
11048
11049    aco_ptr<Pseudo_instruction> jump{
11050       create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 9, 0)};
11051    jump->operands[0] = Operand(continue_pc);
11052    jump->operands[1] = ring_offsets;
11053    jump->operands[2] = tess_offchip_offset;
11054    jump->operands[3] = tcs_factor_offset;
11055    jump->operands[4] = tcs_offchip_layout;
11056    jump->operands[5] = patch_base;
11057    jump->operands[6] = tcs_out_current_patch_data_offset;
11058    jump->operands[7] = invocation_id;
11059    jump->operands[8] = rel_patch_id;
11060    ctx->block->instructions.emplace_back(std::move(jump));
11061 }
11062
11063 static void
11064 create_tcs_end_for_epilog(isel_context* ctx)
11065 {
11066    std::vector<Operand> regs;
11067
11068    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11069    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tes_offchip_addr));
11070    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tess_offchip_offset));
11071    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tcs_factor_offset));
11072
11073    Builder bld(ctx->program, ctx->block);
11074
11075    /* Leave a hole corresponding to the two input VGPRs. This ensures that
11076     * the invocation_id output does not alias the tcs_rel_ids input,
11077     * which saves a V_MOV on gfx9.
11078     */
11079    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11080
11081    Temp rel_patch_id =
11082       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11083                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11084    regs.emplace_back(Operand(rel_patch_id, PhysReg{vgpr++}));
11085
11086    Temp invocation_id =
11087       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11088                Operand::c32(8u), Operand::c32(5u));
11089    regs.emplace_back(Operand(invocation_id, PhysReg{vgpr++}));
11090
11091    if (ctx->program->info.tcs.pass_tessfactors_by_reg) {
11092       vgpr++; /* skip the tess factor LDS offset */
11093
11094       unsigned slot = VARYING_SLOT_TESS_LEVEL_OUTER;
11095       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11096          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11097       }
11098       vgpr += 4;
11099
11100       slot = VARYING_SLOT_TESS_LEVEL_INNER;
11101       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11102          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11103       }
11104    } else {
11105       Temp patch0_patch_data_offset =
11106          bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11107                   get_arg(ctx, ctx->program->info.tcs.vs_state_bits), Operand::c32(0xe000a));
11108
11109       Temp tf_lds_offset =
11110          bld.v_mul24_imm(bld.def(v1), rel_patch_id, ctx->program->info.tcs.patch_stride);
11111       tf_lds_offset = bld.nuw().vadd32(bld.def(v1), tf_lds_offset, patch0_patch_data_offset);
11112
11113       regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr}));
11114    }
11115
11116    build_end_with_regs(ctx, regs);
11117 }
11118
11119 Pseudo_instruction*
11120 add_startpgm(struct isel_context* ctx)
11121 {
11122    unsigned def_count = 0;
11123    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11124       if (ctx->args->args[i].skip)
11125          continue;
11126       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11127       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11128          def_count += ctx->args->args[i].size;
11129       else
11130          def_count++;
11131    }
11132
11133    Pseudo_instruction* startpgm =
11134       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11135    ctx->block->instructions.emplace_back(startpgm);
11136    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11137       if (ctx->args->args[i].skip)
11138          continue;
11139
11140       enum ac_arg_regfile file = ctx->args->args[i].file;
11141       unsigned size = ctx->args->args[i].size;
11142       unsigned reg = ctx->args->args[i].offset;
11143       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11144
11145       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11146          Temp elems[16];
11147          for (unsigned j = 0; j < size; j++) {
11148             elems[j] = ctx->program->allocateTmp(s1);
11149             startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11150          }
11151          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11152       } else {
11153          Temp dst = ctx->program->allocateTmp(type);
11154          Definition def(dst);
11155          def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11156          ctx->arg_temps[i] = dst;
11157          startpgm->definitions[arg++] = def;
11158
11159          if (ctx->args->args[i].pending_vmem) {
11160             assert(file == AC_ARG_VGPR);
11161             ctx->program->args_pending_vmem.push_back(def);
11162          }
11163       }
11164    }
11165
11166    /* epilog has no scratch */
11167    if (ctx->args->scratch_offset.used) {
11168       if (ctx->program->gfx_level < GFX9) {
11169          /* Stash these in the program so that they can be accessed later when
11170           * handling spilling.
11171           */
11172          if (ctx->args->ring_offsets.used)
11173             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11174
11175          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11176       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11177          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11178           */
11179          Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
11180          scratch_offset.setLateKill(true);
11181
11182          Operand scratch_addr = ctx->args->ring_offsets.used
11183                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11184                                    : Operand(s2);
11185
11186          Builder bld(ctx->program, ctx->block);
11187          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11188                     scratch_offset);
11189       }
11190    }
11191
11192    return startpgm;
11193 }
11194
11195 void
11196 fix_ls_vgpr_init_bug(isel_context* ctx)
11197 {
11198    Builder bld(ctx->program, ctx->block);
11199    constexpr unsigned hs_idx = 1u;
11200    Builder::Result hs_thread_count =
11201       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11202                get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11203    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11204
11205    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11206
11207    Temp instance_id =
11208       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11209                get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11210    Temp vs_rel_patch_id =
11211       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11212                get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11213    Temp vertex_id =
11214       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11215                get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11216
11217    ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11218    ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11219    ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11220 }
11221
11222 void
11223 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11224 {
11225    /* Split all arguments except for the first (ring_offsets) and the last
11226     * (exec) so that the dead channels don't stay live throughout the program.
11227     */
11228    for (int i = 1; i < startpgm->definitions.size(); i++) {
11229       if (startpgm->definitions[i].regClass().size() > 1) {
11230          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11231                            startpgm->definitions[i].regClass().size());
11232       }
11233    }
11234 }
11235
11236 void
11237 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11238 {
11239    Program* program = ctx->program;
11240
11241    unsigned float_controls = shader->info.float_controls_execution_mode;
11242
11243    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11244       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11245    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11246       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11247                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11248
11249    program->next_fp_mode.must_flush_denorms32 =
11250       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11251    program->next_fp_mode.must_flush_denorms16_64 =
11252       float_controls &
11253       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11254
11255    program->next_fp_mode.care_about_round32 =
11256       float_controls &
11257       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11258
11259    program->next_fp_mode.care_about_round16_64 =
11260       float_controls &
11261       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11262        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11263
11264    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11265     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11266    if (program->next_fp_mode.must_flush_denorms16_64)
11267       program->next_fp_mode.denorm16_64 = 0;
11268    else
11269       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11270
11271    /* preserving fp32 denorms is expensive, so only do it if asked */
11272    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11273       program->next_fp_mode.denorm32 = fp_denorm_keep;
11274    else
11275       program->next_fp_mode.denorm32 = 0;
11276
11277    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11278       program->next_fp_mode.round32 = fp_round_tz;
11279    else
11280       program->next_fp_mode.round32 = fp_round_ne;
11281
11282    if (float_controls &
11283        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11284       program->next_fp_mode.round16_64 = fp_round_tz;
11285    else
11286       program->next_fp_mode.round16_64 = fp_round_ne;
11287
11288    ctx->block->fp_mode = program->next_fp_mode;
11289 }
11290
11291 void
11292 cleanup_cfg(Program* program)
11293 {
11294    /* create linear_succs/logical_succs */
11295    for (Block& BB : program->blocks) {
11296       for (unsigned idx : BB.linear_preds)
11297          program->blocks[idx].linear_succs.emplace_back(BB.index);
11298       for (unsigned idx : BB.logical_preds)
11299          program->blocks[idx].logical_succs.emplace_back(BB.index);
11300    }
11301 }
11302
11303 void
11304 finish_program(isel_context* ctx)
11305 {
11306    cleanup_cfg(ctx->program);
11307
11308    /* Insert a single p_end_wqm instruction after the last derivative calculation */
11309    if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11310       /* Find the next BB at top-level CFG */
11311       while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11312          ctx->wqm_block_idx++;
11313          ctx->wqm_instruction_idx = 0;
11314       }
11315
11316       std::vector<aco_ptr<Instruction>>* instrs =
11317          &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11318       auto it = instrs->begin() + ctx->wqm_instruction_idx;
11319
11320       /* Delay transistion to Exact to help optimizations and scheduling */
11321       while (it != instrs->end()) {
11322          aco_ptr<Instruction>& instr = *it;
11323          /* End WQM before: */
11324          if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11325              instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11326              instr->opcode == aco_opcode::p_logical_start)
11327             break;
11328
11329          ++it;
11330
11331          /* End WQM after: */
11332          if (instr->opcode == aco_opcode::p_logical_end ||
11333              instr->opcode == aco_opcode::p_discard_if ||
11334              instr->opcode == aco_opcode::p_demote_to_helper ||
11335              instr->opcode == aco_opcode::p_end_with_regs)
11336             break;
11337       }
11338
11339       Builder bld(ctx->program);
11340       bld.reset(instrs, it);
11341       bld.pseudo(aco_opcode::p_end_wqm);
11342    }
11343 }
11344
11345 Temp
11346 lanecount_to_mask(isel_context* ctx, Temp count)
11347 {
11348    assert(count.regClass() == s1);
11349
11350    Builder bld(ctx->program, ctx->block);
11351    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11352    Temp cond;
11353
11354    if (ctx->program->wave_size == 64) {
11355       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11356       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11357                                 Operand::c32(6u /* log2(64) */));
11358       cond =
11359          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11360    } else {
11361       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11362        * the register */
11363       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11364    }
11365
11366    return cond;
11367 }
11368
11369 Temp
11370 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11371 {
11372    Builder bld(ctx->program, ctx->block);
11373
11374    /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
11375    Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11376                        : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11377                                   get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11378
11379    return lanecount_to_mask(ctx, count);
11380 }
11381
11382 static void
11383 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11384 {
11385    unsigned src_count = ctx.args->arg_count;
11386    Pseudo_instruction* ret =
11387       create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11388    ctx.block->instructions.emplace_back(ret);
11389
11390    for (unsigned i = 0; i < src_count; i++) {
11391       enum ac_arg_regfile file = ctx.args->args[i].file;
11392       unsigned size = ctx.args->args[i].size;
11393       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11394       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11395       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11396                                          : Operand(PhysReg{reg}, type);
11397       ret->operands[i] = op;
11398    }
11399
11400    Builder bld(ctx.program, ctx.block);
11401    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11402 }
11403
11404 void
11405 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11406                   const struct ac_shader_args* args)
11407 {
11408    for (unsigned i = 0; i < shader_count; i++) {
11409       if (i) {
11410          ctx.block = ctx.program->create_and_insert_block();
11411          ctx.block->kind = block_kind_top_level | block_kind_resume;
11412       }
11413
11414       nir_shader* nir = shaders[i];
11415       init_context(&ctx, nir);
11416       setup_fp_mode(&ctx, nir);
11417
11418       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11419       append_logical_start(ctx.block);
11420       split_arguments(&ctx, startpgm);
11421       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11422       append_logical_end(ctx.block);
11423       ctx.block->kind |= block_kind_uniform;
11424
11425       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11426        * shader without shader calls.
11427        */
11428       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11429          insert_rt_jump_next(ctx, args);
11430
11431       cleanup_context(&ctx);
11432    }
11433
11434    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11435    finish_program(&ctx);
11436 }
11437
11438 void
11439 pops_await_overlapped_waves(isel_context* ctx)
11440 {
11441    ctx->program->has_pops_overlapped_waves_wait = true;
11442
11443    Builder bld(ctx->program, ctx->block);
11444
11445    if (ctx->program->gfx_level >= GFX11) {
11446       /* GFX11+ - waiting for the export from the overlapped waves.
11447        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11448        */
11449       bld.sopp(aco_opcode::s_wait_event, -1, 0);
11450       return;
11451    }
11452
11453    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11454
11455    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11456
11457    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11458    const Temp did_overlap =
11459       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11460    if_context did_overlap_if_context;
11461    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11462    bld.reset(ctx->block);
11463
11464    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11465    if (ctx->program->gfx_level >= GFX10) {
11466       /* 2 packer ID bits on GFX10-10.3. */
11467       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11468                                       collision, Operand::c32(0x2001c));
11469       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11470       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11471                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
11472       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11473    } else {
11474       /* 1 packer ID bit on GFX9. */
11475       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11476                                       collision, Operand::c32(0x1001c));
11477       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11478        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11479        */
11480       const Temp packer_id_hwreg_bits =
11481          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11482       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11483    }
11484
11485    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11486                                              collision, Operand::c32(0xa0010));
11487    if (ctx->program->gfx_level < GFX10) {
11488       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11489        * actual wave ID by 1 in case of wraparound.
11490        */
11491       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11492                                             collision, Operand::c32(0x3ff));
11493       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11494          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11495       newest_overlapped_wave_id =
11496          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11497                   newest_overlapped_wave_id_wrapped);
11498    }
11499
11500    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11501     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11502     * no more than 1023 values behind the current wave ID.
11503     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11504     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11505     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11506     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11507     * `a - (b + 1)` is `a + ~b`.
11508     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11509     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11510     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11511     * expected.
11512     */
11513    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11514                                         collision, Operand::c32(0x3ff));
11515    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11516                                         newest_overlapped_wave_id, wave_id_offset);
11517
11518    /* Await the overlapped waves. */
11519
11520    loop_context wait_loop_context;
11521    begin_loop(ctx, &wait_loop_context);
11522    bld.reset(ctx->block);
11523
11524    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11525                                            bld.def(s1, scc), wave_id_offset);
11526    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11527     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11528     * exited the ordered section.
11529     */
11530    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11531                                                        newest_overlapped_wave_id, exiting_wave_id);
11532    if_context newest_overlapped_wave_exited_if_context;
11533    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11534                          newest_overlapped_wave_exited);
11535    emit_loop_break(ctx);
11536    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11537    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11538    bld.reset(ctx->block);
11539
11540    /* Sleep before rechecking to let overlapped waves run for some time. */
11541    bld.sopp(aco_opcode::s_sleep, -1, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11542
11543    end_loop(ctx, &wait_loop_context);
11544    bld.reset(ctx->block);
11545
11546    /* Indicate the wait has been done to subsequent compilation stages. */
11547    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11548
11549    begin_uniform_if_else(ctx, &did_overlap_if_context);
11550    end_uniform_if(ctx, &did_overlap_if_context);
11551    bld.reset(ctx->block);
11552 }
11553
11554 static void
11555 create_merged_jump_to_epilog(isel_context* ctx)
11556 {
11557    Builder bld(ctx->program, ctx->block);
11558    std::vector<Operand> regs;
11559
11560    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11561       if (!ctx->args->args[i].preserved)
11562          continue;
11563
11564       const enum ac_arg_regfile file = ctx->args->args[i].file;
11565       const unsigned reg = ctx->args->args[i].offset;
11566
11567       Operand op(ctx->arg_temps[i]);
11568       op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11569       regs.emplace_back(op);
11570    }
11571
11572    Temp continue_pc =
11573       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11574
11575    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11576       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11577    jump->operands[0] = Operand(continue_pc);
11578    for (unsigned i = 0; i < regs.size(); i++) {
11579       jump->operands[i + 1] = regs[i];
11580    }
11581    ctx->block->instructions.emplace_back(std::move(jump));
11582 }
11583
11584 void
11585 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_barrier,
11586               if_context* ic_merged_wave_info, const bool check_merged_wave_info,
11587               const bool endif_merged_wave_info)
11588 {
11589    init_context(&ctx, nir);
11590    setup_fp_mode(&ctx, nir);
11591
11592    Program* program = ctx.program;
11593
11594    if (need_startpgm) {
11595       /* Needs to be after init_context() for FS. */
11596       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11597       append_logical_start(ctx.block);
11598
11599       if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11600          fix_ls_vgpr_init_bug(&ctx);
11601
11602       split_arguments(&ctx, startpgm);
11603
11604       if (!program->info.vs.has_prolog &&
11605           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11606          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11607       }
11608    }
11609
11610    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11611        !program->stage.has(SWStage::GS)) {
11612       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11613        * s_sendmsg(GS_ALLOC_REQ).
11614        */
11615       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11616    }
11617
11618    if (check_merged_wave_info) {
11619       const unsigned i =
11620          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11621       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11622       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11623    }
11624
11625    if (need_barrier) {
11626       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11627                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11628                                   ? scope_subgroup
11629                                   : scope_workgroup;
11630
11631       Builder(ctx.program, ctx.block)
11632          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11633                   scope);
11634    }
11635
11636    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11637    visit_cf_list(&ctx, &func->body);
11638
11639    if (ctx.program->info.has_epilog) {
11640       if (ctx.stage == fragment_fs) {
11641          create_fs_jump_to_epilog(&ctx);
11642
11643          /* FS epilogs always have at least one color/null export. */
11644          ctx.program->has_color_exports = true;
11645          ctx.block->kind |= block_kind_export_end;
11646       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
11647          assert(ctx.stage == tess_control_hs || ctx.stage == vertex_tess_control_hs);
11648          if (ctx.options->is_opengl)
11649             create_tcs_end_for_epilog(&ctx);
11650          else
11651             create_tcs_jump_to_epilog(&ctx);
11652       }
11653    }
11654
11655    if (endif_merged_wave_info) {
11656       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11657       end_divergent_if(&ctx, ic_merged_wave_info);
11658    }
11659
11660    if (ctx.program->info.merged_shader_compiled_separately &&
11661        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11662       assert(program->gfx_level >= GFX9);
11663       create_merged_jump_to_epilog(&ctx);
11664       ctx.block->kind |= block_kind_export_end;
11665    }
11666
11667    cleanup_context(&ctx);
11668 }
11669
11670 void
11671 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
11672 {
11673    if_context ic_merged_wave_info;
11674    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
11675
11676    for (unsigned i = 0; i < shader_count; i++) {
11677       nir_shader* nir = shaders[i];
11678
11679       /* We always need to insert p_startpgm at the beginning of the first shader.  */
11680       const bool need_startpgm = i == 0;
11681
11682       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11683       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11684       const bool empty_shader =
11685          nir_cf_list_is_empty_block(&func->body) &&
11686          ((nir->info.stage == MESA_SHADER_VERTEX &&
11687            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11688           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11689
11690       /* See if we need to emit a check of the merged wave info SGPR. */
11691       const bool check_merged_wave_info =
11692          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11693       const bool endif_merged_wave_info =
11694          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11695
11696       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11697       const bool tcs_skip_barrier =
11698          ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11699
11700       /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
11701       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
11702
11703       select_shader(ctx, nir, need_startpgm, need_barrier, &ic_merged_wave_info,
11704                     check_merged_wave_info, endif_merged_wave_info);
11705
11706       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11707          /* Special handling when TCS input and output patch size is the same.
11708           * Outputs of the previous stage are inputs to the next stage.
11709           */
11710          ctx.inputs = ctx.outputs;
11711          ctx.outputs = shader_io_state();
11712       }
11713    }
11714 }
11715
11716 Temp
11717 get_tess_ring_descriptor(isel_context* ctx, const struct aco_tcs_epilog_info* einfo,
11718                          bool is_tcs_factor_ring)
11719 {
11720    Builder bld(ctx->program, ctx->block);
11721
11722    if (!ctx->options->is_opengl) {
11723       Temp ring_offsets = get_arg(ctx, ctx->args->ring_offsets);
11724       uint32_t tess_ring_offset =
11725          is_tcs_factor_ring ? 5 /* RING_HS_TESS_FACTOR */ : 6 /* RING_HS_TESS_OFFCHIP */;
11726       return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ring_offsets,
11727                       Operand::c32(tess_ring_offset * 16u));
11728    }
11729
11730    Temp addr = get_arg(ctx, einfo->tcs_out_lds_layout);
11731    /* TCS only receives high 13 bits of the address. */
11732    addr = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), addr,
11733                    Operand::c32(0xfff80000));
11734
11735    if (is_tcs_factor_ring) {
11736       addr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr,
11737                       Operand::c32(einfo->tess_offchip_ring_size));
11738    }
11739
11740    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
11741                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
11742
11743    if (ctx->options->gfx_level >= GFX11) {
11744       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
11745                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
11746    } else if (ctx->options->gfx_level >= GFX10) {
11747       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
11748                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
11749    } else {
11750       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
11751                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
11752    }
11753
11754    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr,
11755                      Operand::c32(ctx->options->address32_hi), Operand::c32(0xffffffff),
11756                      Operand::c32(rsrc3));
11757 }
11758
11759 void
11760 store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp factors[],
11761                                unsigned factor_comps, Temp sbase, Temp voffset, Temp num_patches,
11762                                unsigned patch_offset)
11763 {
11764    Builder bld(ctx->program, ctx->block);
11765
11766    Temp soffset = sbase;
11767    if (patch_offset) {
11768       Temp offset =
11769          bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, Operand::c32(patch_offset));
11770       soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), soffset, offset);
11771    }
11772
11773    Temp data = factor_comps == 1
11774                   ? factors[0]
11775                   : create_vec_from_array(ctx, factors, factor_comps, RegType::vgpr, 4);
11776
11777    emit_single_mubuf_store(ctx, tess_ring_desc, voffset, soffset, Temp(), data, 0,
11778                            memory_sync_info(storage_vmem_output), true, false, false);
11779 }
11780
11781 Temp
11782 build_fast_udiv_nuw(isel_context* ctx, Temp num, Temp multiplier, Temp pre_shift, Temp post_shift,
11783                     Temp increment)
11784 {
11785    Builder bld(ctx->program, ctx->block);
11786
11787    num = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), pre_shift, num);
11788    num = bld.nuw().vadd32(bld.def(v1), num, increment);
11789    num = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), num, multiplier);
11790    return bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), post_shift, num);
11791 }
11792
11793 Temp
11794 get_gl_vs_prolog_vertex_index(isel_context* ctx, const struct aco_gl_vs_prolog_info* vinfo,
11795                               unsigned input_index, Temp instance_divisor_constbuf)
11796 {
11797    bool divisor_is_one = vinfo->instance_divisor_is_one & (1u << input_index);
11798    bool divisor_is_fetched = vinfo->instance_divisor_is_fetched & (1u << input_index);
11799
11800    Builder bld(ctx->program, ctx->block);
11801
11802    Temp index;
11803    if (divisor_is_one) {
11804       index = get_arg(ctx, ctx->args->instance_id);
11805    } else if (divisor_is_fetched) {
11806       Temp instance_id = get_arg(ctx, ctx->args->instance_id);
11807
11808       Temp udiv_factors = bld.smem(aco_opcode::s_buffer_load_dwordx4, bld.def(s4),
11809                                    instance_divisor_constbuf, Operand::c32(input_index * 16));
11810       emit_split_vector(ctx, udiv_factors, 4);
11811
11812       index = build_fast_udiv_nuw(ctx, instance_id, emit_extract_vector(ctx, udiv_factors, 0, s1),
11813                                   emit_extract_vector(ctx, udiv_factors, 1, s1),
11814                                   emit_extract_vector(ctx, udiv_factors, 2, s1),
11815                                   emit_extract_vector(ctx, udiv_factors, 3, s1));
11816    }
11817
11818    if (divisor_is_one || divisor_is_fetched) {
11819       Temp start_instance = get_arg(ctx, ctx->args->start_instance);
11820       index = bld.vadd32(bld.def(v1), index, start_instance);
11821    } else {
11822       Temp base_vertex = get_arg(ctx, ctx->args->base_vertex);
11823       Temp vertex_id = get_arg(ctx, ctx->args->vertex_id);
11824       index = bld.vadd32(bld.def(v1), base_vertex, vertex_id);
11825    }
11826
11827    return index;
11828 }
11829
11830 } /* end namespace */
11831
11832 void
11833 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11834                ac_shader_config* config, const struct aco_compiler_options* options,
11835                const struct aco_shader_info* info, const struct ac_shader_args* args)
11836 {
11837    isel_context ctx =
11838       setup_isel_context(program, shader_count, shaders, config, options, info, args);
11839
11840    if (ctx.stage == raytracing_cs)
11841       return select_program_rt(ctx, shader_count, shaders, args);
11842
11843    if (shader_count >= 2) {
11844       select_program_merged(ctx, shader_count, shaders);
11845    } else {
11846       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
11847       if_context ic_merged_wave_info;
11848
11849       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
11850       if (ctx.program->info.merged_shader_compiled_separately) {
11851          assert(ctx.program->gfx_level >= GFX9);
11852          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
11853             check_merged_wave_info = endif_merged_wave_info = true;
11854          } else {
11855             const bool ngg_gs =
11856                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
11857             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
11858             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
11859             need_barrier = !ngg_gs;
11860          }
11861       }
11862
11863       select_shader(ctx, shaders[0], true, need_barrier, &ic_merged_wave_info,
11864                     check_merged_wave_info, endif_merged_wave_info);
11865    }
11866
11867    program->config->float_mode = program->blocks[0].fp_mode.val;
11868
11869    append_logical_end(ctx.block);
11870    ctx.block->kind |= block_kind_uniform;
11871
11872    if (!ctx.program->info.has_epilog ||
11873        (shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL &&
11874         options->gfx_level >= GFX9)) {
11875       Builder bld(ctx.program, ctx.block);
11876       bld.sopp(aco_opcode::s_endpgm);
11877    }
11878
11879    finish_program(&ctx);
11880 }
11881
11882 void
11883 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11884                            const struct aco_compiler_options* options,
11885                            const struct aco_shader_info* info, const struct ac_shader_args* args)
11886 {
11887    assert(options->gfx_level == GFX8);
11888
11889    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
11890                 config);
11891
11892    isel_context ctx = {};
11893    ctx.program = program;
11894    ctx.args = args;
11895    ctx.options = options;
11896    ctx.stage = program->stage;
11897
11898    ctx.block = ctx.program->create_and_insert_block();
11899    ctx.block->kind = block_kind_top_level;
11900
11901    program->workgroup_size = 1; /* XXX */
11902
11903    add_startpgm(&ctx);
11904    append_logical_start(ctx.block);
11905
11906    Builder bld(ctx.program, ctx.block);
11907
11908    /* Load the buffer descriptor from TMA. */
11909    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11910             Operand::zero());
11911
11912    /* Store TTMP0-TTMP1. */
11913    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11914             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
11915
11916    uint32_t hw_regs_idx[] = {
11917       2, /* HW_REG_STATUS */
11918       3, /* HW_REG_TRAP_STS */
11919       4, /* HW_REG_HW_ID */
11920       7, /* HW_REG_IB_STS */
11921    };
11922
11923    /* Store some hardware registers. */
11924    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11925       /* "((size - 1) << 11) | register" */
11926       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11927                ((20 - 1) << 11) | hw_regs_idx[i]);
11928
11929       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11930                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11931    }
11932
11933    program->config->float_mode = program->blocks[0].fp_mode.val;
11934
11935    append_logical_end(ctx.block);
11936    ctx.block->kind |= block_kind_uniform;
11937    bld.sopp(aco_opcode::s_endpgm);
11938
11939    finish_program(&ctx);
11940 }
11941
11942 Operand
11943 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
11944 {
11945    enum ac_arg_regfile file = args->args[arg.arg_index].file;
11946    unsigned size = args->args[arg.arg_index].size;
11947    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11948    return Operand(get_arg_reg(args, arg), rc);
11949 }
11950
11951 unsigned
11952 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11953 {
11954    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11955
11956    unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
11957    if (bld.program->gfx_level >= GFX10 && num_loads > 1)
11958       bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11959
11960    for (unsigned i = 0; i < count;) {
11961       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11962
11963       if (size == 4)
11964          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11965                   Operand::c32((start + i) * 16u));
11966       else if (size == 2)
11967          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11968                   Operand::c32((start + i) * 16u));
11969       else
11970          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11971                   Operand::c32((start + i) * 16u));
11972
11973       dest = dest.advance(size * 16u);
11974       i += size;
11975    }
11976
11977    return count;
11978 }
11979
11980 Operand
11981 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
11982                             const struct aco_vs_prolog_info* pinfo, unsigned index,
11983                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11984                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11985 {
11986    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11987             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
11988
11989    wait_imm lgkm_imm;
11990    lgkm_imm.lgkm = 0;
11991    bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level));
11992
11993    Definition fetch_index_def(tmp_vgpr0, v1);
11994    Operand fetch_index(tmp_vgpr0, v1);
11995
11996    Operand div_info(tmp_sgpr, s1);
11997    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
11998       /* use SDWA */
11999       if (bld.program->gfx_level < GFX9) {
12000          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12001          div_info = Operand(tmp_vgpr1, v1);
12002       }
12003
12004       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12005
12006       Instruction* instr;
12007       if (bld.program->gfx_level >= GFX9)
12008          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12009       else
12010          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12011                                div_info, fetch_index)
12012                     .instr;
12013       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12014
12015       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12016                fetch_index);
12017
12018       instr =
12019          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12020       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12021    } else {
12022       Operand tmp_op(tmp_vgpr1, v1);
12023       Definition tmp_def(tmp_vgpr1, v1);
12024
12025       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12026
12027       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12028       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12029
12030       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12031                Operand(tmp_sgpr.advance(4), s1));
12032
12033       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12034       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12035    }
12036
12037    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12038
12039    return fetch_index;
12040 }
12041
/* Builds the ray tracing prolog.
 *
 * The prolog is a single top-level block operating on fixed physical registers. It
 * receives its arguments in the in_args layout and leaves them in the out_args layout:
 * it loads the raygen SBT pointer and, through it, the raygen shader address, loads the
 * ray launch sizes, computes the launch IDs from the workgroup and local invocation IDs,
 * sets the stack pointer and the shader record pointer, and finally jumps to the raygen
 * shader with s_setpc_b64.
 *
 * program:  program to fill; its config receives float_mode and the register counts
 * options:  gfx_level selects the scratch setup and the launch ID/record pointer code
 * info:     workgroup_size is used for both the workgroup size and the wave size
 * in_args:  argument layout on entry of the prolog
 * out_args: argument layout expected on exit of the prolog
 *
 * Several inputs and outputs share registers (see the asserts below), which is why the
 * order of the emitted instructions matters.
 */
12042 void
12043 select_rt_prolog(Program* program, ac_shader_config* config,
12044                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12045                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12046 {
12047    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12048                 config);
12049    Block* block = program->create_and_insert_block();
12050    block->kind = block_kind_top_level;
12051    program->workgroup_size = info->workgroup_size;
         /* NOTE(review): the wave size is taken from the workgroup size, presumably because
          * the prolog runs with exactly one wave per workgroup - confirm. */
12052    program->wave_size = info->workgroup_size;
12053    calc_min_waves(program);
12054    Builder bld(program, block);
12055    block->instructions.reserve(32);
         /* The prolog must cover the registers of both argument layouts. */
12056    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12057    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12058
12059    /* Inputs:
12060     * Ring offsets:                s[0-1]
12061     * Indirect descriptor sets:    s[2]
12062     * Push constants pointer:      s[3]
12063     * SBT descriptors:             s[4-5]
12064     * Traversal shader address:    s[6-7]
12065     * Ray launch size address:     s[8-9]
12066     * Dynamic callable stack base: s[10]
12067     * Workgroup IDs (xyz):         s[11], s[12], s[13]
12068     * Scratch offset:              s[14]
12069     * Local invocation IDs:        v[0-2]
12070     */
12071    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12072    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12073    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12074    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12075    PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12076    PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12077    PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
         /* The scratch offset argument is only looked up before GFX11; it is also only read
          * on the gfx_level < GFX11 paths below. */
12078    PhysReg in_scratch_offset;
12079    if (options->gfx_level < GFX11)
12080       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
         /* Only the x and y local IDs are used: the register of local_invocation_ids and
          * the VGPR following it. */
12081    PhysReg in_local_ids[2] = {
12082       get_arg_reg(in_args, in_args->local_invocation_ids),
12083       get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12084    };
12085
12086    /* Outputs:
12087     * Callee shader PC:            s[0-1]
12088     * Indirect descriptor sets:    s[2]
12089     * Push constants pointer:      s[3]
12090     * SBT descriptors:             s[4-5]
12091     * Traversal shader address:    s[6-7]
12092     * Ray launch sizes (xyz):      s[8], s[9], s[10]
12093     * Scratch offset (<GFX9 only): s[11]
12094     * Ring offsets (<GFX9 only):   s[12-13]
12095     * Ray launch IDs:              v[0-2]
12096     * Stack pointer:               v[3]
12097     * Shader VA:                   v[4-5]
12098     * Shader Record Ptr:           v[6-7]
12099     */
12100    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12101    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
         /* advance() takes bytes: z is the third dword of the launch size. */
12102    PhysReg out_launch_size_z = out_launch_size_x.advance(8);
12103    PhysReg out_launch_ids[3];
12104    for (unsigned i = 0; i < 3; i++)
12105       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
12106    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12107    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12108
12109    /* Temporaries: */
         /* Two SGPR pairs are reserved right after the (2-aligned) argument SGPRs: the
          * first for the raygen SBT pointer, the second for a copy of the ring offsets. */
12110    num_sgprs = align(num_sgprs, 2) + 4;
12111    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs - 4};
12112    PhysReg tmp_ring_offsets = PhysReg{num_sgprs - 2};
12113
12114    /* Confirm some assumptions about register aliasing */
12115    assert(in_ring_offsets == out_uniform_shader_addr);
12116    assert(get_arg_reg(in_args, in_args->push_constants) ==
12117           get_arg_reg(out_args, out_args->push_constants));
12118    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12119           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12120    assert(in_launch_size_addr == out_launch_size_x);
12121    assert(in_stack_base == out_launch_size_z);
12122    assert(in_local_ids[0] == out_launch_ids[0]);
12123
12124    /* load raygen sbt */
12125    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12126             Operand::c32(0u));
12127
12128    /* init scratch */
12129    if (options->gfx_level < GFX9) {
12130       /* copy ring offsets to temporary location*/
12131       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12132                Operand(in_ring_offsets, s2));
12133    } else if (options->gfx_level < GFX11) {
12134       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12135                       Operand(in_scratch_offset, s1));
12136    }
12137
12138    /* set stack ptr */
         /* Emitted before the launch size loads: in_stack_base shares its register with
          * out_launch_size_z (asserted above). */
12139    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12140
12141    /* load raygen address */
12142    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12143             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12144
12145    /* load ray launch sizes */
         /* z is loaded first: the x/y load overwrites in_launch_size_addr, which shares
          * its registers with out_launch_size_x (asserted above). */
12146    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12147             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12148    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12149             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12150
12151    /* calculate ray launch ids */
12152    if (options->gfx_level >= GFX11) {
12153       /* Thread IDs are packed in VGPR0, 10 bits per component. */
            /* Only 3 bits of each component are extracted (y from bit 10, x from bit 0). */
12154       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12155                Operand::c32(10u), Operand::c32(3u));
12156       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12157                Operand(in_local_ids[0], v1));
12158    }
12159    /* Do this backwards to reduce some RAW hazards on GFX11+ */
         /* launch_id.z = wg_id.z
          * launch_id.y = wg_id.y * (4 if the workgroup size is 32, else 8) + local_id.y
          * launch_id.x = wg_id.x * 8 + local_id.x */
12160    bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12161    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1),
12162             Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1));
12163    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12164             Operand::c32(8), Operand(in_local_ids[0], v1));
12165
12166    if (options->gfx_level < GFX9) {
12167       /* write scratch/ring offsets to outputs, if needed */
12168       bld.sop1(aco_opcode::s_mov_b32,
12169                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12170                Operand(in_scratch_offset, s1));
12171       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12172                Operand(tmp_ring_offsets, s2));
12173    }
12174
12175    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
         /* Only the low dword gets the 32 added; the high dword is copied unchanged, so no
          * carry is propagated into it. */
12176    if (options->gfx_level < GFX9) {
12177       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12178                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12179    } else {
12180       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12181                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12182    }
12183    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12184             Operand(tmp_raygen_sbt.advance(4), s1));
12185
12186    /* jump to raygen */
12187    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12188
12189    program->config->float_mode = program->blocks[0].fp_mode.val;
12190    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12191    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12192 }
12193
/* Builds the vertex shader prolog.
 *
 * The prolog is a single top-level block operating on fixed physical registers. It
 * loads the vertex buffer descriptors, computes the fetch index of every attribute
 * (vertex index, instance index, start instance, or a divided instance index for
 * non-trivial divisors), loads each attribute into four consecutive VGPRs located
 * after the VGPRs used by the shader arguments, applies the alpha channel fixup where
 * requested, and finally jumps to the address taken from pinfo->inputs (or loaded from
 * it when non-trivial divisors are present).
 *
 * program: program to fill; its config receives float_mode and the register counts
 * pinfo:   describes the attributes (count, formats, divisors, misalignment, ...)
 * options: gfx_level and address32_hi are used
 * args:    argument layout shared with the main shader
 */
12194 void
12195 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12196                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12197                  const struct ac_shader_args* args)
12198 {
12199    assert(pinfo->num_attributes > 0);
12200
12201    /* This should be enough for any shader/stage. */
12202    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12203
12204    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12205                 config);
12206    program->dev.vgpr_limit = 256;
12207
12208    Block* block = program->create_and_insert_block();
12209    block->kind = block_kind_top_level;
12210
12211    program->workgroup_size = 64;
12212    calc_min_waves(program);
12213
12214    Builder bld(program, block);
12215
12216    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12217
12218    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12219
12220    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12221    bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask;
12222
         /* Wait for all outstanding LGKM operations; used after the SMEM loads below. */
12223    wait_imm lgkm_imm;
12224    lgkm_imm.lgkm = 0;
12225
12226    /* choose sgprs */
         /* vertex_buffers: 64-bit address of the descriptor array, placed above the user
          * SGPRs. prolog_input: SGPR pair used for the divisor data and the continue PC when
          * non-trivial divisors are present. desc: 4-aligned start of the loaded descriptors,
          * after whichever of the two pairs is the last one in use. */
12227    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12228    PhysReg prolog_input = vertex_buffers.advance(8);
12229    PhysReg desc(
12230       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12231
12232    Operand start_instance = get_arg_fixed(args, args->start_instance);
12233    Operand instance_id = get_arg_fixed(args, args->instance_id);
12234
         /* VGPRs are numbered from 256; attributes start right after the argument VGPRs. */
12235    PhysReg attributes_start(256 + args->num_vgprs_used);
12236    /* choose vgprs that won't be used for anything else until the last attribute load */
         /* The first four alias the registers of the last attribute; nontrivial_tmp_vgpr1 is
          * the register following the attributes. */
12237    PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
12238    PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
12239    PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
12240    PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
12241    PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
12242
         /* Build the 64-bit descriptor array address: low dword from the vertex_buffers
          * argument, high dword from address32_hi. The first branch is taken when
          * address32_hi equals the sign extension of its low 16 bits, in which case only
          * those 16 bits are passed to s_movk_i32. */
12243    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12244             get_arg_fixed(args, args->vertex_buffers));
12245    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
12246       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12247                options->address32_hi & 0xFFFF);
12248    } else {
12249       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12250                Operand::c32((unsigned)options->address32_hi));
12251    }
12252
12253    /* calculate vgpr requirements */
12254    unsigned num_vgprs = attributes_start.reg() - 256;
12255    num_vgprs += pinfo->num_attributes * 4;
12256    if (has_nontrivial_divisors && program->gfx_level <= GFX8)
12257       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
12258    unsigned num_sgprs = 0;
12259
         /* NOTE(review): the format table is always the GFX8/Polaris10 one, independent of
          * the target - presumably because misaligned_mask is only set where that table
          * applies; confirm against the caller. */
12260    const struct ac_vtx_format_info* vtx_info_table =
12261       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12262
         /* Each iteration loads as many descriptors as load_vb_descs() can fit and then
          * emits the attribute loads using them; loc is advanced by the inner loop. */
12263    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12264       unsigned num_descs =
12265          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12266       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12267
12268       if (loc == 0) {
12269          /* perform setup while we load the descriptors */
12270          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
                  /* Initialize exec from merged_wave_info with s_bfm_b64; on wave64, exec
                   * is replaced by all ones when bit 6 of the count is set. */
12271             Operand count = get_arg_fixed(args, args->merged_wave_info);
12272             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
12273             if (program->wave_size == 64) {
12274                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12275                         Operand::c32(6u /* log2(64) */));
12276                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12277                         Operand(exec, s2), Operand(scc, s1));
12278             }
12279          }
12280
               /* Only compute the index registers that some attribute actually uses. */
12281          bool needs_instance_index = false;
12282          bool needs_start_instance = false;
12283          u_foreach_bit (i, pinfo->state.instance_rate_inputs & attrib_mask) {
12284             needs_instance_index |= pinfo->state.divisors[i] == 1;
12285             needs_start_instance |= pinfo->state.divisors[i] == 0;
12286          }
12287          bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask;
12288          if (needs_vertex_index)
12289             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12290                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12291          if (needs_instance_index)
12292             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12293                        Operand(s2), true);
12294          if (needs_start_instance)
12295             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12296       }
12297
            /* The descriptors must have arrived before the buffer loads use them. */
12298       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12299
            /* i counts descriptors of the current batch, loc counts attribute locations. */
12300       for (unsigned i = 0; i < num_descs;) {
12301          PhysReg dest(attributes_start.reg() + loc * 4u);
12302
12303          /* calculate index */
               /* Per-vertex attributes use vertex_index. Instanced attributes use
                * start_instance (divisor 0), instance_index (divisor 1) or the result of
                * calc_nontrivial_instance_id() (non-trivial divisor). */
12304          Operand fetch_index = Operand(vertex_index, v1);
12305          if (pinfo->state.instance_rate_inputs & (1u << loc)) {
12306             uint32_t divisor = pinfo->state.divisors[loc];
12307             if (divisor) {
                     /* NOTE(review): this assignment is overwritten by both branches below. */
12308                fetch_index = instance_id;
12309                if (pinfo->state.nontrivial_divisors & (1u << loc)) {
                        /* Position of this attribute among the non-trivial divisors. */
12310                   unsigned index =
12311                      util_bitcount(pinfo->state.nontrivial_divisors & BITFIELD_MASK(loc));
12312                   fetch_index = calc_nontrivial_instance_id(
12313                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12314                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12315                } else {
12316                   fetch_index = Operand(instance_index, v1);
12317                }
12318             } else {
12319                fetch_index = Operand(start_instance_vgpr, v1);
12320             }
12321          }
12322
12323          /* perform load */
12324          PhysReg cur_desc = desc.advance(i * 16);
12325          if ((pinfo->misaligned_mask & (1u << loc))) {
                  /* Misaligned attribute: load it channel by channel. */
12326             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->state.formats[loc]];
12327
12328             assert(vtx_info->has_hw_format & 0x1);
12329             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12330             unsigned nfmt = vtx_info->hw_format[0] >> 4;
12331
12332             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
                     /* With post_shuffle, channels 0 and 2 are read from swapped positions. */
12333                bool post_shuffle = pinfo->state.post_shuffle & (1u << loc);
12334                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12335
12336                /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec
12337                 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12338                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12339                 * care).
12340                 */
12341                if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
12342                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12343                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12344                             false, true);
12345                else if (vtx_info->chan_byte_size == 8)
12346                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12347                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12348                             fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
12349                else
12350                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12351                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
12352                             offset, false, true);
12353             }
                  /* Default for a missing 4th channel: integer 1 for UINT/SINT formats,
                   * otherwise 1.0f (0x3f800000). Missing channels 1 and 2 default to 0. */
12354             uint32_t one =
12355                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12356                   ? 1u
12357                   : 0x3f800000u;
12358             /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
12359              * For 64-bit data types, no default attribute values are provided. Input variables must
12360              * not use more components than provided by the attribute.
12361              */
12362             for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
12363                bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12364                         Operand::c32(j == 3 ? one : 0u));
12365             }
12366
                  /* 64-bit formats with more than two channels advance by two locations
                   * (and two descriptors). */
12367             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12368             loc += slots;
12369             i += slots;
12370          } else {
12371             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12372                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12373             loc++;
12374             i++;
12375          }
12376       }
12377    }
12378
         /* The alpha fixup below reads the loaded attributes, so wait for the VMEM loads. */
12379    if (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi) {
12380       wait_imm vm_imm;
12381       vm_imm.vm = 0;
12382       bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level));
12383    }
12384
12385    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
12386     * so we may need to fix it up. */
12387    u_foreach_bit (loc, (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi)) {
            /* The alpha channel is the 4th VGPR of the attribute. */
12388       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12389
            /* The 2-bit adjust mode is split over two bitmasks: lo holds bit 0, hi bit 1. */
12390       unsigned alpha_adjust = (pinfo->state.alpha_adjust_lo >> loc) & 0x1;
12391       alpha_adjust |= ((pinfo->state.alpha_adjust_hi >> loc) & 0x1) << 1;
12392
12393       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
12394          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12395
12396       /* For the integer-like cases, do a natural sign extension.
12397        *
12398        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12399        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12400        * exponent.
12401        */
12402       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
12403       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12404                Operand::c32(offset), Operand::c32(2u));
12405
12406       /* Convert back to the right type. */
12407       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
               /* 0xbf800000 is -1.0f: the converted value is clamped to be at least -1.0. */
12408          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12409          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12410                   Operand(alpha, v1));
12411       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
12412          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12413       }
12414    }
12415
12416    block->kind |= block_kind_uniform;
12417
12418    /* continue on to the main shader */
         /* Without non-trivial divisors, the inputs argument itself is the jump target.
          * Otherwise it is a pointer, and the target is loaded from offset 0 behind it. */
12419    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
12420    if (has_nontrivial_divisors) {
12421       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12422                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
12423       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12424       continue_pc = Operand(prolog_input, s2);
12425    }
12426
12427    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12428
12429    program->config->float_mode = program->blocks[0].fp_mode.val;
12430    /* addition on GFX6-8 requires a carry-out (we use VCC) */
12431    program->needs_vcc = program->gfx_level <= GFX8;
12432    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
12433    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12434 }
12435
12436 void
12437 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
12438                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12439                  const struct ac_shader_args* args)
12440 {
12441    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
12442    isel_context ctx =
12443       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
12444
12445    ctx.block->fp_mode = program->next_fp_mode;
12446
12447    add_startpgm(&ctx);
12448    append_logical_start(ctx.block);
12449
12450    Builder bld(ctx.program, ctx.block);
12451
12452    /* Export all color render targets */
12453    struct aco_export_mrt mrts[8];
12454    uint8_t exported_mrts = 0;
12455
12456    for (unsigned i = 0; i < 8; i++) {
12457       unsigned col_format = (einfo->spi_shader_col_format >> (i * 4)) & 0xf;
12458
12459       if (col_format == V_028714_SPI_SHADER_ZERO)
12460          continue;
12461
12462       struct mrt_color_export out;
12463
12464       out.slot = i;
12465       out.write_mask = 0xf;
12466       out.col_format = col_format;
12467       out.is_int8 = (einfo->color_is_int8 >> i) & 1;
12468       out.is_int10 = (einfo->color_is_int10 >> i) & 1;
12469       out.enable_mrt_output_nan_fixup = (options->enable_mrt_output_nan_fixup >> i) & 1;
12470
12471       Temp inputs = get_arg(&ctx, einfo->inputs[i]);
12472       emit_split_vector(&ctx, inputs, 4);
12473       for (unsigned c = 0; c < 4; ++c) {
12474          out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
12475       }
12476
12477       if (export_fs_mrt_color(&ctx, &out, &mrts[i])) {
12478          exported_mrts |= 1 << i;
12479       }
12480    }
12481
12482    if (exported_mrts) {
12483       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
12484          struct aco_export_mrt* mrt0 = (exported_mrts & BITFIELD_BIT(0)) ? &mrts[0] : NULL;
12485          struct aco_export_mrt* mrt1 = (exported_mrts & BITFIELD_BIT(1)) ? &mrts[1] : NULL;
12486          create_fs_dual_src_export_gfx11(&ctx, mrt0, mrt1);
12487       } else {
12488          u_foreach_bit (i, exported_mrts) {
12489             export_mrt(&ctx, &mrts[i]);
12490          }
12491       }
12492    } else {
12493       create_fs_null_export(&ctx);
12494    }
12495
12496    program->config->float_mode = program->blocks[0].fp_mode.val;
12497
12498    append_logical_end(ctx.block);
12499    ctx.block->kind |= block_kind_export_end;
12500    bld.reset(ctx.block);
12501    bld.sopp(aco_opcode::s_endpgm);
12502
12503    finish_program(&ctx);
12504 }
12505
12506 void
12507 select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
12508                   const struct aco_compiler_options* options, const struct aco_shader_info* info,
12509                   const struct ac_shader_args* args)
12510 {
12511    const struct aco_tcs_epilog_info* einfo = (const struct aco_tcs_epilog_info*)pinfo;
12512    isel_context ctx =
12513       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::TCS);
12514
12515    ctx.block->fp_mode = program->next_fp_mode;
12516
12517    add_startpgm(&ctx);
12518    append_logical_start(ctx.block);
12519
12520    Builder bld(ctx.program, ctx.block);
12521
12522    /* Add a barrier before loading tess factors from LDS. */
12523    if (!einfo->pass_tessfactors_by_reg) {
12524       /* To generate s_waitcnt lgkmcnt(0) when waitcnt insertion. */
12525       program->pending_lds_access = true;
12526
12527       sync_scope scope = einfo->tcs_out_patch_fits_subgroup ? scope_subgroup : scope_workgroup;
12528       bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
12529                   scope);
12530    }
12531
12532    Temp invocation_id = get_arg(&ctx, einfo->invocation_id);
12533
12534    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), invocation_id);
12535
12536    if_context ic_invoc_0;
12537    begin_divergent_if_then(&ctx, &ic_invoc_0, cond);
12538
12539    int outer_comps, inner_comps;
12540    switch (einfo->primitive_mode) {
12541    case TESS_PRIMITIVE_ISOLINES:
12542       outer_comps = 2;
12543       inner_comps = 0;
12544       break;
12545    case TESS_PRIMITIVE_TRIANGLES:
12546       outer_comps = 3;
12547       inner_comps = 1;
12548       break;
12549    case TESS_PRIMITIVE_QUADS:
12550       outer_comps = 4;
12551       inner_comps = 2;
12552       break;
12553    default: unreachable("invalid primitive mode"); return;
12554    }
12555
12556    bld.reset(ctx.block);
12557
12558    unsigned tess_lvl_out_loc =
12559       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER) * 16;
12560    unsigned tess_lvl_in_loc =
12561       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER) * 16;
12562
12563    Temp outer[4];
12564    Temp inner[2];
12565    if (einfo->pass_tessfactors_by_reg) {
12566       for (int i = 0; i < outer_comps; i++)
12567          outer[i] = get_arg(&ctx, einfo->tess_lvl_out[i]);
12568
12569       for (int i = 0; i < inner_comps; i++)
12570          inner[i] = get_arg(&ctx, einfo->tess_lvl_in[i]);
12571    } else {
12572       Temp addr = get_arg(&ctx, einfo->tcs_out_current_patch_data_offset);
12573       addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr);
12574
12575       Temp data = program->allocateTmp(RegClass(RegType::vgpr, outer_comps));
12576       load_lds(&ctx, 4, outer_comps, data, addr, tess_lvl_out_loc, 4);
12577       for (int i = 0; i < outer_comps; i++)
12578          outer[i] = emit_extract_vector(&ctx, data, i, v1);
12579
12580       if (inner_comps) {
12581          data = program->allocateTmp(RegClass(RegType::vgpr, inner_comps));
12582          load_lds(&ctx, 4, inner_comps, data, addr, tess_lvl_in_loc, 4);
12583          for (int i = 0; i < inner_comps; i++)
12584             inner[i] = emit_extract_vector(&ctx, data, i, v1);
12585       }
12586    }
12587
12588    Temp tess_factor_ring_desc = get_tess_ring_descriptor(&ctx, einfo, true);
12589    Temp tess_factor_ring_base = get_arg(&ctx, args->tcs_factor_offset);
12590    Temp rel_patch_id = get_arg(&ctx, einfo->rel_patch_id);
12591    unsigned tess_factor_ring_const_offset = 0;
12592
12593    if (program->gfx_level <= GFX8) {
12594       /* Store the dynamic HS control word. */
12595       cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), rel_patch_id);
12596
12597       if_context ic_patch_0;
12598       begin_divergent_if_then(&ctx, &ic_patch_0, cond);
12599
12600       bld.reset(ctx.block);
12601
12602       Temp data = bld.copy(bld.def(v1), Operand::c32(0x80000000u));
12603
12604       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, Temp(0, v1), tess_factor_ring_base,
12605                               Temp(), data, 0, memory_sync_info(), true, false, false);
12606
12607       tess_factor_ring_const_offset += 4;
12608
12609       begin_divergent_if_else(&ctx, &ic_patch_0);
12610       end_divergent_if(&ctx, &ic_patch_0);
12611    }
12612
12613    bld.reset(ctx.block);
12614
12615    Temp tess_factor_ring_offset =
12616       bld.v_mul_imm(bld.def(v1), rel_patch_id, (inner_comps + outer_comps) * 4, false);
12617
12618    switch (einfo->primitive_mode) {
12619    case TESS_PRIMITIVE_ISOLINES: {
12620       /* For isolines, the hardware expects tess factors in the reverse order. */
12621       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), outer[1], outer[0]);
12622       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12623                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12624                               memory_sync_info(), true, false, false);
12625       break;
12626    }
12627    case TESS_PRIMITIVE_TRIANGLES: {
12628       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12629                              inner[0]);
12630       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12631                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12632                               memory_sync_info(), true, false, false);
12633       break;
12634    }
12635    case TESS_PRIMITIVE_QUADS: {
12636       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12637                              outer[3]);
12638       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12639                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12640                               memory_sync_info(), true, false, false);
12641
12642       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inner[0], inner[1]);
12643       emit_single_mubuf_store(
12644          &ctx, tess_factor_ring_desc, tess_factor_ring_offset, tess_factor_ring_base, Temp(), data,
12645          tess_factor_ring_const_offset + 16, memory_sync_info(), true, false, false);
12646       break;
12647    }
12648    default: unreachable("invalid primitive mode"); break;
12649    }
12650
12651    if (einfo->tes_reads_tessfactors) {
12652       Temp layout = get_arg(&ctx, einfo->tcs_offchip_layout);
12653       Temp num_patches, patch_base;
12654
12655       if (ctx.options->is_opengl) {
12656          num_patches = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), layout,
12657                                 Operand::c32(0x3f));
12658          num_patches = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), num_patches,
12659                                 Operand::c32(1));
12660
12661          patch_base = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), layout,
12662                                Operand::c32(16));
12663       } else {
12664          num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), layout,
12665                                 Operand::c32(0x60006));
12666
12667          patch_base = get_arg(&ctx, einfo->patch_base);
12668       }
12669
12670       Temp tess_ring_desc = get_tess_ring_descriptor(&ctx, einfo, false);
12671       Temp tess_ring_base = get_arg(&ctx, args->tess_offchip_offset);
12672
12673       Temp sbase =
12674          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), tess_ring_base, patch_base);
12675
12676       Temp voffset =
12677          bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4), rel_patch_id);
12678
12679       store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, outer, outer_comps, sbase, voffset,
12680                                      num_patches, tess_lvl_out_loc);
12681
12682       if (inner_comps) {
12683          store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, inner, inner_comps, sbase, voffset,
12684                                         num_patches, tess_lvl_in_loc);
12685       }
12686    }
12687
12688    begin_divergent_if_else(&ctx, &ic_invoc_0);
12689    end_divergent_if(&ctx, &ic_invoc_0);
12690
12691    program->config->float_mode = program->blocks[0].fp_mode.val;
12692
12693    append_logical_end(ctx.block);
12694
12695    bld.reset(ctx.block);
12696    bld.sopp(aco_opcode::s_endpgm);
12697
12698    finish_program(&ctx);
12699 }
12700
12701 void
12702 select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config,
12703                     const struct aco_compiler_options* options, const struct aco_shader_info* info,
12704                     const struct ac_shader_args* args)
12705 {
12706    const struct aco_gl_vs_prolog_info* vinfo = (const struct aco_gl_vs_prolog_info*)pinfo;
12707    isel_context ctx =
12708       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::VS);
12709
12710    ctx.block->fp_mode = program->next_fp_mode;
12711
12712    add_startpgm(&ctx);
12713    append_logical_start(ctx.block);
12714
12715    Builder bld(ctx.program, ctx.block);
12716
12717    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12718
12719    if (vinfo->as_ls && options->has_ls_vgpr_init_bug)
12720       fix_ls_vgpr_init_bug(&ctx);
12721
12722    std::vector<Operand> regs;
12723    passthrough_all_args(&ctx, regs);
12724
12725    Temp instance_divisor_constbuf;
12726
12727    if (vinfo->instance_divisor_is_fetched) {
12728       Temp list = get_arg(&ctx, vinfo->internal_bindings);
12729       list = convert_pointer_to_64_bit(&ctx, list);
12730
12731       instance_divisor_constbuf = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12732                                            Operand::c32(vinfo->instance_diviser_buf_offset));
12733    }
12734
12735    unsigned vgpr = 256 + ctx.args->num_vgprs_used;
12736
12737    for (unsigned i = 0; i < vinfo->num_inputs; i++) {
12738       Temp index = get_gl_vs_prolog_vertex_index(&ctx, vinfo, i, instance_divisor_constbuf);
12739       regs.emplace_back(Operand(index, PhysReg{vgpr + i}));
12740    }
12741
12742    program->config->float_mode = program->blocks[0].fp_mode.val;
12743
12744    append_logical_end(ctx.block);
12745
12746    build_end_with_regs(&ctx, regs);
12747
12748    finish_program(&ctx);
12749 }
12750
12751 } // namespace aco