src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include "aco_instruction_selection.h"
  27
  28 #include "aco_builder.h"
  29 #include "aco_interface.h"
  30 #include "aco_ir.h"
  31
  32 #include "common/ac_nir.h"
  33 #include "common/sid.h"
  34
  35 #include "util/fast_idiv_by_const.h"
  36 #include "util/memstream.h"
  37
  38 #include <array>
  39 #include <functional>
  40 #include <map>
  41 #include <numeric>
  42 #include <stack>
  43 #include <utility>
  44 #include <vector>
  45
  46 namespace aco {
  47 namespace {
  48
  49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
  50
  51 static void
  52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
  53           const char* msg)
  54 {
  55    char* out;
  56    size_t outsize;
  57    struct u_memstream mem;
  58    u_memstream_open(&mem, &out, &outsize);
  59    FILE* const memf = u_memstream_get(&mem);
  60
  61    fprintf(memf, "%s: ", msg);
  62    nir_print_instr(instr, memf);
  63    u_memstream_close(&mem);
  64
  65    _aco_err(ctx->program, file, line, out);
  66    free(out);
  67 }
  68
  69 struct if_context {
  70    Temp cond;
  71
  72    bool divergent_old;
  73    bool exec_potentially_empty_discard_old;
  74    bool exec_potentially_empty_break_old;
  75    bool had_divergent_discard_old;
  76    bool had_divergent_discard_then;
  77    uint16_t exec_potentially_empty_break_depth_old;
  78
  79    unsigned BB_if_idx;
  80    unsigned invert_idx;
  81    bool uniform_has_then_branch;
  82    bool then_branch_divergent;
  83    Block BB_invert;
  84    Block BB_endif;
  85 };
  86
  87 struct loop_context {
  88    Block loop_exit;
  89
  90    unsigned header_idx_old;
  91    Block* exit_old;
  92    bool divergent_cont_old;
  93    bool divergent_branch_old;
  94    bool divergent_if_old;
  95 };
  96
  97 static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
  98
  99 static void
 100 add_logical_edge(unsigned pred_idx, Block* succ)
 101 {
 102    succ->logical_preds.emplace_back(pred_idx);
 103 }
 104
 105 static void
 106 add_linear_edge(unsigned pred_idx, Block* succ)
 107 {
 108    succ->linear_preds.emplace_back(pred_idx);
 109 }
 110
 111 static void
 112 add_edge(unsigned pred_idx, Block* succ)
 113 {
 114    add_logical_edge(pred_idx, succ);
 115    add_linear_edge(pred_idx, succ);
 116 }
 117
 118 static void
 119 append_logical_start(Block* b)
 120 {
 121    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 122 }
 123
 124 static void
 125 append_logical_end(Block* b)
 126 {
 127    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 128 }
 129
 130 Temp
 131 get_ssa_temp(struct isel_context* ctx, nir_def* def)
 132 {
 133    uint32_t id = ctx->first_temp_id + def->index;
 134    return Temp(id, ctx->program->temp_rc[id]);
 135 }
 136
 137 Temp
 138 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
 139 {
 140    Builder bld(ctx->program, ctx->block);
 141    assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
 142    assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
 143
 144    if (ctx->program->wave_size == 32) {
 145       Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
 146       return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
 147    }
 148
 149    Operand mask_lo = Operand::c32(-1u);
 150    Operand mask_hi = Operand::c32(-1u);
 151
 152    if (mask.isTemp()) {
 153       RegClass rc = RegClass(mask.regClass().type(), 1);
 154       Builder::Result mask_split =
 155          bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
 156       mask_lo = Operand(mask_split.def(0).getTemp());
 157       mask_hi = Operand(mask_split.def(1).getTemp());
 158    } else if (mask.physReg() == exec) {
 159       mask_lo = Operand(exec_lo, s1);
 160       mask_hi = Operand(exec_hi, s1);
 161    }
 162
 163    Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
 164
 165    if (ctx->program->gfx_level <= GFX7)
 166       return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
 167    else
 168       return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
 169 }
 170
 171 Temp
 172 emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
 173 {
 174    if (bld.program->stage != fragment_fs) {
 175       if (!dst.id())
 176          return src;
 177       else
 178          return bld.copy(Definition(dst), src);
 179    } else if (!dst.id()) {
 180       dst = bld.tmp(src.regClass());
 181    }
 182
 183    assert(src.bytes() == dst.bytes());
 184    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 185    bld.program->needs_wqm |= program_needs_wqm;
 186    return dst;
 187 }
 188
 189 static Temp
 190 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
 191 {
 192    if (index.regClass() == s1)
 193       return bld.readlane(bld.def(s1), data, index);
 194
 195    /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
 196     * of multiple binaries, because the VGPR use is not known when choosing
 197     * which registers to use for the shared VGPRs.
 198     */
 199    const bool avoid_shared_vgprs =
 200       ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
 201       ctx->program->wave_size == 64 &&
 202       (ctx->program->info.has_epilog || !ctx->program->info.is_monolithic ||
 203        ctx->stage == raytracing_cs);
 204
 205    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
 206       /* GFX6-7: there is no bpermute instruction */
 207       Operand index_op(index);
 208       Operand input_data(data);
 209       index_op.setLateKill(true);
 210       input_data.setLateKill(true);
 211
 212       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
 213                         bld.def(bld.lm, vcc), index_op, input_data);
 214    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
 215
 216       /* GFX10 wave64 mode: emulate full-wave bpermute */
 217       Temp index_is_lo =
 218          bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
 219       Builder::Result index_is_lo_split =
 220          bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
 221       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
 222                                      index_is_lo_split.def(1).getTemp());
 223       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 224                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
 225       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 226       Operand input_data(data);
 227
 228       index_x4.setLateKill(true);
 229       input_data.setLateKill(true);
 230       same_half.setLateKill(true);
 231
 232       if (ctx->options->gfx_level <= GFX10_3) {
 233          /* We need one pair of shared VGPRs:
 234           * Note, that these have twice the allocation granularity of normal VGPRs
 235           */
 236          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 237
 238          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
 239                            bld.def(s1, scc), index_x4, input_data, same_half);
 240       } else {
 241          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
 242                            bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
 243                            same_half);
 244       }
 245    } else {
 246       /* GFX8-9 or GFX10 wave32: bpermute works normally */
 247       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 248       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
 249    }
 250 }
 251
 252 static Temp
 253 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
 254 {
 255    if (ctx->options->gfx_level >= GFX8) {
 256       unsigned and_mask = mask & 0x1f;
 257       unsigned or_mask = (mask >> 5) & 0x1f;
 258       unsigned xor_mask = (mask >> 10) & 0x1f;
 259
 260       uint16_t dpp_ctrl = 0xffff;
 261
 262       /* DPP16 before DPP8 before v_permlane(x)16_b32
 263        * because DPP16 supports modifiers and v_permlane
 264        * can't be folded into valu instructions.
 265        */
 266       if ((and_mask & 0x1c) == 0x1c && or_mask < 4 && xor_mask < 4) {
 267          unsigned res[4] = {0, 1, 2, 3};
 268          for (unsigned i = 0; i < 4; i++)
 269             res[i] = (((res[i] & and_mask) | or_mask) ^ xor_mask) & 0x3;
 270          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
 271       } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
 272          dpp_ctrl = dpp_row_rr(8);
 273       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
 274          dpp_ctrl = dpp_row_mirror;
 275       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
 276          dpp_ctrl = dpp_row_half_mirror;
 277       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && or_mask < 0x10 &&
 278                  xor_mask < 0x10) {
 279          dpp_ctrl = dpp_row_share(or_mask ^ xor_mask);
 280       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && !or_mask &&
 281                  xor_mask < 0x10) {
 282          dpp_ctrl = dpp_row_xmask(xor_mask);
 283       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 &&
 284                  xor_mask < 8) {
 285          Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src);
 286          for (unsigned i = 0; i < 8; i++) {
 287             ret->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) & 0x7;
 288          }
 289          return ret;
 290       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10 && or_mask < 0x10) {
 291          uint64_t lane_mask = 0;
 292          for (unsigned i = 0; i < 16; i++)
 293             lane_mask |= uint64_t(((i & and_mask) | or_mask) ^ (xor_mask & 0xf)) << i * 4;
 294          aco_opcode opcode =
 295             xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
 296          Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
 297          Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
 298          Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
 299          ret->valu().opsel = 0x3; /* set BOUND_CTRL/FETCH_INACTIVE */
 300          return ret;
 301       }
 302
 303       if (dpp_ctrl != 0xffff)
 304          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
 305    }
 306
 307    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
 308 }
 309
 310 Temp
 311 as_vgpr(Builder& bld, Temp val)
 312 {
 313    if (val.type() == RegType::sgpr)
 314       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 315    assert(val.type() == RegType::vgpr);
 316    return val;
 317 }
 318
 319 Temp
 320 as_vgpr(isel_context* ctx, Temp val)
 321 {
 322    Builder bld(ctx->program, ctx->block);
 323    return as_vgpr(bld, val);
 324 }
 325
 326 void
 327 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 328 {
 329    Builder bld(ctx->program, ctx->block);
 330    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
 331 }
 332
 333 Temp
 334 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 335 {
 336    /* no need to extract the whole vector */
 337    if (src.regClass() == dst_rc) {
 338       assert(idx == 0);
 339       return src;
 340    }
 341
 342    assert(src.bytes() > (idx * dst_rc.bytes()));
 343    Builder bld(ctx->program, ctx->block);
 344    auto it = ctx->allocated_vec.find(src.id());
 345    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
 346       if (it->second[idx].regClass() == dst_rc) {
 347          return it->second[idx];
 348       } else {
 349          assert(!dst_rc.is_subdword());
 350          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 351          return bld.copy(bld.def(dst_rc), it->second[idx]);
 352       }
 353    }
 354
 355    if (dst_rc.is_subdword())
 356       src = as_vgpr(ctx, src);
 357
 358    if (src.bytes() == dst_rc.bytes()) {
 359       assert(idx == 0);
 360       return bld.copy(bld.def(dst_rc), src);
 361    } else {
 362       Temp dst = bld.tmp(dst_rc);
 363       emit_extract_vector(ctx, src, idx, dst);
 364       return dst;
 365    }
 366 }
 367
 368 void
 369 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 370 {
 371    if (num_components == 1)
 372       return;
 373    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 374       return;
 375    RegClass rc;
 376    if (num_components > vec_src.size()) {
 377       if (vec_src.type() == RegType::sgpr) {
 378          /* should still help get_alu_src() */
 379          emit_split_vector(ctx, vec_src, vec_src.size());
 380          return;
 381       }
 382       /* sub-dword split */
 383       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
 384    } else {
 385       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
 386    }
 387    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
 388       aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 389    split->operands[0] = Operand(vec_src);
 390    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 391    for (unsigned i = 0; i < num_components; i++) {
 392       elems[i] = ctx->program->allocateTmp(rc);
 393       split->definitions[i] = Definition(elems[i]);
 394    }
 395    ctx->block->instructions.emplace_back(std::move(split));
 396    ctx->allocated_vec.emplace(vec_src.id(), elems);
 397 }
 398
 399 /* This vector expansion uses a mask to determine which elements in the new vector
 400  * come from the original vector. The other elements are undefined. */
 401 void
 402 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
 403               bool zero_padding = false)
 404 {
 405    assert(vec_src.type() == RegType::vgpr);
 406    Builder bld(ctx->program, ctx->block);
 407
 408    if (dst.type() == RegType::sgpr && num_components > dst.size()) {
 409       Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
 410       expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
 411       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
 412       ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
 413       return;
 414    }
 415
 416    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 417
 418    if (vec_src == dst)
 419       return;
 420
 421    if (num_components == 1) {
 422       if (dst.type() == RegType::sgpr)
 423          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 424       else
 425          bld.copy(Definition(dst), vec_src);
 426       return;
 427    }
 428
 429    unsigned component_bytes = dst.bytes() / num_components;
 430    RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
 431    RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
 432    assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
 433    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 434
 435    Temp padding = Temp(0, dst_rc);
 436    if (zero_padding)
 437       padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
 438
 439    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
 440       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 441    vec->definitions[0] = Definition(dst);
 442    unsigned k = 0;
 443    for (unsigned i = 0; i < num_components; i++) {
 444       if (mask & (1 << i)) {
 445          Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
 446          if (dst.type() == RegType::sgpr)
 447             src = bld.as_uniform(src);
 448          vec->operands[i] = Operand(src);
 449          elems[i] = src;
 450       } else {
 451          vec->operands[i] = Operand::zero(component_bytes);
 452          elems[i] = padding;
 453       }
 454    }
 455    ctx->block->instructions.emplace_back(std::move(vec));
 456    ctx->allocated_vec.emplace(dst.id(), elems);
 457 }
 458
 459 /* adjust misaligned small bit size loads */
 460 void
 461 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
 462 {
 463    Builder bld(ctx->program, ctx->block);
 464    Operand shift;
 465    Temp select = Temp();
 466    if (offset.isConstant()) {
 467       assert(offset.constantValue() && offset.constantValue() < 4);
 468       shift = Operand::c32(offset.constantValue() * 8);
 469    } else {
 470       /* bit_offset = 8 * (offset & 0x3) */
 471       Temp tmp =
 472          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
 473       select = bld.tmp(s1);
 474       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
 475                        Operand::c32(3u));
 476    }
 477
 478    if (vec.size() == 1) {
 479       bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
 480    } else if (vec.size() == 2) {
 481       Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
 482       bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
 483       if (tmp == dst)
 484          emit_split_vector(ctx, dst, 2);
 485       else
 486          emit_extract_vector(ctx, tmp, 0, dst);
 487    } else if (vec.size() == 3 || vec.size() == 4) {
 488       Temp lo = bld.tmp(s2), hi;
 489       if (vec.size() == 3) {
 490          /* this can happen if we use VMEM for a uniform load */
 491          hi = bld.tmp(s1);
 492          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
 493       } else {
 494          hi = bld.tmp(s2);
 495          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
 496          hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
 497       }
 498       if (select != Temp())
 499          hi =
 500             bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
 501       lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
 502       Temp mid = bld.tmp(s1);
 503       lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
 504       hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
 505       mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
 506       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
 507       emit_split_vector(ctx, dst, 2);
 508    }
 509 }
 510
 511 void
 512 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
 513 {
 514    Builder bld(ctx->program, ctx->block);
 515    if (offset.isTemp()) {
 516       Temp tmp[4] = {vec, vec, vec, vec};
 517
 518       if (vec.size() == 4) {
 519          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
 520          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
 521                     Definition(tmp[2]), Definition(tmp[3]), vec);
 522       } else if (vec.size() == 3) {
 523          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
 524          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
 525                     Definition(tmp[2]), vec);
 526       } else if (vec.size() == 2) {
 527          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
 528          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
 529       }
 530       for (unsigned i = 0; i < dst.size(); i++)
 531          tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
 532
 533       vec = tmp[0];
 534       if (dst.size() == 2)
 535          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
 536
 537       offset = Operand::zero();
 538    }
 539
 540    unsigned num_components = vec.bytes() / component_size;
 541    if (vec.regClass() == dst.regClass()) {
 542       assert(offset.constantValue() == 0);
 543       bld.copy(Definition(dst), vec);
 544       emit_split_vector(ctx, dst, num_components);
 545       return;
 546    }
 547
 548    emit_split_vector(ctx, vec, num_components);
 549    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 550    RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
 551
 552    assert(offset.constantValue() % component_size == 0);
 553    unsigned skip = offset.constantValue() / component_size;
 554    for (unsigned i = skip; i < num_components; i++)
 555       elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
 556
 557    if (dst.type() == RegType::vgpr) {
 558       /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
 559       num_components = dst.bytes() / component_size;
 560       aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
 561          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 562       for (unsigned i = 0; i < num_components; i++)
 563          create_vec->operands[i] = Operand(elems[i]);
 564       create_vec->definitions[0] = Definition(dst);
 565       bld.insert(std::move(create_vec));
 566
 567    } else if (skip) {
 568       /* if dst is sgpr - split the src, but move the original to sgpr. */
 569       vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
 570       byte_align_scalar(ctx, vec, offset, dst);
 571    } else {
 572       assert(dst.size() == vec.size());
 573       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
 574    }
 575
 576    ctx->allocated_vec.emplace(dst.id(), elems);
 577 }
 578
 579 Temp
 580 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
 581 {
 582    RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
 583    Temp tmp = get_ssa_temp(ctx, def);
 584    if (tmp.bytes() != rc.bytes())
 585       return emit_extract_vector(ctx, tmp, 0, rc);
 586    else
 587       return tmp;
 588 }
 589
 590 Temp
 591 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
 592 {
 593    Builder bld(ctx->program, ctx->block);
 594    if (!dst.id())
 595       dst = bld.tmp(bld.lm);
 596
 597    assert(val.regClass() == s1);
 598    assert(dst.regClass() == bld.lm);
 599
 600    return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
 601                    bld.scc(val));
 602 }
 603
 604 Temp
 605 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
 606 {
 607    Builder bld(ctx->program, ctx->block);
 608    if (!dst.id())
 609       dst = bld.tmp(s1);
 610
 611    assert(val.regClass() == bld.lm);
 612    assert(dst.regClass() == s1);
 613
 614    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 615    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
 616    return dst;
 617 }
 618
 619 /**
 620  * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 621  * src_bits and dst_bits are truncated.
 622  *
 623  * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 624  * bit is indicated by src_bits in this case.
 625  *
 626  * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 627  */
 628 Temp
 629 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
 630             bool sign_extend, Temp dst = Temp())
 631 {
 632    assert(!(sign_extend && dst_bits < src_bits) &&
 633           "Shrinking integers is not supported for signed inputs");
 634
 635    if (!dst.id()) {
 636       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
 637          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
 638       else
 639          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
 640    }
 641
 642    assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
 643    assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
 644
 645    if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
 646       /* Copy the raw value, leaving an undefined value in the upper bits for
 647        * the caller to handle appropriately */
 648       return bld.copy(Definition(dst), src);
 649    } else if (dst.bytes() < src.bytes()) {
 650       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
 651    }
 652
 653    Temp tmp = dst;
 654    if (dst_bits == 64)
 655       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
 656
 657    if (tmp == src) {
 658    } else if (src.regClass() == s1) {
 659       assert(src_bits < 32);
 660       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
 661                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 662    } else {
 663       assert(src_bits < 32);
 664       bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
 665                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 666    }
 667
 668    if (dst_bits == 64) {
 669       if (sign_extend && dst.regClass() == s2) {
 670          Temp high =
 671             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
 672          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 673       } else if (sign_extend && dst.regClass() == v2) {
 674          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
 675          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 676       } else {
 677          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
 678       }
 679    }
 680
 681    return dst;
 682 }
 683
 684 enum sgpr_extract_mode {
 685    sgpr_extract_sext,
 686    sgpr_extract_zext,
 687    sgpr_extract_undef,
 688 };
 689
 690 Temp
 691 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
 692 {
 693    Temp vec = get_ssa_temp(ctx, src->src.ssa);
 694    unsigned src_size = src->src.ssa->bit_size;
 695    unsigned swizzle = src->swizzle[0];
 696
 697    if (vec.size() > 1) {
 698       assert(src_size == 16);
 699       vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
 700       swizzle = swizzle & 1;
 701    }
 702
 703    Builder bld(ctx->program, ctx->block);
 704    Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
 705
 706    if (mode == sgpr_extract_undef && swizzle == 0)
 707       bld.copy(Definition(tmp), vec);
 708    else
 709       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
 710                  Operand::c32(swizzle), Operand::c32(src_size),
 711                  Operand::c32((mode == sgpr_extract_sext)));
 712
 713    if (dst.regClass() == s2)
 714       convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
 715
 716    return dst;
 717 }
 718
 719 Temp
 720 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
 721 {
 722    if (src.src.ssa->num_components == 1 && size == 1)
 723       return get_ssa_temp(ctx, src.src.ssa);
 724
 725    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 726    unsigned elem_size = src.src.ssa->bit_size / 8u;
 727    bool identity_swizzle = true;
 728
 729    for (unsigned i = 0; identity_swizzle && i < size; i++) {
 730       if (src.swizzle[i] != i)
 731          identity_swizzle = false;
 732    }
 733    if (identity_swizzle)
 734       return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
 735
 736    assert(elem_size > 0);
 737    assert(vec.bytes() % elem_size == 0);
 738
 739    if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
 740       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
 741       return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
 742                                            sgpr_extract_undef);
 743    }
 744
 745    bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
 746    if (as_uniform)
 747       vec = as_vgpr(ctx, vec);
 748
 749    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
 750                                     : RegClass(vec.type(), elem_size / 4);
 751    if (size == 1) {
 752       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 753    } else {
 754       assert(size <= 4);
 755       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 756       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
 757          aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 758       for (unsigned i = 0; i < size; ++i) {
 759          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 760          vec_instr->operands[i] = Operand{elems[i]};
 761       }
 762       Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
 763       vec_instr->definitions[0] = Definition(dst);
 764       ctx->block->instructions.emplace_back(std::move(vec_instr));
 765       ctx->allocated_vec.emplace(dst.id(), elems);
 766       return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
 767    }
 768 }
 769
 770 Temp
 771 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
 772 {
 773    /* returns v2b or v1 for vop3p usage.
 774     * The source expects exactly 2 16bit components
 775     * which are within the same dword
 776     */
 777    assert(src.src.ssa->bit_size == 16);
 778    assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
 779
 780    Temp tmp = get_ssa_temp(ctx, src.src.ssa);
 781    if (tmp.size() == 1)
 782       return tmp;
 783
 784    /* the size is larger than 1 dword: check the swizzle */
 785    unsigned dword = src.swizzle[0] >> 1;
 786
 787    /* extract a full dword if possible */
 788    if (tmp.bytes() >= (dword + 1) * 4) {
 789       /* if the source is split into components, use p_create_vector */
 790       auto it = ctx->allocated_vec.find(tmp.id());
 791       if (it != ctx->allocated_vec.end()) {
 792          unsigned index = dword << 1;
 793          Builder bld(ctx->program, ctx->block);
 794          if (it->second[index].regClass() == v2b)
 795             return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
 796                               it->second[index + 1]);
 797       }
 798       return emit_extract_vector(ctx, tmp, dword, v1);
 799    } else {
 800       /* This must be a swizzled access to %a.zz where %a is v6b */
 801       assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
 802       assert(tmp.regClass() == v6b && dword == 1);
 803       return emit_extract_vector(ctx, tmp, dword * 2, v2b);
 804    }
 805 }
 806
 807 uint32_t
 808 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
 809 {
 810    nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
 811    return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
 812 }
 813
 814 Temp
 815 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
 816 {
 817    if (ptr.size() == 2)
 818       return ptr;
 819    Builder bld(ctx->program, ctx->block);
 820    if (ptr.type() == RegType::vgpr && !non_uniform)
 821       ptr = bld.as_uniform(ptr);
 822    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
 823                      Operand::c32((unsigned)ctx->options->address32_hi));
 824 }
 825
 826 void
 827 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 828                       bool writes_scc, uint8_t uses_ub = 0)
 829 {
 830    aco_ptr<SOP2_instruction> sop2{
 831       create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 832    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 833    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 834    sop2->definitions[0] = Definition(dst);
 835    if (instr->no_unsigned_wrap)
 836       sop2->definitions[0].setNUW(true);
 837    if (writes_scc)
 838       sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
 839
 840    for (int i = 0; i < 2; i++) {
 841       if (uses_ub & (1 << i)) {
 842          uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
 843          if (src_ub <= 0xffff)
 844             sop2->operands[i].set16bit(true);
 845          else if (src_ub <= 0xffffff)
 846             sop2->operands[i].set24bit(true);
 847       }
 848    }
 849
 850    ctx->block->instructions.emplace_back(std::move(sop2));
 851 }
 852
 853 void
 854 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
 855                       bool commutative, bool swap_srcs = false, bool flush_denorms = false,
 856                       bool nuw = false, uint8_t uses_ub = 0)
 857 {
 858    Builder bld(ctx->program, ctx->block);
 859    bld.is_precise = instr->exact;
 860
 861    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 862    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 863    if (src1.type() == RegType::sgpr) {
 864       if (commutative && src0.type() == RegType::vgpr) {
 865          Temp t = src0;
 866          src0 = src1;
 867          src1 = t;
 868       } else {
 869          src1 = as_vgpr(ctx, src1);
 870       }
 871    }
 872
 873    Operand op[2] = {Operand(src0), Operand(src1)};
 874
 875    for (int i = 0; i < 2; i++) {
 876       if (uses_ub & (1 << i)) {
 877          uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
 878          if (src_ub <= 0xffff)
 879             op[i].set16bit(true);
 880          else if (src_ub <= 0xffffff)
 881             op[i].set24bit(true);
 882       }
 883    }
 884
 885    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 886       assert(dst.size() == 1);
 887       Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
 888       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 889    } else {
 890       if (nuw) {
 891          bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
 892       } else {
 893          bld.vop2(opc, Definition(dst), op[0], op[1]);
 894       }
 895    }
 896 }
 897
 898 void
 899 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
 900 {
 901    Builder bld(ctx->program, ctx->block);
 902    bld.is_precise = instr->exact;
 903
 904    Temp src0 = get_alu_src(ctx, instr->src[0]);
 905    Temp src1 = get_alu_src(ctx, instr->src[1]);
 906
 907    if (src1.type() == RegType::sgpr) {
 908       assert(src0.type() == RegType::vgpr);
 909       std::swap(src0, src1);
 910    }
 911
 912    Temp src00 = bld.tmp(src0.type(), 1);
 913    Temp src01 = bld.tmp(src0.type(), 1);
 914    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 915    Temp src10 = bld.tmp(v1);
 916    Temp src11 = bld.tmp(v1);
 917    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 918    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
 919    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
 920    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
 921 }
 922
 923 void
 924 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 925                        bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
 926 {
 927    assert(num_sources == 2 || num_sources == 3);
 928    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 929    bool has_sgpr = false;
 930    for (unsigned i = 0; i < num_sources; i++) {
 931       src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
 932       if (has_sgpr)
 933          src[i] = as_vgpr(ctx, src[i]);
 934       else
 935          has_sgpr = src[i].type() == RegType::sgpr;
 936    }
 937
 938    Builder bld(ctx->program, ctx->block);
 939    bld.is_precise = instr->exact;
 940    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 941       Temp tmp;
 942       if (num_sources == 3)
 943          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
 944       else
 945          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
 946       if (dst.size() == 1)
 947          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 948       else
 949          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
 950    } else if (num_sources == 3) {
 951       bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
 952    } else {
 953       bld.vop3(op, Definition(dst), src[0], src[1]);
 954    }
 955 }
 956
 957 Builder::Result
 958 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 959                        bool swap_srcs = false)
 960 {
 961    Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
 962    Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
 963    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 964       src1 = as_vgpr(ctx, src1);
 965    assert(instr->def.num_components == 2);
 966
 967    /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
 968    unsigned opsel_lo =
 969       (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
 970    unsigned opsel_hi =
 971       (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
 972
 973    Builder bld(ctx->program, ctx->block);
 974    bld.is_precise = instr->exact;
 975    Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
 976    return res;
 977 }
 978
 979 void
 980 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
 981                       unsigned neg_lo = 0)
 982 {
 983    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 984    bool has_sgpr = false;
 985    for (unsigned i = 0; i < 3; i++) {
 986       src[i] = get_alu_src(ctx, instr->src[i]);
 987       if (has_sgpr)
 988          src[i] = as_vgpr(ctx, src[i]);
 989       else
 990          has_sgpr = src[i].type() == RegType::sgpr;
 991    }
 992
 993    Builder bld(ctx->program, ctx->block);
 994    bld.is_precise = instr->exact;
 995    VALU_instruction& vop3p =
 996       bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
 997    vop3p.clamp = clamp;
 998    vop3p.neg_lo = neg_lo;
 999 }
1000
1001 void
1002 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1003 {
1004    Builder bld(ctx->program, ctx->block);
1005    bld.is_precise = instr->exact;
1006    if (dst.type() == RegType::sgpr)
1007       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1008                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
1009    else
1010       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
1011 }
1012
1013 void
1014 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1015 {
1016    Temp src0 = get_alu_src(ctx, instr->src[0]);
1017    Temp src1 = get_alu_src(ctx, instr->src[1]);
1018    assert(src0.size() == src1.size());
1019
1020    aco_ptr<Instruction> vopc;
1021    if (src1.type() == RegType::sgpr) {
1022       if (src0.type() == RegType::vgpr) {
1023          /* to swap the operands, we might also have to change the opcode */
1024          switch (op) {
1025          case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
1026          case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
1027          case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
1028          case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
1029          case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
1030          case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
1031          case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
1032          case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
1033          case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
1034          case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1035          case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1036          case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1037          case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1038          case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1039          case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1040          case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1041          case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1042          case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1043          default: /* eq and ne are commutative */ break;
1044          }
1045          Temp t = src0;
1046          src0 = src1;
1047          src1 = t;
1048       } else {
1049          src1 = as_vgpr(ctx, src1);
1050       }
1051    }
1052
1053    Builder bld(ctx->program, ctx->block);
1054    bld.vopc(op, Definition(dst), src0, src1);
1055 }
1056
1057 void
1058 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1059 {
1060    Temp src0 = get_alu_src(ctx, instr->src[0]);
1061    Temp src1 = get_alu_src(ctx, instr->src[1]);
1062    Builder bld(ctx->program, ctx->block);
1063
1064    assert(dst.regClass() == bld.lm);
1065    assert(src0.type() == RegType::sgpr);
1066    assert(src1.type() == RegType::sgpr);
1067
1068    /* Emit the SALU comparison instruction */
1069    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1070    /* Turn the result into a per-lane bool */
1071    bool_to_vector_condition(ctx, cmp, dst);
1072 }
1073
1074 void
1075 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1076                 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1077                 aco_opcode s64_op = aco_opcode::num_opcodes)
1078 {
1079    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
1080                      : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1081                                                              : aco_opcode::num_opcodes;
1082    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
1083                      : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1084                                                              : v16_op;
1085    bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
1086                    get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1087                    get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1088    aco_opcode op = use_valu ? v_op : s_op;
1089    assert(op != aco_opcode::num_opcodes);
1090    assert(dst.regClass() == ctx->program->lane_mask);
1091
1092    if (use_valu)
1093       emit_vopc_instruction(ctx, instr, op, dst);
1094    else
1095       emit_sopc_instruction(ctx, instr, op, dst);
1096 }
1097
1098 void
1099 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1100                    Temp dst)
1101 {
1102    Builder bld(ctx->program, ctx->block);
1103    Temp src0 = get_alu_src(ctx, instr->src[0]);
1104    Temp src1 = get_alu_src(ctx, instr->src[1]);
1105
1106    assert(dst.regClass() == bld.lm);
1107    assert(src0.regClass() == bld.lm);
1108    assert(src1.regClass() == bld.lm);
1109
1110    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1111 }
1112
1113 void
1114 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1115 {
1116    Builder bld(ctx->program, ctx->block);
1117    Temp cond = get_alu_src(ctx, instr->src[0]);
1118    Temp then = get_alu_src(ctx, instr->src[1]);
1119    Temp els = get_alu_src(ctx, instr->src[2]);
1120
1121    assert(cond.regClass() == bld.lm);
1122
1123    if (dst.type() == RegType::vgpr) {
1124       aco_ptr<Instruction> bcsel;
1125       if (dst.size() == 1) {
1126          then = as_vgpr(ctx, then);
1127          els = as_vgpr(ctx, els);
1128
1129          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1130       } else if (dst.size() == 2) {
1131          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1132          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1133          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1134          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1135
1136          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1137          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1138
1139          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1140       } else {
1141          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1142       }
1143       return;
1144    }
1145
1146    if (instr->def.bit_size == 1) {
1147       assert(dst.regClass() == bld.lm);
1148       assert(then.regClass() == bld.lm);
1149       assert(els.regClass() == bld.lm);
1150    }
1151
1152    if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1153       if (dst.regClass() == s1 || dst.regClass() == s2) {
1154          assert((then.regClass() == s1 || then.regClass() == s2) &&
1155                 els.regClass() == then.regClass());
1156          assert(dst.size() == then.size());
1157          aco_opcode op =
1158             dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1159          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1160       } else {
1161          isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1162       }
1163       return;
1164    }
1165
1166    /* divergent boolean bcsel
1167     * this implements bcsel on bools: dst = s0 ? s1 : s2
1168     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
1169    assert(instr->def.bit_size == 1);
1170
1171    if (cond.id() != then.id())
1172       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1173
1174    if (cond.id() == els.id())
1175       bld.copy(Definition(dst), then);
1176    else
1177       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1178                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1179 }
1180
1181 void
1182 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1183                uint32_t undo)
1184 {
1185    /* multiply by 16777216 to handle denormals */
1186    Temp is_denormal = bld.tmp(bld.lm);
1187    VALU_instruction& valu =
1188       bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), val, Operand::c32(1u << 4))
1189          ->valu();
1190    valu.neg[0] = true;
1191    valu.abs[0] = true;
1192    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1193    scaled = bld.vop1(op, bld.def(v1), scaled);
1194    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1195
1196    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1197
1198    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1199 }
1200
1201 void
1202 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1203 {
1204    if (ctx->block->fp_mode.denorm32 == 0) {
1205       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1206       return;
1207    }
1208
1209    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1210 }
1211
1212 void
1213 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1214 {
1215    if (ctx->block->fp_mode.denorm32 == 0) {
1216       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1217       return;
1218    }
1219
1220    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1221 }
1222
1223 void
1224 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1225 {
1226    if (ctx->block->fp_mode.denorm32 == 0) {
1227       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1228       return;
1229    }
1230
1231    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1232 }
1233
1234 void
1235 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1236 {
1237    if (ctx->block->fp_mode.denorm32 == 0) {
1238       bld.vop1(aco_opcode::v_log_f32, dst, val);
1239       return;
1240    }
1241
1242    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1243 }
1244
1245 Temp
1246 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1247 {
1248    if (ctx->options->gfx_level >= GFX7)
1249       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1250
1251    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1252    /* TODO: create more efficient code! */
1253    if (val.type() == RegType::sgpr)
1254       val = as_vgpr(ctx, val);
1255
1256    /* Split the input value. */
1257    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1258    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1259
1260    /* Extract the exponent and compute the unbiased value. */
1261    Temp exponent =
1262       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1263    exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1264
1265    /* Extract the fractional part. */
1266    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1267                                 Operand::c32(0x000fffffu));
1268    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1269
1270    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1271    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1272               fract_mask);
1273
1274    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1275    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1276    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1277    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1278    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1279
1280    /* Get the sign bit. */
1281    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1282
1283    /* Decide the operation to apply depending on the unbiased exponent. */
1284    Temp exp_lt0 =
1285       bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1286    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1287                           bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1288    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1289    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1290    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1291    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1292
1293    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1294 }
1295
1296 Temp
1297 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1298 {
1299    if (ctx->options->gfx_level >= GFX7)
1300       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1301
1302    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1303     * lowered at NIR level for precision reasons). */
1304    Temp src0 = as_vgpr(ctx, val);
1305
1306    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1307                              Operand::c32(0x3fefffffu));
1308
1309    Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1310    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1311    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1312
1313    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1314    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1315    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1316    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1317
1318    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1319    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1320
1321    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1322
1323    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1324    add->valu().neg[1] = true;
1325
1326    return add->definitions[0].getTemp();
1327 }
1328
1329 Temp
1330 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1331 {
1332    if (bld.program->gfx_level < GFX8) {
1333       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1334       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1335                           add.def(1).getTemp());
1336    }
1337
1338    Builder::Result add(NULL);
1339    if (bld.program->gfx_level >= GFX9) {
1340       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1341    } else {
1342       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1343    }
1344    add->valu().clamp = 1;
1345    return dst.getTemp();
1346 }
1347
1348 Temp
1349 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1350 {
1351    if (bld.program->gfx_level < GFX8) {
1352       Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1353       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1354                           sub.def(1).getTemp());
1355    }
1356
1357    Builder::Result sub(NULL);
1358    if (bld.program->gfx_level >= GFX9) {
1359       sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1360    } else {
1361       sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1362    }
1363    sub->valu().clamp = 1;
1364    return dst.getTemp();
1365 }
1366
1367 void
1368 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1369 {
1370    Builder bld(ctx->program, ctx->block);
1371    bld.is_precise = instr->exact;
1372    Temp dst = get_ssa_temp(ctx, &instr->def);
1373    switch (instr->op) {
1374    case nir_op_vec2:
1375    case nir_op_vec3:
1376    case nir_op_vec4:
1377    case nir_op_vec5:
1378    case nir_op_vec8:
1379    case nir_op_vec16: {
1380       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1381       unsigned num = instr->def.num_components;
1382       for (unsigned i = 0; i < num; ++i)
1383          elems[i] = get_alu_src(ctx, instr->src[i]);
1384
1385       if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1386          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1387             aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
1388          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
1389          for (unsigned i = 0; i < num; ++i) {
1390             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1391                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1392             vec->operands[i] = Operand{elems[i]};
1393          }
1394          vec->definitions[0] = Definition(dst);
1395          ctx->block->instructions.emplace_back(std::move(vec));
1396          ctx->allocated_vec.emplace(dst.id(), elems);
1397       } else {
1398          bool use_s_pack = ctx->program->gfx_level >= GFX9;
1399          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1400
1401          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1402          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1403          for (unsigned i = 0; i < num; i++) {
1404             unsigned packed_size = use_s_pack ? 16 : 32;
1405             unsigned idx = i * instr->def.bit_size / packed_size;
1406             unsigned offset = i * instr->def.bit_size % packed_size;
1407             if (nir_src_is_const(instr->src[i].src)) {
1408                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1409                continue;
1410             }
1411             if (nir_src_is_undef(instr->src[i].src))
1412                continue;
1413
1414             if (offset != packed_size - instr->def.bit_size)
1415                elems[i] =
1416                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1417
1418             if (offset)
1419                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1420                                    Operand::c32(offset));
1421
1422             if (packed[idx].id())
1423                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1424                                       packed[idx]);
1425             else
1426                packed[idx] = elems[i];
1427          }
1428
1429          if (use_s_pack) {
1430             for (unsigned i = 0; i < dst.size(); i++) {
1431                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1432
1433                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1434                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1435                                        packed[i * 2 + 1]);
1436                else if (packed[i * 2 + 1].id())
1437                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1438                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1439                else if (packed[i * 2].id())
1440                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1441                                        Operand::c32(const_vals[i * 2 + 1]));
1442                else
1443                   packed[i] = Temp(); /* Both constants, so reset the entry */
1444
1445                if (same)
1446                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1447                else
1448                   const_vals[i] = 0;
1449             }
1450          }
1451
1452          for (unsigned i = 0; i < dst.size(); i++) {
1453             if (const_vals[i] && packed[i].id())
1454                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1455                                     Operand::c32(const_vals[i]), packed[i]);
1456             else if (!packed[i].id())
1457                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1458          }
1459
1460          if (dst.size() == 1)
1461             bld.copy(Definition(dst), packed[0]);
1462          else {
1463             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1464                aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1465             vec->definitions[0] = Definition(dst);
1466             for (unsigned i = 0; i < dst.size(); ++i)
1467                vec->operands[i] = Operand(packed[i]);
1468             bld.insert(std::move(vec));
1469          }
1470       }
1471       break;
1472    }
1473    case nir_op_mov: {
1474       Temp src = get_alu_src(ctx, instr->src[0]);
1475       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1476          /* use size() instead of bytes() for 8/16-bit */
1477          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1478          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1479       } else {
1480          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1481          bld.copy(Definition(dst), src);
1482       }
1483       break;
1484    }
1485    case nir_op_inot: {
           /* Bitwise NOT, selected by the register class of dst:
            *  - v1 / v2b / v1b: a single v_not_b32,
            *  - v2: split into two dwords, invert each with v_not_b32, recombine,
            *  - any SGPR dst: s_not_b32 or s_not_b64 depending on dst.size().
            * Everything else is reported through isel_err. */
1486       Temp src = get_alu_src(ctx, instr->src[0]);
1487       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1488          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1489       } else if (dst.regClass() == v2) {
1490          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1491          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1492          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1493          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1494          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1495       } else if (dst.type() == RegType::sgpr) {
1496          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
              /* The scalar form gets a second definition bound to scc; its value is not used. */
1497          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1498       } else {
1499          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1500       }
1501       break;
1502    }
1503    case nir_op_iabs: {
           /* Integer absolute value. The s1 path uses s_abs_i32; every vector path builds
            * max(src, 0 - src) from a subtract followed by a signed max. */
1504       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 dst with a 16-bit NIR def: packed path (v_pk_sub_u16 + v_pk_max_i16). */
1505          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1506
              /* Bit 1 of each opsel mask comes from bit 0 of the matching swizzle entry;
               * opsel_hi additionally always has bit 0 set.
               * NOTE(review): presumably bit 0 belongs to the first operand and bit 1 to the
               * second one (src) -- confirm against Builder::vop3p. */
1507          unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1508          unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1509
1510          Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1511                               src, opsel_lo, opsel_hi);
1512          bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1513          break;
1514       }
1515       Temp src = get_alu_src(ctx, instr->src[0]);
1516       if (dst.regClass() == s1) {
1517          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1518       } else if (dst.regClass() == v1) {
1519          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1520                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1521       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
              /* GFX10+: the v2b path uses the _e64 forms of the 16-bit sub/max opcodes. */
1522          bld.vop3(
1523             aco_opcode::v_max_i16_e64, Definition(dst), src,
1524             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1525       } else if (dst.regClass() == v2b) {
              /* Older levels: VOP2 forms; src is first forced into a VGPR. */
1526          src = as_vgpr(ctx, src);
1527          bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1528                   bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1529       } else {
1530          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1531       }
1532       break;
1533    }
1534    case nir_op_isign: {
           /* Integer sign, selected by the register class of dst. The 32-bit and 16-bit
            * paths clamp src to the range [-1, 1]; the 64-bit paths combine a sign mask with
            * a "non-zero" / "greater than zero" condition. */
1535       Temp src = get_alu_src(ctx, instr->src[0]);
1536       if (dst.regClass() == s1) {
              /* min(max(src, -1), 1) */
1537          Temp tmp =
1538             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1539          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1540       } else if (dst.regClass() == s2) {
              /* neg = src >> 63 (arithmetic); neqz = SCC result of a "src != 0" test. */
1541          Temp neg =
1542             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1543          Temp neqz;
1544          if (ctx->program->gfx_level >= GFX8)
1545             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1546          else
                 /* Before GFX8 the SCC definition of "src | 0" is used in place of the compare. */
1547             neqz =
1548                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1549                   .def(1)
1550                   .getTemp();
1551          /* SCC gets zero-extended to 64 bit */
1552          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1553       } else if (dst.regClass() == v1) {
1554          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1555       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1556          bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1557       } else if (dst.regClass() == v2b) {
              /* Below GFX9: max(-1, min(1, src)) with the VOP2 16-bit opcodes. */
1558          src = as_vgpr(ctx, src);
1559          bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1560                   bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1561       } else if (dst.regClass() == v2) {
              /* neg = upper dword >> 31 (arithmetic). The mask named gtz is the result of
               * v_cmp_ge_i64 with operands (0, src). Both dwords are then picked with
               * v_cndmask_b32 between a constant (1 for the low dword, 0 for the high dword)
               * and neg. */
1562          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1563          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1564          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1565          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1566          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1567          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1568       } else {
1569          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1570       }
1571       break;
1572    }
1573    case nir_op_imax: {
           /* Signed integer maximum. The opcode is chosen from dst's register class:
            *  - v2b: v_max_i16_e64 on GFX10+, v_max_i16 otherwise,
            *  - v1 with a 16-bit NIR def: packed v_pk_max_i16,
            *  - v1: v_max_i32,
            *  - s1: s_max_i32.
            * Any other class is reported through isel_err. */
1574       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1575          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1576       } else if (dst.regClass() == v2b) {
1577          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1578       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1579          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1580       } else if (dst.regClass() == v1) {
1581          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1582       } else if (dst.regClass() == s1) {
1583          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1584       } else {
1585          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1586       }
1587       break;
1588    }
1589    case nir_op_umax: {
           /* Unsigned integer maximum. The opcode is chosen from dst's register class:
            *  - v2b: v_max_u16_e64 on GFX10+, v_max_u16 otherwise,
            *  - v1 with a 16-bit NIR def: packed v_pk_max_u16,
            *  - v1: v_max_u32,
            *  - s1: s_max_u32.
            * Any other class is reported through isel_err. */
1590       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1591          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1592       } else if (dst.regClass() == v2b) {
1593          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1594       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1595          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1596       } else if (dst.regClass() == v1) {
1597          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1598       } else if (dst.regClass() == s1) {
1599          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1600       } else {
1601          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1602       }
1603       break;
1604    }
1605    case nir_op_imin: {
           /* Signed integer minimum. The opcode is chosen from dst's register class:
            *  - v2b: v_min_i16_e64 on GFX10+, v_min_i16 otherwise,
            *  - v1 with a 16-bit NIR def: packed v_pk_min_i16,
            *  - v1: v_min_i32,
            *  - s1: s_min_i32.
            * Any other class is reported through isel_err. */
1606       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1607          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1608       } else if (dst.regClass() == v2b) {
1609          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1610       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1611          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1612       } else if (dst.regClass() == v1) {
1613          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1614       } else if (dst.regClass() == s1) {
1615          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1616       } else {
1617          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1618       }
1619       break;
1620    }
1621    case nir_op_umin: {
           /* Unsigned integer minimum. The opcode is chosen from dst's register class:
            *  - v2b: v_min_u16_e64 on GFX10+, v_min_u16 otherwise,
            *  - v1 with a 16-bit NIR def: packed v_pk_min_u16,
            *  - v1: v_min_u32,
            *  - s1: s_min_u32.
            * Any other class is reported through isel_err. */
1622       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1623          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1624       } else if (dst.regClass() == v2b) {
1625          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1626       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1627          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1628       } else if (dst.regClass() == v1) {
1629          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1630       } else if (dst.regClass() == s1) {
1631          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1632       } else {
1633          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1634       }
1635       break;
1636    }
1637    case nir_op_ior: {
           /* Bitwise OR.
            *  - 1-bit NIR defs go through emit_boolean_logic with Builder::s_or,
            *  - v1 / v2b / v1b all use the 32-bit v_or_b32,
            *  - v2 uses emit_vop2_instruction_logic64 with v_or_b32,
            *  - s1 / s2 use s_or_b32 / s_or_b64.
            * Any other class is reported through isel_err. */
1638       if (instr->def.bit_size == 1) {
1639          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1640       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1641          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1642       } else if (dst.regClass() == v2) {
1643          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1644       } else if (dst.regClass() == s1) {
1645          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1646       } else if (dst.regClass() == s2) {
1647          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1648       } else {
1649          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1650       }
1651       break;
1652    }
1653    case nir_op_iand: {
           /* Bitwise AND.
            *  - 1-bit NIR defs go through emit_boolean_logic with Builder::s_and,
            *  - v1 / v2b / v1b all use the 32-bit v_and_b32,
            *  - v2 uses emit_vop2_instruction_logic64 with v_and_b32,
            *  - s1 / s2 use s_and_b32 / s_and_b64.
            * Any other class is reported through isel_err. */
1654       if (instr->def.bit_size == 1) {
1655          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1656       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1657          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1658       } else if (dst.regClass() == v2) {
1659          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1660       } else if (dst.regClass() == s1) {
1661          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1662       } else if (dst.regClass() == s2) {
1663          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1664       } else {
1665          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1666       }
1667       break;
1668    }
1669    case nir_op_ixor: {
           /* Bitwise XOR.
            *  - 1-bit NIR defs go through emit_boolean_logic with Builder::s_xor,
            *  - v1 / v2b / v1b all use the 32-bit v_xor_b32,
            *  - v2 uses emit_vop2_instruction_logic64 with v_xor_b32,
            *  - s1 / s2 use s_xor_b32 / s_xor_b64.
            * Any other class is reported through isel_err. */
1670       if (instr->def.bit_size == 1) {
1671          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1672       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1673          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1674       } else if (dst.regClass() == v2) {
1675          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1676       } else if (dst.regClass() == s1) {
1677          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1678       } else if (dst.regClass() == s2) {
1679          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1680       } else {
1681          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682       }
1683       break;
1684    }
1685    case nir_op_ushr: {
           /* Logical shift right, selected by dst's register class and the GFX level.
            * The 16/32-bit vector paths use the "rev" opcodes (v_lshrrev_*).
            * NOTE(review): the trailing boolean arguments handed to the emit_* helpers for
            * the rev opcodes presumably request an operand swap -- confirm against the
            * helper signatures. */
1686       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1687          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1688       } else if (dst.regClass() == v2b) {
1689          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1690       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1691          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1692       } else if (dst.regClass() == v1) {
1693          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1694       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* 64-bit on GFX8+: built directly, with src[1] passed before src[0]. */
1695          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1696                   get_alu_src(ctx, instr->src[0]));
1697       } else if (dst.regClass() == v2) {
              /* 64-bit below GFX8: non-rev opcode, sources in NIR order. */
1698          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1699       } else if (dst.regClass() == s2) {
1700          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1701       } else if (dst.regClass() == s1) {
1702          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1703       } else {
1704          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1705       }
1706       break;
1707    }
1708    case nir_op_ishl: {
           /* Shift left, selected by dst's register class and the GFX level.
            * The 16/32-bit vector paths use the "rev" opcodes (v_lshlrev_*).
            * NOTE(review): the extra arguments handed to the emit_* helpers (booleans and the
            * trailing integer on the v1 and s1 paths) are helper-specific options -- their
            * meaning is not visible here; check the helper signatures before changing them. */
1709       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1710          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1711       } else if (dst.regClass() == v2b) {
1712          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1713       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1714          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1715       } else if (dst.regClass() == v1) {
1716          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1717                                false, 2);
1718       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* 64-bit on GFX8+: built directly, with src[1] passed before src[0]. */
1719          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1720                   get_alu_src(ctx, instr->src[0]));
1721       } else if (dst.regClass() == v2) {
              /* 64-bit below GFX8: non-rev opcode, sources in NIR order. */
1722          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1723       } else if (dst.regClass() == s1) {
1724          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1725       } else if (dst.regClass() == s2) {
1726          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1727       } else {
1728          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1729       }
1730       break;
1731    }
1732    case nir_op_ishr: {
           /* Arithmetic shift right, selected by dst's register class and the GFX level.
            * The 16/32-bit vector paths use the "rev" opcodes (v_ashrrev_*).
            * NOTE(review): the trailing boolean arguments handed to the emit_* helpers for
            * the rev opcodes presumably request an operand swap -- confirm against the
            * helper signatures. */
1733       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1734          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1735       } else if (dst.regClass() == v2b) {
1736          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1737       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1738          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1739       } else if (dst.regClass() == v1) {
1740          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1741       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* 64-bit on GFX8+: built directly, with src[1] passed before src[0]. */
1742          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1743                   get_alu_src(ctx, instr->src[0]));
1744       } else if (dst.regClass() == v2) {
              /* 64-bit below GFX8: non-rev opcode, sources in NIR order. */
1745          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1746       } else if (dst.regClass() == s1) {
1747          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1748       } else if (dst.regClass() == s2) {
1749          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1750       } else {
1751          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1752       }
1753       break;
1754    }
1755    case nir_op_find_lsb: {
           /* Find least significant set bit. Unlike most cases here the dispatch is on the
            * register class of the *source*: s1 -> s_ff1_i32_b32, v1 -> v_ffbl_b32,
            * s2 -> s_ff1_i32_b64. Other source classes are reported through isel_err. */
1756       Temp src = get_alu_src(ctx, instr->src[0]);
1757       if (src.regClass() == s1) {
1758          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1759       } else if (src.regClass() == v1) {
1760          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1761       } else if (src.regClass() == s2) {
1762          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1763       } else {
1764          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1765       }
1766       break;
1767    }
1768    case nir_op_ufind_msb:
1769    case nir_op_ifind_msb: {
           /* Find most significant bit (unsigned and signed variants share this code; only
            * the opcode differs). Every path first produces a value named msb_rev with an
            * s_flbit / v_ffbh opcode and then computes (bit width - 1) - msb_rev. The
            * carry/borrow output of that subtraction decides which value ends up in dst.
            * NOTE(review): presumably the borrow fires exactly when msb_rev is the opcodes'
            * "nothing found" value of -1 -- confirm against the ISA documentation. */
1770       Temp src = get_alu_src(ctx, instr->src[0]);
1771       if (src.regClass() == s1 || src.regClass() == s2) {
1772          aco_opcode op = src.regClass() == s2
1773                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1774                                                              : aco_opcode::s_flbit_i32_i64)
1775                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1776                                                              : aco_opcode::s_flbit_i32);
1777          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1778
              /* src.size() * 32 - 1 is 31 for s1 and 63 for s2. */
1779          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1780                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1781          Temp msb = sub.def(0).getTemp();
1782          Temp carry = sub.def(1).getTemp();
1783
              /* Select between the constant -1 and msb based on the SCC result of the sub. */
1784          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1785                   bld.scc(carry));
1786       } else if (src.regClass() == v1) {
1787          aco_opcode op =
1788             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1789          Temp msb_rev = bld.tmp(v1);
1790          emit_vop1_instruction(ctx, instr, op, msb_rev);
1791          Temp msb = bld.tmp(v1);
1792          Temp carry =
1793             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
              /* Per lane: choose between msb and msb_rev using the carry mask. */
1794          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1795       } else if (src.regClass() == v2) {
1796          aco_opcode op =
1797             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1798
1799          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1800          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1801
              /* Low dword: its result is offset by 32 using a saturating add. */
1802          lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1803                          bld.vop1(op, bld.def(v1), lo));
1804          hi = bld.vop1(op, bld.def(v1), hi);
              /* found_hi: lanes where the high-dword result differs from -1. */
1805          Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1806
1807          Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1808
1809          Temp msb = bld.tmp(v1);
1810          Temp carry =
1811             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1812          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1813       } else {
1814          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815       }
1816       break;
1817    }
1818    case nir_op_ufind_msb_rev:
1819    case nir_op_ifind_msb_rev: {
           /* "Reversed" find-MSB: the result of the s_flbit / v_ffbh opcode is written to
            * dst unchanged. Dispatch is on the source register class (s1 or v1 only); the
            * unsigned/signed NIR opcode picks between the two hardware variants. */
1820       Temp src = get_alu_src(ctx, instr->src[0]);
1821       if (src.regClass() == s1) {
1822          aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1823                                                            : aco_opcode::s_flbit_i32;
1824          bld.sop1(op, Definition(dst), src);
1825       } else if (src.regClass() == v1) {
1826          aco_opcode op =
1827             instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1828          emit_vop1_instruction(ctx, instr, op, dst);
1829       } else {
1830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831       }
1832       break;
1833    }
1834    case nir_op_bitfield_reverse: {
           /* Bit reversal of a 32-bit value: s_brev_b32 for an s1 dst, v_bfrev_b32 for a v1
            * dst. Other classes are reported through isel_err. */
1835       if (dst.regClass() == s1) {
1836          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1837       } else if (dst.regClass() == v1) {
1838          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1839       } else {
1840          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1841       }
1842       break;
1843    }
1844    case nir_op_iadd: {
           /* Integer addition.
            * Fast paths handled by the emit_* helpers (each one breaks out immediately):
            *  - s1: s_add_u32,
            *  - dst of at most 2 bytes: v_add_u16_e64 on GFX10+, v_add_u16 on GFX8+,
            *  - v1 with a 16-bit NIR def: packed v_pk_add_u16.
            * Remaining VGPR destinations of up to 4 bytes use vadd32; 64-bit destinations
            * are split into dwords and added with a carry chain. */
1845       if (dst.regClass() == s1) {
1846          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1847          break;
1848       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1849          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1850          break;
1851       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1852          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1853          break;
1854       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1855          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1856          break;
1857       }
1858
1859       Temp src0 = get_alu_src(ctx, instr->src[0]);
1860       Temp src1 = get_alu_src(ctx, instr->src[1]);
1861       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
              /* Forward NIR's no_unsigned_wrap flag by building through bld.nuw(). */
1862          if (instr->no_unsigned_wrap)
1863             bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1864          else
1865             bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1866          break;
1867       }
1868
           /* 64-bit: split both sources into dwords. The low halves keep the register type
            * of their source, the high halves are created with dst's register type. */
1869       assert(src0.size() == 2 && src1.size() == 2);
1870       Temp src00 = bld.tmp(src0.type(), 1);
1871       Temp src01 = bld.tmp(dst.type(), 1);
1872       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1873       Temp src10 = bld.tmp(src1.type(), 1);
1874       Temp src11 = bld.tmp(dst.type(), 1);
1875       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1876
1877       if (dst.regClass() == s2) {
              /* s_add_u32 writes the carry to SCC, s_addc_u32 consumes it for the high dword. */
1878          Temp carry = bld.tmp(s1);
1879          Temp dst0 =
1880             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1881          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1882                               bld.scc(carry));
1883          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1884       } else if (dst.regClass() == v2) {
              /* Low add requests a carry-out (4th argument true); the high add takes it as
               * carry-in and produces no carry-out. */
1885          Temp dst0 = bld.tmp(v1);
1886          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1887          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1888          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1889       } else {
1890          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1891       }
1892       break;
1893    }
1894    case nir_op_uadd_sat: {
           /* Unsigned saturating addition.
            *  - v1 with a 16-bit NIR def: packed v_pk_add_u16 with the clamp bit set,
            *  - s1: s_add_u32, then select -1 when the add set SCC,
            *  - v2b: 16-bit add with the clamp bit set,
            *  - v1: uadd32_sat helper,
            *  - s2 / v2: per-dword add with carry, saturating both dwords on final carry. */
1895       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1896          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1897          add_instr->valu().clamp = 1;
1898          break;
1899       }
1900       Temp src0 = get_alu_src(ctx, instr->src[0]);
1901       Temp src1 = get_alu_src(ctx, instr->src[1]);
1902       if (dst.regClass() == s1) {
1903          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1904          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1905          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1906                   bld.scc(carry));
1907          break;
1908       } else if (dst.regClass() == v2b) {
1909          Instruction* add_instr;
1910          if (ctx->program->gfx_level >= GFX10) {
1911             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1912          } else {
                 /* Swap so that an SGPR src1 becomes the first operand; whatever ends up
                  * second is forced into a VGPR. */
1913             if (src1.type() == RegType::sgpr)
1914                std::swap(src0, src1);
1915             add_instr =
1916                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1917          }
1918          add_instr->valu().clamp = 1;
1919          break;
1920       } else if (dst.regClass() == v1) {
1921          uadd32_sat(bld, Definition(dst), src0, src1);
1922          break;
1923       }
1924
           /* 64-bit paths: split both sources into low/high dwords. */
1925       assert(src0.size() == 2 && src1.size() == 2);
1926
1927       Temp src00 = bld.tmp(src0.type(), 1);
1928       Temp src01 = bld.tmp(src0.type(), 1);
1929       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1930       Temp src10 = bld.tmp(src1.type(), 1);
1931       Temp src11 = bld.tmp(src1.type(), 1);
1932       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1933
1934       if (dst.regClass() == s2) {
1935          Temp carry0 = bld.tmp(s1);
1936          Temp carry1 = bld.tmp(s1);
1937
1938          Temp no_sat0 =
1939             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1940          Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1941                                  src01, src11, bld.scc(carry0));
1942
1943          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1944
              /* Select between all-ones and the unsaturated sum on the high-dword carry. */
1945          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1946                   bld.scc(carry1));
1947       } else if (dst.regClass() == v2) {
1948          Temp no_sat0 = bld.tmp(v1);
1949          Temp dst0 = bld.tmp(v1);
1950          Temp dst1 = bld.tmp(v1);
1951
1952          Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1953          Temp carry1;
1954
1955          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: the high dword is produced by v_addc_co_u32 with the clamp bit
                  * set; carry1 is its carry-out. */
1956             carry1 = bld.tmp(bld.lm);
1957             bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1958                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1959                ->valu()
1960                .clamp = 1;
1961          } else {
                 /* Older levels: plain add-with-carry, then an explicit select against -1. */
1962             Temp no_sat1 = bld.tmp(v1);
1963             carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1964             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1965                          carry1);
1966          }
1967
              /* The low dword is always selected against -1 using the high-dword carry. */
1968          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1969                       carry1);
1970          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1971       } else {
1972          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1973       }
1974       break;
1975    }
1976    case nir_op_iadd_sat: {
           /* Signed saturating addition.
            *  - v1 with a 16-bit NIR def: packed v_pk_add_i16 with the clamp bit set,
            *  - s1: add, then select a saturation bound when the add reports overflow,
            *  - v2b / v1: v_add_i16 / v_add_i32 with the clamp bit set.
            * Other classes are reported through isel_err. */
1977       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1978          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1979          add_instr->valu().clamp = 1;
1980          break;
1981       }
1982       Temp src0 = get_alu_src(ctx, instr->src[0]);
1983       Temp src1 = get_alu_src(ctx, instr->src[1]);
1984       if (dst.regClass() == s1) {
              /* cond is the SCC result of "src1 < 0"; bound = INT32_MAX + cond, so it is
               * INT32_MAX when src1 is non-negative and wraps to 0x80000000 otherwise. */
1985          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1986          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1987                                Operand::c32(INT32_MAX), cond);
              /* overflow is the SCC output of s_add_i32; it selects bound over the sum. */
1988          Temp overflow = bld.tmp(s1);
1989          Temp add =
1990             bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1991          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1992          break;
1993       }
1994
           /* Vector paths: the second operand is forced into a VGPR. */
1995       src1 = as_vgpr(ctx, src1);
1996
1997       if (dst.regClass() == v2b) {
1998          Instruction* add_instr =
1999             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
2000          add_instr->valu().clamp = 1;
2001       } else if (dst.regClass() == v1) {
2002          Instruction* add_instr =
2003             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
2004          add_instr->valu().clamp = 1;
2005       } else {
2006          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2007       }
2008       break;
2009    }
2010    case nir_op_uadd_carry: {
           /* Carry-out of an unsigned addition. The sum itself is discarded; only the carry
            * is written to dst (as 0 or 1 in the vector paths, zero-extended to 64 bits in
            * the s2 / v2 paths). */
2011       Temp src0 = get_alu_src(ctx, instr->src[0]);
2012       Temp src1 = get_alu_src(ctx, instr->src[1]);
2013       if (dst.regClass() == s1) {
              /* dst is bound directly to the SCC output of s_add_u32. */
2014          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2015          break;
2016       }
2017       if (dst.regClass() == v1) {
              /* Turn the per-lane carry mask into the integer constants 0 / 1. */
2018          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
2019          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2020                       carry);
2021          break;
2022       }
2023
           /* 64-bit: split both sources into dwords. The low halves keep the register type
            * of their source, the high halves are created with dst's register type. */
2024       Temp src00 = bld.tmp(src0.type(), 1);
2025       Temp src01 = bld.tmp(dst.type(), 1);
2026       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2027       Temp src10 = bld.tmp(src1.type(), 1);
2028       Temp src11 = bld.tmp(dst.type(), 1);
2029       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2030       if (dst.regClass() == s2) {
              /* Chain s_add_u32 -> s_addc_u32 and keep only the final SCC definition. */
2031          Temp carry = bld.tmp(s1);
2032          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
2033          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2034                           bld.scc(carry))
2035                     .def(1)
2036                     .getTemp();
2037          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2038       } else if (dst.regClass() == v2) {
              /* Chain two carry-producing adds, then convert the final mask to 0 / 1. */
2039          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
2040          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
2041          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2042                               Operand::c32(1u), carry);
2043          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2044       } else {
2045          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2046       }
2047       break;
2048    }
2049    case nir_op_isub: {
           /* Integer subtraction (src0 - src1).
            *  - s1: s_sub_i32 through emit_sop2_instruction,
            *  - v1 with a 16-bit NIR def: packed v_pk_sub_u16,
            *  - v1: vsub32,
            *  - dst of at most 2 bytes: a 16-bit subtract chosen by GFX level / operand type,
            *  - s2 / v2: per-dword subtract with a borrow chain. */
2050       if (dst.regClass() == s1) {
2051          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2052          break;
2053       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2054          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2055          break;
2056       }
2057
2058       Temp src0 = get_alu_src(ctx, instr->src[0]);
2059       Temp src1 = get_alu_src(ctx, instr->src[1]);
2060       if (dst.regClass() == v1) {
2061          bld.vsub32(Definition(dst), src0, src1);
2062          break;
2063       } else if (dst.bytes() <= 2) {
              /* GFX10+: _e64 form. Otherwise an SGPR src1 is handled with the reversed
               * opcode (operands passed as src1, src0), GFX8+ uses v_sub_u16, and anything
               * older falls back to the 32-bit subtract. */
2064          if (ctx->program->gfx_level >= GFX10)
2065             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
2066          else if (src1.type() == RegType::sgpr)
2067             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2068          else if (ctx->program->gfx_level >= GFX8)
2069             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2070          else
2071             bld.vsub32(Definition(dst), src0, src1);
2072          break;
2073       }
2074
           /* 64-bit: split both sources into dwords. The low halves keep the register type
            * of their source, the high halves are created with dst's register type. */
2075       Temp src00 = bld.tmp(src0.type(), 1);
2076       Temp src01 = bld.tmp(dst.type(), 1);
2077       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2078       Temp src10 = bld.tmp(src1.type(), 1);
2079       Temp src11 = bld.tmp(dst.type(), 1);
2080       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2081       if (dst.regClass() == s2) {
              /* s_sub_u32 writes the borrow to SCC, s_subb_u32 consumes it for the high dword. */
2082          Temp borrow = bld.tmp(s1);
2083          Temp dst0 =
2084             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2085          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
2086                               bld.scc(borrow));
2087          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2088       } else if (dst.regClass() == v2) {
              /* Low subtract requests a borrow-out; the high subtract takes it as input. */
2089          Temp lower = bld.tmp(v1);
2090          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
2091          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
2092          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2093       } else {
2094          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2095       }
2096       break;
2097    }
2098    case nir_op_usub_borrow: {
           /* Borrow-out of an unsigned subtraction (src0 - src1). The difference itself is
            * discarded; only the borrow is written to dst (as 0 or 1 in the vector paths,
            * zero-extended to 64 bits in the s2 / v2 paths). */
2099       Temp src0 = get_alu_src(ctx, instr->src[0]);
2100       Temp src1 = get_alu_src(ctx, instr->src[1]);
2101       if (dst.regClass() == s1) {
              /* dst is bound directly to the SCC output of s_sub_u32. */
2102          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2103          break;
2104       } else if (dst.regClass() == v1) {
              /* Turn the per-lane borrow mask into the integer constants 0 / 1. */
2105          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
2106          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2107                       borrow);
2108          break;
2109       }
2110
           /* 64-bit: split both sources into dwords. The low halves keep the register type
            * of their source, the high halves are created with dst's register type. */
2111       Temp src00 = bld.tmp(src0.type(), 1);
2112       Temp src01 = bld.tmp(dst.type(), 1);
2113       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2114       Temp src10 = bld.tmp(src1.type(), 1);
2115       Temp src11 = bld.tmp(dst.type(), 1);
2116       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2117       if (dst.regClass() == s2) {
              /* Chain s_sub_u32 -> s_subb_u32 and keep only the final SCC definition. */
2118          Temp borrow = bld.tmp(s1);
2119          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2120          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2121                            bld.scc(borrow))
2122                      .def(1)
2123                      .getTemp();
2124          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2125       } else if (dst.regClass() == v2) {
              /* Chain two borrow-producing subtracts, then convert the final mask to 0 / 1. */
2126          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2127          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2128          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2129                                Operand::c32(1u), borrow);
2130          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2131       } else {
2132          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2133       }
2134       break;
2135    }
2136    case nir_op_usub_sat: {
           /* Saturating unsigned subtract. Depending on the destination class this either sets
            * the clamp bit on a VALU subtract or selects between the raw difference and the
            * constant 0 using the subtract's second (carry) definition. */
2137       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* 16-bit values in a v1 destination: packed subtract with clamp. */
2138          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2139          sub_instr->valu().clamp = 1;
2140          break;
2141       }
2142       Temp src0 = get_alu_src(ctx, instr->src[0]);
2143       Temp src1 = get_alu_src(ctx, instr->src[1]);
2144       if (dst.regClass() == s1) {
              /* s_sub_u32 writes the difference and SCC; s_cselect_b32 then picks between the
               * constant 0 and that difference based on the SCC value. */
2145          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2146          bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2147          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2148          break;
2149       } else if (dst.regClass() == v2b) {
2150          Instruction* sub_instr;
2151          if (ctx->program->gfx_level >= GFX10) {
2152             sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2153          } else {
                 /* Before GFX10: when src1 is an SGPR, swap the operands and use the reversed
                  * subtract so that the operand passed through as_vgpr() is the former src0. */
2154             aco_opcode op = aco_opcode::v_sub_u16;
2155             if (src1.type() == RegType::sgpr) {
2156                std::swap(src0, src1);
2157                op = aco_opcode::v_subrev_u16;
2158             }
2159             sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2160          }
2161          sub_instr->valu().clamp = 1;
2162          break;
2163       } else if (dst.regClass() == v1) {
              /* 32-bit VGPR case is delegated to the usub32_sat helper (defined elsewhere). */
2164          usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2165          break;
2166       }
2167
           /* Remaining handled classes are s2 and v2; both sources must be two registers wide
            * and are split into size-1 halves. */
2168       assert(src0.size() == 2 && src1.size() == 2);
2169       Temp src00 = bld.tmp(src0.type(), 1);
2170       Temp src01 = bld.tmp(src0.type(), 1);
2171       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2172       Temp src10 = bld.tmp(src1.type(), 1);
2173       Temp src11 = bld.tmp(src1.type(), 1);
2174       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2175
2176       if (dst.regClass() == s2) {
              /* s_sub_u32 / s_subb_u32 chain produces the unsaturated halves; the SCC of the
               * second subtract (carry1) steers s_cselect_b64 between 0 and that value. */
2177          Temp carry0 = bld.tmp(s1);
2178          Temp carry1 = bld.tmp(s1);
2179
2180          Temp no_sat0 =
2181             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2182          Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2183                                  src01, src11, bld.scc(carry0));
2184
2185          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2186
2187          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2188                   bld.scc(carry1));
2189       } else if (dst.regClass() == v2) {
2190          Temp no_sat0 = bld.tmp(v1);
2191          Temp dst0 = bld.tmp(v1);
2192          Temp dst1 = bld.tmp(v1);
2193
2194          Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2195          Temp carry1;
2196
2197          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: the second-half subtract writes dst1 directly with the clamp bit set
                  * and still yields carry1 for the first half. */
2198             carry1 = bld.tmp(bld.lm);
2199             bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2200                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2201                ->valu()
2202                .clamp = 1;
2203          } else {
                 /* Older hardware: no clamp is used; dst1 is chosen between the unsaturated
                  * second half and 0 with v_cndmask_b32 on carry1. */
2204             Temp no_sat1 = bld.tmp(v1);
2205             carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2206             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2207                          carry1);
2208          }
2209
              /* The first half is always selected between no_sat0 and 0 using carry1. */
2210          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2211                       carry1);
2212          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2213       } else {
2214          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2215       }
2216       break;
2217    }
2218    case nir_op_isub_sat: {
           /* Saturating signed subtract. VALU paths set the clamp bit on a signed subtract;
            * the scalar path builds the saturation bound by hand. */
2219       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* 16-bit values in a v1 destination: packed subtract with clamp. */
2220          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2221          sub_instr->valu().clamp = 1;
2222          break;
2223       }
2224       Temp src0 = get_alu_src(ctx, instr->src[0]);
2225       Temp src1 = get_alu_src(ctx, instr->src[1]);
2226       if (dst.regClass() == s1) {
              /* bound = INT32_MAX + (result of src1 > 0). Assuming the compare result reads as
               * 0/1, that is INT32_MAX, or its 32-bit wrap-around when src1 > 0. The SCC of
               * s_sub_i32 (named overflow) then selects between bound and the plain difference. */
2227          Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2228          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2229                                Operand::c32(INT32_MAX), cond);
2230          Temp overflow = bld.tmp(s1);
2231          Temp sub =
2232             bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2233          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2234          break;
2235       }
2236
           /* Both remaining paths pass src1 as a VGPR. */
2237       src1 = as_vgpr(ctx, src1);
2238
2239       if (dst.regClass() == v2b) {
2240          Instruction* sub_instr =
2241             bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2242          sub_instr->valu().clamp = 1;
2243       } else if (dst.regClass() == v1) {
2244          Instruction* sub_instr =
2245             bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2246          sub_instr->valu().clamp = 1;
2247       } else {
2248          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2249       }
2250       break;
2251    }
2252    case nir_op_imul: {
           /* Integer multiply. The branch order matters: destinations of at most two bytes are
            * tested first, then packed 16-bit in v1, then any other VGPR destination, then s1. */
2253       if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2254          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2255       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2256          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2257       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2258          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2259       } else if (dst.type() == RegType::vgpr) {
              /* Upper bounds reported by get_alu_src_ub() choose the multiply variant. */
2260          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2261          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2262
2263          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both bounds fit in 24 bits: use v_mul_u32_u24. The product in nuw_16bit is
                  * only evaluated after both bounds were checked to be <= 0xffff, so the
                  * uint32_t multiplication cannot wrap. */
2264             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2265             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2266                                   true /* commutative */, false, false, nuw_16bit);
2267          } else if (nir_src_is_const(instr->src[0].src)) {
                 /* One constant source: multiply the other source by the immediate. */
2268             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2269                           nir_src_as_uint(instr->src[0].src), false);
2270          } else if (nir_src_is_const(instr->src[1].src)) {
2271             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2272                           nir_src_as_uint(instr->src[1].src), false);
2273          } else {
2274             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2275          }
2276       } else if (dst.regClass() == s1) {
2277          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2278       } else {
2279          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2280       }
2281       break;
2282    }
2283    case nir_op_umul_high: {
           /* High half of an unsigned multiply. s1 uses s_mul_hi_u32 on GFX9+; any other
            * 4-byte destination (including s1 on older hardware) goes through a VALU multiply. */
2284       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2285          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2286       } else if (dst.bytes() == 4) {
2287          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2288          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2289
              /* For an s1 destination the VALU result lands in a scratch v1 first. */
2290          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2291          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both upper bounds fit in 24 bits. */
2292             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2293          } else {
2294             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2295          }
2296
              /* Move the scratch VGPR result into the SGPR destination. */
2297          if (dst.regClass() == s1)
2298             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2299       } else {
2300          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2301       }
2302       break;
2303    }
2304    case nir_op_imul_high: {
           /* High half of a signed multiply: v_mul_hi_i32 for v1, s_mul_hi_i32 for s1 on
            * GFX9+, otherwise a VALU multiply whose result is copied to the SGPR destination
            * with p_as_uniform. */
2305       if (dst.regClass() == v1) {
2306          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2307       } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2308          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2309       } else if (dst.regClass() == s1) {
              /* Only the second operand is forced into a VGPR here. */
2310          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2311                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2312          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2313       } else {
2314          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2315       }
2316       break;
2317    }
2318    case nir_op_fmul: {
           /* Float multiply, dispatched on the destination class: v2b -> v_mul_f16, v1 holding
            * 16-bit values -> packed v_pk_mul_f16, v1 -> v_mul_f32, v2 -> v_mul_f64. The packed
            * test must come before the plain v1 test because both match regClass() == v1. */
2319       if (dst.regClass() == v2b) {
2320          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2321       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2322          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2323       } else if (dst.regClass() == v1) {
2324          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2325       } else if (dst.regClass() == v2) {
2326          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2327       } else {
2328          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2329       }
2330       break;
2331    }
2332    case nir_op_fmulz: {
           /* Only a v1 destination is supported; it maps to v_mul_legacy_f32. Any other
            * destination class is reported through isel_err. */
2333       if (dst.regClass() == v1) {
2334          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2335       } else {
2336          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2337       }
2338       break;
2339    }
2340    case nir_op_fadd: {
           /* Float add, dispatched on the destination class: v2b -> v_add_f16, v1 holding
            * 16-bit values -> packed v_pk_add_f16, v1 -> v_add_f32, v2 -> v_add_f64. The packed
            * test must come before the plain v1 test because both match regClass() == v1. */
2341       if (dst.regClass() == v2b) {
2342          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2343       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2344          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2345       } else if (dst.regClass() == v1) {
2346          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2347       } else if (dst.regClass() == v2) {
2348          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2349       } else {
2350          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2351       }
2352       break;
2353    }
2354    case nir_op_fsub: {
           /* Float subtract. The packed and 64-bit paths are expressed as an add with the
            * second operand negated; the 16/32-bit scalar-lane paths pick between the sub and
            * subrev opcodes depending on which source lives in a VGPR. */
2355       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: v_pk_add_f16 with both neg bits set on operand 1. */
2356          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2357          VALU_instruction& sub = add->valu();
2358          sub.neg_lo[1] = true;
2359          sub.neg_hi[1] = true;
2360          break;
2361       }
2362
2363       Temp src0 = get_alu_src(ctx, instr->src[0]);
2364       Temp src1 = get_alu_src(ctx, instr->src[1]);
2365       if (dst.regClass() == v2b) {
              /* subrev is chosen only when src0 is a VGPR and src1 is not. The trailing bool
               * is presumably the "commutative" flag of emit_vop2_instruction, letting the
               * helper swap the operands; confirm against its definition. */
2366          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2367             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2368          else
2369             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2370       } else if (dst.regClass() == v1) {
              /* Same selection rule as the v2b path, with the f32 opcodes. */
2371          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2372             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2373          else
2374             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2375       } else if (dst.regClass() == v2) {
              /* 64-bit: v_add_f64 on VGPR copies of both sources with neg set on operand 1. */
2376          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2377                                      as_vgpr(ctx, src1));
2378          add->valu().neg[1] = true;
2379       } else {
2380          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2381       }
2382       break;
2383    }
2384    case nir_op_ffma: {
           /* Fused multiply-add with three sources: v2b -> v_fma_f16, v1 holding 16-bit values
            * -> packed v_pk_fma_f16, v1 -> v_fma_f32, v2 -> v_fma_f64. The trailing `3` passed
            * to emit_vop3a_instruction is presumably the source count; confirm against the
            * helper's definition. */
2385       if (dst.regClass() == v2b) {
2386          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2387       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* The packed form is only expected for two-component results. */
2388          assert(instr->def.num_components == 2);
2389
2390          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2391          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2392          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2393
2394          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
              /* Bit i of opsel_lo / opsel_hi comes from source i's swizzle for component 0 / 1. */
2395          unsigned opsel_lo = 0, opsel_hi = 0;
2396          for (unsigned i = 0; i < 3; i++) {
2397             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2398             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2399          }
2400
2401          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2402       } else if (dst.regClass() == v1) {
              /* The 32-bit path forwards the block's must_flush_denorms32 setting. */
2403          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2404                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2405       } else if (dst.regClass() == v2) {
2406          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2407       } else {
2408          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2409       }
2410       break;
2411    }
2412    case nir_op_ffmaz: {
           /* Only a v1 destination is supported; it maps to v_fma_legacy_f32 and forwards the
            * block's must_flush_denorms32 setting, like the 32-bit ffma path. */
2413       if (dst.regClass() == v1) {
2414          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2415                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2416       } else {
2417          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2418       }
2419       break;
2420    }
2421    case nir_op_fmax: {
           /* Float maximum, dispatched on the destination class. The 32-bit and 64-bit paths
            * forward the block's must_flush_denorms* setting to the emit helper; the 16-bit
            * paths currently do not (see the TODO below). */
2422       if (dst.regClass() == v2b) {
2423          // TODO: check fp_mode.must_flush_denorms16_64
2424          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2425       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2426          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2427       } else if (dst.regClass() == v1) {
2428          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2429                                ctx->block->fp_mode.must_flush_denorms32);
2430       } else if (dst.regClass() == v2) {
2431          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2432                                 ctx->block->fp_mode.must_flush_denorms16_64);
2433       } else {
2434          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2435       }
2436       break;
2437    }
2438    case nir_op_fmin: {
           /* Float minimum, dispatched on the destination class. The 32-bit and 64-bit paths
            * forward the block's must_flush_denorms* setting to the emit helper; the 16-bit
            * paths currently do not (see the TODO below). */
2439       if (dst.regClass() == v2b) {
2440          // TODO: check fp_mode.must_flush_denorms16_64
2441          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2442       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* NOTE(review): unlike the packed nir_op_fmax path, this call passes an extra
               * `true`. Its meaning is defined by emit_vop3p_instruction, which is not visible
               * here; confirm the asymmetry is intended. */
2443          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2444       } else if (dst.regClass() == v1) {
2445          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2446                                ctx->block->fp_mode.must_flush_denorms32);
2447       } else if (dst.regClass() == v2) {
2448          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2449                                 ctx->block->fp_mode.must_flush_denorms16_64);
2450       } else {
2451          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2452       }
2453       break;
2454    }
2455    case nir_op_sdot_4x8_iadd: {
           /* GFX11+ uses v_dot4_i32_iu8 with the extra argument 0x3; earlier hardware uses
            * v_dot4_i32_i8. The bool argument is false here and true in the _sat variant;
            * presumably it requests clamping (confirm in emit_idot_instruction). */
2456       if (ctx->options->gfx_level >= GFX11)
2457          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2458       else
2459          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2460       break;
2461    }
2462    case nir_op_sdot_4x8_iadd_sat: {
           /* Same opcode selection as nir_op_sdot_4x8_iadd, with the bool argument set. */
2463       if (ctx->options->gfx_level >= GFX11)
2464          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2465       else
2466          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2467       break;
2468    }
2469    case nir_op_sudot_4x8_iadd: {
           /* Always v_dot4_i32_iu8 with the extra argument 0x1 (the sdot cases pass 0x3);
            * no per-generation fallback is provided here. */
2470       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2471       break;
2472    }
2473    case nir_op_sudot_4x8_iadd_sat: {
           /* Same as nir_op_sudot_4x8_iadd, with the bool argument set. */
2474       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2475       break;
2476    }
2477    case nir_op_udot_4x8_uadd: {
           /* Maps directly to v_dot4_u32_u8 with the bool argument cleared. */
2478       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2479       break;
2480    }
2481    case nir_op_udot_4x8_uadd_sat: {
           /* Same opcode as nir_op_udot_4x8_uadd, with the bool argument set. */
2482       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2483       break;
2484    }
2485    case nir_op_sdot_2x16_iadd: {
           /* The four 2x16 dot-product cases differ only in opcode (v_dot2_i32_i16 vs
            * v_dot2_u32_u16) and in the bool argument, which is true for the _sat variants. */
2486       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2487       break;
2488    }
2489    case nir_op_sdot_2x16_iadd_sat: {
2490       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2491       break;
2492    }
2493    case nir_op_udot_2x16_uadd: {
2494       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2495       break;
2496    }
2497    case nir_op_udot_2x16_uadd_sat: {
2498       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2499       break;
2500    }
2501    case nir_op_cube_amd: {
           /* Reads a three-component source, extracts each component as a v1, and runs the
            * four v_cube*_f32 instructions on the same operand triple. Note the output order:
            * dst is assembled as (tc, sc, ma, id), which differs from the order in which the
            * instructions are emitted. */
2502       Temp in = get_alu_src(ctx, instr->src[0], 3);
2503       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2504                      emit_extract_vector(ctx, in, 2, v1)};
2505       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2506       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2507       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2508       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2509       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2510       break;
2511    }
2512    case nir_op_bcsel: {
           /* Entirely delegated to emit_bcsel(), which is defined elsewhere in this file. */
2513       emit_bcsel(ctx, instr, dst);
2514       break;
2515    }
2516    case nir_op_frsq: {
           /* Reciprocal square root: v2b -> v_rsq_f16, v1 -> the emit_rsq() helper, v2 ->
            * v_rsq_f64. */
2517       if (dst.regClass() == v2b) {
2518          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2519       } else if (dst.regClass() == v1) {
2520          Temp src = get_alu_src(ctx, instr->src[0]);
2521          emit_rsq(ctx, bld, Definition(dst), src);
2522       } else if (dst.regClass() == v2) {
2523          /* Lowered at NIR level for precision reasons. */
              /* NOTE(review): presumably this means a 64-bit frsq reaching this point is only
               * the raw hardware step of an already-lowered sequence; confirm. */
2524          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2525       } else {
2526          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2527       }
2528       break;
2529    }
2530    case nir_op_fneg: {
           /* Float negate. 16/32-bit paths are expressed as a multiply; the 64-bit path flips
            * the top bit of the second dword directly. */
2531       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: multiply by the 16-bit constant 0x3C00 with both neg bits set on
               * operand 0; the opsel arguments come from the source swizzle. */
2532          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2533          Instruction* vop3p =
2534             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2535                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2536          vop3p->valu().neg_lo[0] = true;
2537          vop3p->valu().neg_hi[0] = true;
2538          break;
2539       }
2540       Temp src = get_alu_src(ctx, instr->src[0]);
2541       if (dst.regClass() == v2b) {
              /* Multiply by the 16-bit constant 0xbc00 (0x3c00 with the top bit set). */
2542          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2543       } else if (dst.regClass() == v1) {
              /* Multiply by the 32-bit constant 0xbf800000 (0x3f800000 with the top bit set). */
2544          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2545                   as_vgpr(ctx, src));
2546       } else if (dst.regClass() == v2) {
              /* When denormals must be flushed, first route the value through a v_mul_f64 by
               * 0x3FF0000000000000, since the bit manipulation below bypasses the float unit. */
2547          if (ctx->block->fp_mode.must_flush_denorms16_64)
2548             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2549                            as_vgpr(ctx, src));
              /* Split, XOR the second dword with 0x80000000, and reassemble. */
2550          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2551          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2552          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2553          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2554       } else {
2555          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556       }
2557       break;
2558    }
2559    case nir_op_fabs: {
           /* Float absolute value. The packed path uses a max of the source with its own
            * negation; the 16/32-bit paths use a multiply with the abs modifier; the 64-bit
            * path clears the top bit of the second dword directly. */
2560       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v_pk_max_f16(src, -src): both operands are the same temp, so each opsel mask
               * is either 0 or 3 (both operand bits) depending on the swizzle. */
2561          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2562          Instruction* vop3p =
2563             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2564                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2565                .instr;
2566          vop3p->valu().neg_lo[1] = true;
2567          vop3p->valu().neg_hi[1] = true;
2568          break;
2569       }
2570       Temp src = get_alu_src(ctx, instr->src[0]);
2571       if (dst.regClass() == v2b) {
              /* Multiply the 16-bit constant 0x3c00 by the source, with abs on operand 1. */
2572          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2573                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2574                                .instr;
2575          mul->valu().abs[1] = true;
2576       } else if (dst.regClass() == v1) {
              /* Multiply the 32-bit constant 0x3f800000 by the source, with abs on operand 1. */
2577          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2578                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2579                                .instr;
2580          mul->valu().abs[1] = true;
2581       } else if (dst.regClass() == v2) {
              /* When denormals must be flushed, first route the value through a v_mul_f64 by
               * 0x3FF0000000000000, since the bit manipulation below bypasses the float unit. */
2582          if (ctx->block->fp_mode.must_flush_denorms16_64)
2583             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2584                            as_vgpr(ctx, src));
              /* Split, AND the second dword with 0x7FFFFFFF, and reassemble. */
2585          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2586          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2587          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2588          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2589       } else {
2590          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2591       }
2592       break;
2593    }
2594    case nir_op_fsat: {
           /* Float saturate. The packed and 64-bit paths set the clamp bit on an arithmetic
            * instruction; the 16/32-bit paths take the median of the source and two constants
            * (0 and 0x3c00 / 0x3f800000). */
2595       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed: multiply by the 16-bit constant 0x3C00 with clamp set. */
2596          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2597          Instruction* vop3p =
2598             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2599                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2600          vop3p->valu().clamp = true;
2601          break;
2602       }
2603       Temp src = get_alu_src(ctx, instr->src[0]);
2604       if (dst.regClass() == v2b) {
2605          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2606                   src);
2607       } else if (dst.regClass() == v1) {
2608          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2609                   Operand::c32(0x3f800000u), src);
2610          /* apparently, it is not necessary to flush denorms if this instruction is used with these
2611           * operands */
2612          // TODO: confirm that this holds under any circumstances
2613       } else if (dst.regClass() == v2) {
              /* 64-bit: add zero to the source with clamp set. */
2614          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2615          add->valu().clamp = true;
2616       } else {
2617          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2618       }
2619       break;
2620    }
2621    case nir_op_flog2: {
           /* Base-2 logarithm: v2b -> v_log_f16, v1 -> the emit_log2() helper. No 64-bit
            * path exists here; other destination classes are reported through isel_err. */
2622       if (dst.regClass() == v2b) {
2623          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2624       } else if (dst.regClass() == v1) {
2625          Temp src = get_alu_src(ctx, instr->src[0]);
2626          emit_log2(ctx, bld, Definition(dst), src);
2627       } else {
2628          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2629       }
2630       break;
2631    }
2632    case nir_op_frcp: {
           /* Reciprocal: v2b -> v_rcp_f16, v1 -> the emit_rcp() helper, v2 -> v_rcp_f64. */
2633       if (dst.regClass() == v2b) {
2634          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2635       } else if (dst.regClass() == v1) {
2636          Temp src = get_alu_src(ctx, instr->src[0]);
2637          emit_rcp(ctx, bld, Definition(dst), src);
2638       } else if (dst.regClass() == v2) {
2639          /* Lowered at NIR level for precision reasons. */
              /* NOTE(review): presumably this means a 64-bit frcp reaching this point is only
               * the raw hardware step of an already-lowered sequence; confirm. */
2640          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2641       } else {
2642          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2643       }
2644       break;
2645    }
2646    case nir_op_fexp2: {
           /* Base-2 exponential: v2b -> v_exp_f16, v1 -> v_exp_f32. Unlike flog2, the 32-bit
            * path emits the instruction directly rather than going through a helper. */
2647       if (dst.regClass() == v2b) {
2648          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2649       } else if (dst.regClass() == v1) {
2650          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2651       } else {
2652          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2653       }
2654       break;
2655    }
2656    case nir_op_fsqrt: {
           /* Square root: v2b -> v_sqrt_f16, v1 -> the emit_sqrt() helper, v2 -> v_sqrt_f64. */
2657       if (dst.regClass() == v2b) {
2658          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2659       } else if (dst.regClass() == v1) {
2660          Temp src = get_alu_src(ctx, instr->src[0]);
2661          emit_sqrt(ctx, bld, Definition(dst), src);
2662       } else if (dst.regClass() == v2) {
2663          /* Lowered at NIR level for precision reasons. */
              /* NOTE(review): presumably this means a 64-bit fsqrt reaching this point is only
               * the raw hardware step of an already-lowered sequence; confirm. */
2664          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2665       } else {
2666          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2667       }
2668       break;
2669    }
2670    case nir_op_ffract: {
           /* Fractional part: one VOP1 instruction per size (v_fract_f16 / _f32 / _f64),
            * with no per-generation special cases. */
2671       if (dst.regClass() == v2b) {
2672          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2673       } else if (dst.regClass() == v1) {
2674          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2675       } else if (dst.regClass() == v2) {
2676          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2677       } else {
2678          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2679       }
2680       break;
2681    }
2682    case nir_op_ffloor: {
           /* Floor: v_floor_f16 / v_floor_f32 for the 16/32-bit classes; the 64-bit case is
            * delegated to the emit_floor_f64() helper (defined elsewhere). */
2683       if (dst.regClass() == v2b) {
2684          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2685       } else if (dst.regClass() == v1) {
2686          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2687       } else if (dst.regClass() == v2) {
2688          Temp src = get_alu_src(ctx, instr->src[0]);
2689          emit_floor_f64(ctx, bld, Definition(dst), src);
2690       } else {
2691          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2692       }
2693       break;
2694    }
2695    case nir_op_fceil: {
           /* Ceiling: v_ceil_f16 / v_ceil_f32 for the 16/32-bit classes; the 64-bit case uses
            * v_ceil_f64 on GFX7+ and a manual sequence below that. */
2696       if (dst.regClass() == v2b) {
2697          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2698       } else if (dst.regClass() == v1) {
2699          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2700       } else if (dst.regClass() == v2) {
2701          if (ctx->options->gfx_level >= GFX7) {
2702             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2703          } else {
2704             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2705             /* trunc = trunc(src0)
2706              * if (src0 > 0.0 && src0 != trunc)
2707              *    trunc += 1.0
2708              */
2709             Temp src0 = get_alu_src(ctx, instr->src[0]);
2710             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
                 /* tmp0: src0 > 0, tmp1: src0 != trunc; cond is their bitwise AND.
                  * NOTE(review): cond is built with s_and_b64 into an s2 definition while the
                  * compares use bld.lm; presumably this relies on this path only running with
                  * 64-bit lane masks; confirm. */
2711             Temp tmp0 =
2712                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2713             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2714             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
                 /* Build the 64-bit addend: second dword is selected between 0 and 0x3ff00000
                  * by cond, first dword is always 0. */
2715             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2716                                 bld.copy(bld.def(v1), Operand::zero()),
2717                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2718             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2719                              bld.copy(bld.def(v1), Operand::zero()), add);
2720             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2721          }
2722       } else {
2723          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2724       }
2725       break;
2726    }
2727    case nir_op_ftrunc: {
           /* Truncate: v_trunc_f16 / v_trunc_f32 for the 16/32-bit classes; the 64-bit case is
            * delegated to the emit_trunc_f64() helper (defined elsewhere). */
2728       if (dst.regClass() == v2b) {
2729          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2730       } else if (dst.regClass() == v1) {
2731          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2732       } else if (dst.regClass() == v2) {
2733          Temp src = get_alu_src(ctx, instr->src[0]);
2734          emit_trunc_f64(ctx, bld, Definition(dst), src);
2735       } else {
2736          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2737       }
2738       break;
2739    }
2740    case nir_op_fround_even: {
           /* fround_even: v_rndne_f16/f32 for 16/32-bit destinations. For
            * 64-bit, v_rndne_f64 is used on GFX7+, otherwise the operation is
            * lowered by hand below.
            */
2741       if (dst.regClass() == v2b) {
2742          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2743       } else if (dst.regClass() == v1) {
2744          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2745       } else if (dst.regClass() == v2) {
2746          if (ctx->options->gfx_level >= GFX7) {
2747             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2748          } else {
2749             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2750             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2751             Temp src0 = get_alu_src(ctx, instr->src[0]);
2752             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2753
                 /* Bit-reversing 0xfffffffe gives 0x7fffffff, i.e. every bit
                  * except the sign bit.
                  */
2754             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2755                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
                 /* Merge 0x43300000 (the high dword of the double 2^52) with
                  * src0_hi under that mask. Presumably v_bfi_b32 takes the
                  * masked bits from the constant and the remaining (sign) bit
                  * from src0_hi, yielding the high dword of copysign(2^52, src0)
                  * - confirm against the ISA definition of v_bfi_b32.
                  */
2756             Temp bfi =
2757                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2758                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
                 /* Add the {0, bfi} double to src0, then add it again with the
                  * second operand negated (neg[1]), i.e. subtract it.
                  */
2759             Temp tmp =
2760                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2761                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2762             Instruction* sub =
2763                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2764                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2765             sub->valu().neg[1] = true;
2766             tmp = sub->definitions[0].getTemp();
2767
                 /* v is the double 0x432fffffffffffff, the largest value below
                  * 2^52. cond = |src0| > v (abs[0] applies to the first source).
                  */
2768             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2769                                 Operand::c32(0x432fffffu));
2770             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2771             vop3->valu().abs[0] = true;
2772             Temp cond = vop3->definitions[0].getTemp();
2773
                 /* Per dword, choose between the add/sub result (tmp) and the
                  * original source depending on cond; presumably the source is
                  * kept where cond is set - confirm v_cndmask_b32 operand order.
                  */
2774             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2775             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2776             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2777                                      as_vgpr(ctx, src0_lo), cond);
2778             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2779                                      as_vgpr(ctx, src0_hi), cond);
2780
2781             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2782          }
2783       } else {
2784          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2785       }
2786       break;
2787    }
2788    case nir_op_fsin_amd:
2789    case nir_op_fcos_amd: {
           /* fsin_amd/fcos_amd: pick v_sin/v_cos of the matching width (16 or
            * 32 bit) based on instr->op. The source is always moved to a VGPR.
            * Other destination classes (e.g. 64-bit) are reported as an error.
            */
2790       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
           /* NOTE(review): `norm` is declared but not used anywhere in this case. */
2791       aco_ptr<Instruction> norm;
2792       if (dst.regClass() == v2b) {
2793          aco_opcode opcode =
2794             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2795          bld.vop1(opcode, Definition(dst), src);
2796       } else if (dst.regClass() == v1) {
2797          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
              /* so the input is first reduced with v_fract_f32 on those chips. */
2798          if (ctx->options->gfx_level < GFX9)
2799             src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
2800
2801          aco_opcode opcode =
2802             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2803          bld.vop1(opcode, Definition(dst), src);
2804       } else {
2805          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2806       }
2807       break;
2808    }
2809    case nir_op_ldexp: {
           /* ldexp: v_ldexp_f16 is emitted through the VOP2 helper, while the
            * 32- and 64-bit variants go through the VOP3 helper.
            */
2810       if (dst.regClass() == v2b) {
2811          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2812       } else if (dst.regClass() == v1) {
2813          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2814       } else if (dst.regClass() == v2) {
2815          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2816       } else {
2817          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2818       }
2819       break;
2820    }
2821    case nir_op_frexp_sig: {
           /* frexp_sig: select v_frexp_mant_{f16,f32,f64} by destination
            * register class.
            */
2822       if (dst.regClass() == v2b) {
2823          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2824       } else if (dst.regClass() == v1) {
2825          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2826       } else if (dst.regClass() == v2) {
2827          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2828       } else {
2829          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2830       }
2831       break;
2832    }
2833    case nir_op_frexp_exp: {
           /* frexp_exp: dispatch on the *source* bit size (not the destination
            * register class).
            */
2834       if (instr->src[0].src.ssa->bit_size == 16) {
              /* The 16-bit result is written to a v1 temporary, its lowest byte
               * is extracted and then sign-extended from 8 to 32 bits into dst.
               * Presumably the f16 exponent always fits in 8 bits - TODO confirm.
               */
2835          Temp src = get_alu_src(ctx, instr->src[0]);
2836          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2837          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2838          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2839       } else if (instr->src[0].src.ssa->bit_size == 32) {
2840          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2841       } else if (instr->src[0].src.ssa->bit_size == 64) {
2842          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2843       } else {
2844          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2845       }
2846       break;
2847    }
2848    case nir_op_fsign: {
           /* fsign. 16/32-bit: add zero, take the median of (-1, bits, 1) with an
            * integer med3 on the float's bit pattern, then convert that integer
            * to float. 64-bit: build the high dword of the result with two
            * compare+select steps and pair it with a zero low dword.
            */
2849       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2850       if (dst.regClass() == v2b) {
2851          assert(ctx->program->gfx_level >= GFX9);
2852          /* replace negative zero with positive zero */
2853          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2854          src =
2855             bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2856          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2857       } else if (dst.regClass() == v1) {
2858          src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2859          src =
2860             bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2861          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2862       } else if (dst.regClass() == v2) {
              /* First select: between 0x3FF00000 (high dword of 1.0) and the
               * source's own high dword, controlled by !(0 < src).
               */
2863          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2864          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2865          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2866                                    emit_extract_vector(ctx, src, 1, v1), cond);
2867
              /* Second select: between 0xBFF00000 (high dword of -1.0) and the
               * previous result, controlled by (0 <= src).
               */
2868          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2869          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2870          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2871
2872          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2873       } else {
2874          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2875       }
2876       break;
2877    }
2878    case nir_op_f2f16:
2879    case nir_op_f2f16_rtne: {
           /* f2f16 / f2f16_rtne: a 64-bit source is first narrowed to f32 with
            * v_cvt_f32_f64. The rtne variant uses the p_cvt_f16_f32_rtne pseudo
            * when the block's 16/64-bit rounding mode is not round-to-nearest-even;
            * otherwise a plain v_cvt_f16_f32 is emitted.
            */
2880       Temp src = get_alu_src(ctx, instr->src[0]);
2881       if (instr->src[0].src.ssa->bit_size == 64)
2882          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2883       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2884          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2885           * keep value numbering and the scheduler simpler.
2886           */
2887          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2888       else
2889          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2890       break;
2891    }
2892    case nir_op_f2f16_rtz: {
           /* f2f16_rtz: a 64-bit source is first narrowed to f32. If the block's
            * 16/64-bit rounding mode is already round-toward-zero, a plain
            * v_cvt_f16_f32 is used. Otherwise v_cvt_pkrtz_f16_f32 is emitted:
            * the _e64 opcode with a zero second operand on GFX8/GFX9, and the
            * VOP2 opcode with src passed as both operands on other chips.
            */
2893       Temp src = get_alu_src(ctx, instr->src[0]);
2894       if (instr->src[0].src.ssa->bit_size == 64)
2895          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2896       if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2897          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2898       else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2899          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2900       else
2901          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2902       break;
2903    }
2904    case nir_op_f2f32: {
           /* f2f32: widen from f16 or narrow from f64, chosen by source bit size. */
2905       if (instr->src[0].src.ssa->bit_size == 16) {
2906          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2907       } else if (instr->src[0].src.ssa->bit_size == 64) {
2908          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2909       } else {
2910          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2911       }
2912       break;
2913    }
2914    case nir_op_f2f64: {
           /* f2f64: a 16-bit source is converted to f32 first, then v_cvt_f64_f32
            * produces the result.
            */
2915       Temp src = get_alu_src(ctx, instr->src[0]);
2916       if (instr->src[0].src.ssa->bit_size == 16)
2917          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2918       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2919       break;
2920    }
2921    case nir_op_i2f16: {
           /* i2f16: signed integer to f16. Inputs of at most 16 bits use
            * v_cvt_f16_i16 on GFX8+; everything else goes i32 -> f32 -> f16.
            */
2922       assert(dst.regClass() == v2b);
2923       Temp src = get_alu_src(ctx, instr->src[0]);
2924       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2925       if (input_size <= 16) {
2926          /* Expand integer to the size expected by the int→float converter used below */
2927          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2928          if (input_size != target_size) {
                 /* sign-extending conversion (last argument is true) */
2929             src = convert_int(ctx, bld, src, input_size, target_size, true);
2930          }
2931       }
2932
2933       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2934          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2935       } else {
2936          /* Large 32bit inputs need to return +-inf/FLOAT_MAX.
2937           *
2938           * This is also the fallback-path taken on GFX7 and earlier, which
2939           * do not support direct f16⟷i16 conversions.
2940           */
2941          src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2942          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2943       }
2944       break;
2945    }
2946    case nir_op_i2f32: {
           /* i2f32: signed integer (up to 32 bits) to f32 via v_cvt_f32_i32.
            * Wider inputs are reported as unimplemented here.
            */
2947       assert(dst.size() == 1);
2948       Temp src = get_alu_src(ctx, instr->src[0]);
2949       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2950       if (input_size <= 32) {
2951          if (input_size <= 16) {
2952             /* Sign-extend to 32-bits */
2953             src = convert_int(ctx, bld, src, input_size, 32, true);
2954          }
2955          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2956       } else {
2957          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2958       }
2959       break;
2960    }
2961    case nir_op_i2f64: {
           /* i2f64: signed integer (up to 32 bits) to f64 via v_cvt_f64_i32;
            * 8/16-bit inputs are sign-extended to 32 bits first. Wider inputs
            * are reported as unimplemented here.
            */
2962       if (instr->src[0].src.ssa->bit_size <= 32) {
2963          Temp src = get_alu_src(ctx, instr->src[0]);
2964          if (instr->src[0].src.ssa->bit_size <= 16)
2965             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2966          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2967       } else {
2968          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2969       }
2970       break;
2971    }
2972    case nir_op_u2f16: {
           /* u2f16: unsigned integer to f16. Inputs of at most 16 bits use
            * v_cvt_f16_u16 on GFX8+; everything else goes u32 -> f32 -> f16.
            */
2973       assert(dst.regClass() == v2b);
2974       Temp src = get_alu_src(ctx, instr->src[0]);
2975       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2976       if (input_size <= 16) {
2977          /* Expand integer to the size expected by the uint→float converter used below */
2978          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2979          if (input_size != target_size) {
                 /* zero-extending conversion (last argument is false) */
2980             src = convert_int(ctx, bld, src, input_size, target_size, false);
2981          }
2982       }
2983
2984       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2985          bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2986       } else {
2987          /* Large 32bit inputs need to return inf/FLOAT_MAX.
2988           *
2989           * This is also the fallback-path taken on GFX7 and earlier, which
2990           * do not support direct f16⟷u16 conversions.
2991           */
2992          src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2993          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2994       }
2995       break;
2996    }
2997    case nir_op_u2f32: {
           /* u2f32: 8-bit inputs are converted directly with v_cvt_f32_ubyte0;
            * 16-bit inputs are zero-extended to 32 bits and, like 32-bit inputs,
            * converted with v_cvt_f32_u32. Wider inputs are reported as
            * unimplemented here.
            */
2998       assert(dst.size() == 1);
2999       Temp src = get_alu_src(ctx, instr->src[0]);
3000       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3001       if (input_size == 8) {
3002          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3003       } else if (input_size <= 32) {
3004          if (input_size == 16)
3005             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3006          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3007       } else {
3008          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3009       }
3010       break;
3011    }
3012    case nir_op_u2f64: {
           /* u2f64: unsigned integer (up to 32 bits) to f64 via v_cvt_f64_u32;
            * 8/16-bit inputs are zero-extended to 32 bits first. Wider inputs
            * are reported as unimplemented here.
            */
3013       if (instr->src[0].src.ssa->bit_size <= 32) {
3014          Temp src = get_alu_src(ctx, instr->src[0]);
3015          if (instr->src[0].src.ssa->bit_size <= 16)
3016             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3017          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3018       } else {
3019          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3020       }
3021       break;
3022    }
3023    case nir_op_f2i8:
3024    case nir_op_f2i16: {
           /* f2i8 / f2i16: float to small signed integer, dispatched on the
            * source bit size. Note that the final else treats every source size
            * other than 16 and 32 as 64-bit without a separate check.
            */
3025       if (instr->src[0].src.ssa->bit_size == 16) {
3026          if (ctx->program->gfx_level >= GFX8) {
3027             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3028          } else {
3029             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
                 /* Go f16 -> f32 -> i32, then narrow to the NIR def's bit size.
                  * For an SGPR destination the narrowed value is produced in a
                  * fresh temporary and moved over with p_as_uniform.
                  */
3030             Temp tmp = bld.tmp(v1);
3031             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3032             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3033             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3034                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3035             if (dst.type() == RegType::sgpr) {
3036                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3037             }
3038          }
3039       } else if (instr->src[0].src.ssa->bit_size == 32) {
3040          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3041       } else {
3042          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3043       }
3044       break;
3045    }
3046    case nir_op_f2u8:
3047    case nir_op_f2u16: {
           /* f2u8 / f2u16: float to small unsigned integer, dispatched on the
            * source bit size. Note that the final else treats every source size
            * other than 16 and 32 as 64-bit without a separate check.
            */
3048       if (instr->src[0].src.ssa->bit_size == 16) {
3049          if (ctx->program->gfx_level >= GFX8) {
3050             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3051          } else {
3052             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
                 /* Go f16 -> f32 -> u32, then narrow to the NIR def's bit size.
                  * For an SGPR destination the narrowed value is produced in a
                  * fresh temporary and moved over with p_as_uniform.
                  */
3053             Temp tmp = bld.tmp(v1);
3054             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3055             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3056             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3057                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3058             if (dst.type() == RegType::sgpr) {
3059                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3060             }
3061          }
3062       } else if (instr->src[0].src.ssa->bit_size == 32) {
3063          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3064       } else {
3065          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3066       }
3067       break;
3068    }
3069    case nir_op_f2i32: {
           /* f2i32: float to signed 32-bit integer. A 16-bit source is widened
            * to f32 first; if the destination is not a VGPR the converted value
            * is produced in a VGPR temporary and moved with p_as_uniform.
            */
3070       Temp src = get_alu_src(ctx, instr->src[0]);
3071       if (instr->src[0].src.ssa->bit_size == 16) {
3072          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3073          if (dst.type() == RegType::vgpr) {
3074             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3075          } else {
3076             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3077                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3078          }
3079       } else if (instr->src[0].src.ssa->bit_size == 32) {
3080          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3081       } else if (instr->src[0].src.ssa->bit_size == 64) {
3082          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3083       } else {
3084          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3085       }
3086       break;
3087    }
3088    case nir_op_f2u32: {
           /* f2u32: float to unsigned 32-bit integer. A 16-bit source is widened
            * to f32 first; if the destination is not a VGPR the converted value
            * is produced in a VGPR temporary and moved with p_as_uniform.
            */
3089       Temp src = get_alu_src(ctx, instr->src[0]);
3090       if (instr->src[0].src.ssa->bit_size == 16) {
3091          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3092          if (dst.type() == RegType::vgpr) {
3093             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3094          } else {
3095             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3096                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3097          }
3098       } else if (instr->src[0].src.ssa->bit_size == 32) {
3099          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3100       } else if (instr->src[0].src.ssa->bit_size == 64) {
3101          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3102       } else {
3103          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3104       }
3105       break;
3106    }
3107    case nir_op_b2f16: {
           /* b2f16: boolean (lane mask) to f16; 0x3c00 is the f16 bit pattern
            * of 1.0. Scalar destination: the boolean is turned into a scalar
            * condition and multiplied by 0x3c00. Vector destination: per-lane
            * select between zero and 0x3c00.
            */
3108       Temp src = get_alu_src(ctx, instr->src[0]);
3109       assert(src.regClass() == bld.lm);
3110
3111       if (dst.regClass() == s1) {
3112          src = bool_to_scalar_condition(ctx, src);
3113          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3114       } else if (dst.regClass() == v2b) {
3115          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3116          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3117       } else {
3118          unreachable("Wrong destination register class for nir_op_b2f16.");
3119       }
3120       break;
3121    }
3122    case nir_op_b2f32: {
           /* b2f32: boolean (lane mask) to f32; 0x3f800000 is the f32 bit
            * pattern of 1.0. Scalar destination: multiply the scalar condition
            * by that constant. Vector destination: per-lane select between zero
            * and the constant.
            */
3123       Temp src = get_alu_src(ctx, instr->src[0]);
3124       assert(src.regClass() == bld.lm);
3125
3126       if (dst.regClass() == s1) {
3127          src = bool_to_scalar_condition(ctx, src);
3128          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3129       } else if (dst.regClass() == v1) {
3130          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3131                       Operand::c32(0x3f800000u), src);
3132       } else {
3133          unreachable("Wrong destination register class for nir_op_b2f32.");
3134       }
3135       break;
3136    }
3137    case nir_op_b2f64: {
           /* b2f64: boolean (lane mask) to f64. Scalar destination: s_cselect_b64
            * on the scalar condition. Vector destination: select the high dword
            * between zero and 0x3FF00000 (high dword of the double 1.0) and pair
            * it with a zero low dword.
            */
3138       Temp src = get_alu_src(ctx, instr->src[0]);
3139       assert(src.regClass() == bld.lm);
3140
3141       if (dst.regClass() == s2) {
3142          src = bool_to_scalar_condition(ctx, src);
              /* NOTE(review): the "true" value is written as the 32-bit constant
               * 0x3f800000 even though the destination is 64 bits wide, unlike
               * the VGPR path below. Presumably this relies on how that inline
               * constant is interpreted for 64-bit operands - confirm.
               */
3143          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3144                   Operand::zero(), bld.scc(src));
3145       } else if (dst.regClass() == v2) {
3146          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3147          Temp upper =
3148             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3149          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3150       } else {
3151          unreachable("Wrong destination register class for nir_op_b2f64.");
3152       }
3153       break;
3154    }
3155    case nir_op_i2i8:
3156    case nir_op_i2i16:
3157    case nir_op_i2i32: {
           /* i2i8/16/32: signed integer resize. For SGPR destinations with a
            * sub-dword source the element is extracted directly (sign-extending
            * when widening, leaving upper bits undefined otherwise). All other
            * cases go through convert_int(), sign-extending only when the output
            * is wider than the input.
            */
3158       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3159          /* no need to do the extract in get_alu_src() */
3160          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3161                                      ? sgpr_extract_sext
3162                                      : sgpr_extract_undef;
3163          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3164       } else {
3165          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3166          const unsigned output_bitsize = instr->def.bit_size;
3167          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3168                      output_bitsize > input_bitsize, dst);
3169       }
3170       break;
3171    }
3172    case nir_op_u2u8:
3173    case nir_op_u2u16:
3174    case nir_op_u2u32: {
           /* u2u8/16/32: unsigned integer resize. For SGPR destinations with a
            * sub-dword source the element is extracted directly (zero-extending
            * when widening, leaving upper bits undefined otherwise). All other
            * cases go through convert_int() without sign extension.
            */
3175       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3176          /* no need to do the extract in get_alu_src() */
3177          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3178                                      ? sgpr_extract_zext
3179                                      : sgpr_extract_undef;
3180          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3181       } else {
3182          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3183                      instr->def.bit_size, false, dst);
3184       }
3185       break;
3186    }
3187    case nir_op_b2b32:
3188    case nir_op_b2i8:
3189    case nir_op_b2i16:
3190    case nir_op_b2i32: {
           /* b2b32 / b2i8 / b2i16 / b2i32: boolean (lane mask) to integer.
            * An s1 destination receives the scalar condition; a VGPR destination
            * gets a per-lane select between 0 and 1. The unreachable() message
            * below names b2i32 but is shared by all four opcodes.
            */
3191       Temp src = get_alu_src(ctx, instr->src[0]);
3192       assert(src.regClass() == bld.lm);
3193
3194       if (dst.regClass() == s1) {
3195          bool_to_scalar_condition(ctx, src, dst);
3196       } else if (dst.type() == RegType::vgpr) {
3197          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3198                       src);
3199       } else {
3200          unreachable("Invalid register class for b2i32");
3201       }
3202       break;
3203    }
3204    case nir_op_b2b1: {
           /* b2b1: integer-encoded boolean to a lane mask (dst must be bld.lm).
            * VGPR source: compare against zero with v_cmp_lg_u32/u64 straight
            * into dst. SGPR source: compute a scalar "non-zero" condition and
            * expand it with bool_to_vector_condition().
            */
3205       Temp src = get_alu_src(ctx, instr->src[0]);
3206       assert(dst.regClass() == bld.lm);
3207
3208       if (src.type() == RegType::vgpr) {
3209          assert(src.regClass() == v1 || src.regClass() == v2);
              /* NOTE(review): repeats the assert made at the top of this case. */
3210          assert(dst.regClass() == bld.lm);
3211          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3212                   Definition(dst), Operand::zero(), src);
3213       } else {
3214          assert(src.regClass() == s1 || src.regClass() == s2);
3215          Temp tmp;
              /* On GFX7 and older the 64-bit case ORs the value with zero and
               * uses the instruction's second (SCC) definition instead of
               * s_cmp_lg_u64 - presumably because that compare is unavailable
               * there; confirm against the ISA.
               */
3216          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
3217             tmp =
3218                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3219                   .def(1)
3220                   .getTemp();
3221          } else {
3222             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3223                            bld.scc(bld.def(s1)), Operand::zero(), src);
3224          }
3225          bool_to_vector_condition(ctx, tmp, dst);
3226       }
3227       break;
3228    }
3229    case nir_op_unpack_64_2x32:
3230    case nir_op_unpack_32_2x16:
3231    case nir_op_unpack_64_4x16:
3232    case nir_op_unpack_32_4x8:
           /* Whole-value unpacks: copy the source into dst and register a split
            * of dst into 4 components for the 4x8/4x16 variants, 2 otherwise.
            */
3233       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3234       emit_split_vector(
3235          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3236       break;
3237    case nir_op_pack_64_2x32_split: {
           /* pack_64_2x32_split: build dst as a vector of the two sources, with
            * src0 as the first component and src1 as the second.
            */
3238       Temp src0 = get_alu_src(ctx, instr->src[0]);
3239       Temp src1 = get_alu_src(ctx, instr->src[1]);
3240
3241       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3242       break;
3243    }
3244    case nir_op_unpack_64_2x32_split_x:
           /* Split the source in two; dst is the first definition, the second
            * goes to a throw-away temporary of the same register class.
            */
3245       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3246                  get_alu_src(ctx, instr->src[0]));
3247       break;
3248    case nir_op_unpack_64_2x32_split_y:
           /* Split the source in two; dst is the second definition, the first
            * goes to a throw-away temporary of the same register class.
            */
3249       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3250                  get_alu_src(ctx, instr->src[0]));
3251       break;
3252    case nir_op_unpack_32_2x16_split_x:
           /* VGPR destination: split the source and keep the first half.
            * Otherwise the source is simply copied into dst.
            */
3253       if (dst.type() == RegType::vgpr) {
3254          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3255                     get_alu_src(ctx, instr->src[0]));
3256       } else {
3257          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3258       }
3259       break;
3260    case nir_op_unpack_32_2x16_split_y:
           /* VGPR destination: split the source and keep the second half.
            * Otherwise use p_extract with operands (1, 16, 0) - presumably
            * element index 1, 16 bits wide, no sign extension; confirm against
            * the p_extract definition.
            */
3261       if (dst.type() == RegType::vgpr) {
3262          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3263                     get_alu_src(ctx, instr->src[0]));
3264       } else {
3265          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3266                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3267                     Operand::zero());
3268       }
3269       break;
3270    case nir_op_pack_32_2x16_split: {
           /* pack_32_2x16_split: combine two 16-bit values into one dword.
            * v1 destination: take component 0 of each source as v2b and create
            * a vector from them. Otherwise (scalar): (src0 & 0xFFFF) | (src1 << 16).
            */
3271       Temp src0 = get_alu_src(ctx, instr->src[0]);
3272       Temp src1 = get_alu_src(ctx, instr->src[1]);
3273       if (dst.regClass() == v1) {
3274          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3275          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3276          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3277       } else {
3278          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3279                          Operand::c32(0xFFFFu));
3280          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3281                          Operand::c32(16u));
3282          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3283       }
3284       break;
3285    }
3286    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break; /* copy the 4-component source into dst */
3287    case nir_op_pack_half_2x16_rtz_split:
3288    case nir_op_pack_half_2x16_split: {
           /* Both opcodes are selected to v_cvt_pkrtz_f16_f32: the _e64 opcode
            * through the VOP3 helper on GFX8/GFX9, the VOP2 form elsewhere.
            * Only a v1 destination is handled.
            */
3289       if (dst.regClass() == v1) {
3290          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3291             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3292          else
3293             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3294       } else {
3295          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3296       }
3297       break;
3298    }
3299    case nir_op_pack_unorm_2x16:
3300    case nir_op_pack_snorm_2x16: {
           /* pack_unorm_2x16 / pack_snorm_2x16: pack a 2-component float vector
            * with v_cvt_pknorm_{u16,i16}_{f32,f16}; the u16 opcode is used for
            * unorm, the i16 opcode for snorm.
            */
3301       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3302       /* Only support 16 and 32bit. */
3303       assert(bit_size == 32 || bit_size == 16);
3304
3305       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3306       Temp src = get_alu_src(ctx, instr->src[0], 2);
3307       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3308       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3309
3310       /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */
           /* Both halves are widened to f32 and then handled like 32-bit inputs. */
3311       if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3312          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3313          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3314          bit_size = 32;
3315       }
3316
3317       aco_opcode opcode;
3318       if (bit_size == 32) {
3319          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3320                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3321       } else {
3322          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3323                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3324       }
3325       bld.vop3(opcode, Definition(dst), src0, src1);
3326       break;
3327    }
3328    case nir_op_pack_uint_2x16:
3329    case nir_op_pack_sint_2x16: {
           /* pack_uint_2x16 / pack_sint_2x16: pack the two v1 components of the
            * source with v_cvt_pk_u16_u32 (uint) or v_cvt_pk_i16_i32 (sint).
            */
3330       Temp src = get_alu_src(ctx, instr->src[0], 2);
3331       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3332       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3333       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3334                                                              : aco_opcode::v_cvt_pk_i16_i32;
3335       bld.vop3(opcode, Definition(dst), src0, src1);
3336       break;
3337    }
3338    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3339    case nir_op_unpack_half_2x16_split_x: {
           /* unpack_half_2x16_split_x: convert one f16 half of the source to
            * f32. A v1 source is split into two v2b halves first; the builder
            * result assigned to `src` is presumably the first definition, i.e.
            * the first half - confirm against Builder::Result's Temp conversion.
            */
3340       Temp src = get_alu_src(ctx, instr->src[0]);
3341       if (src.regClass() == v1)
3342          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3343       if (dst.regClass() == v1) {
              /* The flush-to-zero opcode is expected exactly when the block's
               * fp mode requires flushing 16/64-bit denormals.
               */
3344          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3345                 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3346          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3347       } else {
3348          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3349       }
3350       break;
3351    }
3352    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3353    case nir_op_unpack_half_2x16_split_y: {
           /* unpack_half_2x16_split_y: convert the other f16 half of the source
            * to f32. An s1 source uses p_extract with operands (1, 16, 0); any
            * other source is split into two v2b halves and the second
            * definition is used.
            */
3354       Temp src = get_alu_src(ctx, instr->src[0]);
3355       if (src.regClass() == s1)
3356          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3357                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3358       else
3359          src =
3360             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3361       if (dst.regClass() == v1) {
              /* The flush-to-zero opcode is expected exactly when the block's
               * fp mode requires flushing 16/64-bit denormals.
               */
3362          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3363                 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3364          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3365       } else {
3366          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3367       }
3368       break;
3369    }
3370    case nir_op_sad_u8x4: {
           /* sad_u8x4: emitted as a three-operand v_sad_u8 through the VOP3
            * helper; the destination must be a v1.
            */
3371       assert(dst.regClass() == v1);
3372       emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3373       break;
3374    }
3375    case nir_op_fquantize2f16: {
           /* fquantize2f16: round the f32 source to f16 (using the rtne pseudo
            * when the block's 16/64-bit rounding mode is not nearest-even),
            * convert back to f32, and replace the result with zero in lanes
            * where cmp_res is not set.
            */
3376       Temp src = get_alu_src(ctx, instr->src[0]);
3377       Temp f16;
3378       if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3379          f16 = bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, bld.def(v2b), src);
3380       else
3381          f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3382       Temp f32, cmp_res;
3383
3384       if (ctx->program->gfx_level >= GFX8) {
              /* Classify the f16 value against the mask below. */
3385          Temp mask = bld.copy(
3386             bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3387          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask);
3388          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3389       } else {
3390          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
3391           * so compare the result and flush to 0 if it's smaller.
3392           */
              /* cmp_res = !((|f32| < 2^-14) && (f32 != 0)) */
3393          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3394          Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3395          Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3396          tmp0->valu().abs[0] = true;
3397          Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32);
3398          cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3399                             tmp0->definitions[0].getTemp(), tmp1);
3400       }
3401
3402       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
              /* The flushed value is 0 * src rather than a literal zero,
               * presumably so that it carries the sign of src.
               */
3403          Temp copysign_0 =
3404             bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3405          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3406       } else {
3407          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3408       }
3409       break;
3410    }
3411    case nir_op_bfm: {
3412       Temp bits = get_alu_src(ctx, instr->src[0]);
3413       Temp offset = get_alu_src(ctx, instr->src[1]);
3414
3415       if (dst.regClass() == s1) {
3416          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3417       } else if (dst.regClass() == v1) {
3418          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3419       } else {
3420          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3421       }
3422       break;
3423    }
3424    case nir_op_bitfield_select: {
3425
3426       /* dst = (insert & bitmask) | (base & ~bitmask) */
3427       if (dst.regClass() == s1) {
3428          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3429          Temp insert = get_alu_src(ctx, instr->src[1]);
3430          Temp base = get_alu_src(ctx, instr->src[2]);
3431          aco_ptr<Instruction> sop2;
3432          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3433          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3434          Operand lhs;
3435          if (const_insert && const_bitmask) {
3436             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3437          } else {
3438             insert =
3439                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3440             lhs = Operand(insert);
3441          }
3442
3443          Operand rhs;
3444          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3445          if (const_base && const_bitmask) {
3446             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3447          } else {
3448             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3449             rhs = Operand(base);
3450          }
3451
3452          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3453
3454       } else if (dst.regClass() == v1) {
3455          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3456       } else {
3457          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3458       }
3459       break;
3460    }
3461    case nir_op_ubfe:
3462    case nir_op_ibfe: {
3463       if (dst.bytes() != 4)
3464          unreachable("Unsupported BFE bit size");
3465
3466       if (dst.type() == RegType::sgpr) {
3467          Temp base = get_alu_src(ctx, instr->src[0]);
3468
3469          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3470          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3471          aco_opcode opcode =
3472             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3473          if (const_offset && const_bits) {
3474             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3475             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3476             break;
3477          }
3478
3479          Temp offset = get_alu_src(ctx, instr->src[1]);
3480          Temp bits = get_alu_src(ctx, instr->src[2]);
3481
3482          if (ctx->program->gfx_level >= GFX9) {
3483             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3484                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3485                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3486             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3487             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3488          } else if (instr->op == nir_op_ubfe) {
3489             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3490             Temp masked =
3491                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3492             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3493          } else {
3494             Operand bits_op = const_bits
3495                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3496                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3497                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3498                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3499                                             Operand::c32(16u));
3500             Operand offset_op = const_offset
3501                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3502                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3503                                               offset, Operand::c32(0x1fu));
3504
3505             Temp extract =
3506                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3507             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3508          }
3509
3510       } else {
3511          aco_opcode opcode =
3512             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3513          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3514       }
3515       break;
3516    }
3517    case nir_op_extract_u8:
3518    case nir_op_extract_i8:
3519    case nir_op_extract_u16:
3520    case nir_op_extract_i16: {
3521       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3522       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3523       uint32_t bits = comp == 4 ? 8 : 16;
3524       unsigned index = nir_src_as_uint(instr->src[1].src);
3525       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3526          assert(index == 0);
3527          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3528       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3529          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3530          unsigned swizzle = instr->src[0].swizzle[0];
3531          if (vec.size() > 1) {
3532             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3533             swizzle = swizzle & 1;
3534          }
3535          index += swizzle * instr->def.bit_size / bits;
3536          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3537                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3538       } else {
3539          Temp src = get_alu_src(ctx, instr->src[0]);
3540          Definition def(dst);
3541          if (dst.bytes() == 8) {
3542             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3543             index %= comp;
3544             def = bld.def(src.type(), 1);
3545          }
3546          assert(def.bytes() <= 4);
3547          if (def.regClass() == s1) {
3548             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3549                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3550          } else {
3551             src = emit_extract_vector(ctx, src, 0, def.regClass());
3552             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3553                        Operand::c32(bits), Operand::c32(is_signed));
3554          }
3555          if (dst.size() == 2)
3556             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3557                        Operand::zero());
3558       }
3559       break;
3560    }
3561    case nir_op_insert_u8:
3562    case nir_op_insert_u16: {
3563       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3564       uint32_t bits = comp == 4 ? 8 : 16;
3565       unsigned index = nir_src_as_uint(instr->src[1].src);
3566       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3567          assert(index == 0);
3568          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3569       } else {
3570          Temp src = get_alu_src(ctx, instr->src[0]);
3571          Definition def(dst);
3572          bool swap = false;
3573          if (dst.bytes() == 8) {
3574             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3575             swap = index >= comp;
3576             index %= comp;
3577             def = bld.def(src.type(), 1);
3578          }
3579          if (def.regClass() == s1) {
3580             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3581                        Operand::c32(index), Operand::c32(bits));
3582          } else {
3583             src = emit_extract_vector(ctx, src, 0, def.regClass());
3584             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3585                        Operand::c32(bits));
3586          }
3587          if (dst.size() == 2 && swap)
3588             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3589                        def.getTemp());
3590          else if (dst.size() == 2)
3591             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3592                        Operand::zero());
3593       }
3594       break;
3595    }
3596    case nir_op_bit_count: {
3597       Temp src = get_alu_src(ctx, instr->src[0]);
3598       if (src.regClass() == s1) {
3599          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3600       } else if (src.regClass() == v1) {
3601          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3602       } else if (src.regClass() == v2) {
3603          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3604                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3605                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3606       } else if (src.regClass() == s2) {
3607          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3608       } else {
3609          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3610       }
3611       break;
3612    }
3613    case nir_op_flt: {
3614       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3615                       aco_opcode::v_cmp_lt_f64);
3616       break;
3617    }
3618    case nir_op_fge: {
3619       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3620                       aco_opcode::v_cmp_ge_f64);
3621       break;
3622    }
3623    case nir_op_feq: {
3624       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3625                       aco_opcode::v_cmp_eq_f64);
3626       break;
3627    }
3628    case nir_op_fneu: {
3629       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3630                       aco_opcode::v_cmp_neq_f64);
3631       break;
3632    }
3633    case nir_op_ilt: {
3634       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3635                       aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3636       break;
3637    }
3638    case nir_op_ige: {
3639       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3640                       aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3641       break;
3642    }
3643    case nir_op_ieq: {
3644       if (instr->src[0].src.ssa->bit_size == 1)
3645          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3646       else
3647          emit_comparison(
3648             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3649             aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3650             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3651       break;
3652    }
3653    case nir_op_ine: {
3654       if (instr->src[0].src.ssa->bit_size == 1)
3655          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3656       else
3657          emit_comparison(
3658             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3659             aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3660             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3661       break;
3662    }
3663    case nir_op_ult: {
3664       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3665                       aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3666       break;
3667    }
3668    case nir_op_uge: {
3669       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3670                       aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3671       break;
3672    }
3673    case nir_op_bitz:
3674    case nir_op_bitnz: {
3675       assert(instr->src[0].src.ssa->bit_size != 1);
3676       bool test0 = instr->op == nir_op_bitz;
3677       Temp src0 = get_alu_src(ctx, instr->src[0]);
3678       Temp src1 = get_alu_src(ctx, instr->src[1]);
3679       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3680       if (!use_valu) {
3681          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3682                                                                : aco_opcode::s_bitcmp1_b32;
3683          if (test0)
3684             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3685                                                        : aco_opcode::s_bitcmp0_b32;
3686          emit_sopc_instruction(ctx, instr, op, dst);
3687          break;
3688       }
3689
3690       /* We do not have a VALU version of s_bitcmp.
3691        * But if the second source is constant, we can use
3692        * v_cmp_class_f32's LUT to check the bit.
3693        * The LUT only has 10 entries, so extract a higher byte if we have to.
3694        * For sign bits comparision with 0 is better because v_cmp_class
3695        * can't be inverted.
3696        */
3697       if (nir_src_is_const(instr->src[1].src)) {
3698          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3699          bit &= instr->src[0].src.ssa->bit_size - 1;
3700          src0 = as_vgpr(ctx, src0);
3701
3702          if (src0.regClass() == v2) {
3703             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3704             bit &= 31;
3705          }
3706
3707          if (bit == 31) {
3708             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3709                      Operand::c32(0), src0);
3710             break;
3711          }
3712
3713          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3714             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3715                      Operand::c32(0), src0);
3716             break;
3717          }
3718
3719          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
3720          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
3721          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
3722          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
3723          if (use_opsel) {
3724             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
3725                               Operand::c32(16), Operand::c32(0));
3726             bit &= 0xf;
3727          }
3728
3729          /* If we can use sdwa the extract is free, while test0's s_not is not. */
3730          if (bit == 7 && test0 && can_sdwa) {
3731             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3732                               Operand::c32(8), Operand::c32(1));
3733             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3734                      Operand::c32(0), src0);
3735             break;
3736          }
3737
3738          if (bit > max_bit) {
3739             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3740                               Operand::c32(8), Operand::c32(0));
3741             bit &= 0x7;
3742          }
3743
3744          /* denorm and snan/qnan inputs are preserved using all float control modes. */
3745          static const struct {
3746             uint32_t fp32;
3747             uint32_t fp16;
3748             bool negate;
3749          } float_lut[10] = {
3750             {0x7f800001, 0x7c01, false}, /* snan */
3751             {~0u, ~0u, false},           /* qnan */
3752             {0xff800000, 0xfc00, false}, /* -inf */
3753             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
3754             {1, 1, true},                /* -denormal */
3755             {0, 0, true},                /* -0.0 */
3756             {0, 0, false},               /* +0.0 */
3757             {1, 1, false},               /* +denormal */
3758             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
3759             {0x7f800000, 0x7c00, false}, /* +inf */
3760          };
3761
3762          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
3763          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
3764          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
3765                                (ctx->program->gfx_level >= GFX11 && use_opsel);
3766          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
3767          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
3768
3769          VALU_instruction& res =
3770             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
3771          if (float_lut[bit].negate) {
3772             res.format = asVOP3(res.format);
3773             res.neg[0] = true;
3774          }
3775
3776          if (test0)
3777             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
3778
3779          break;
3780       }
3781
3782       Temp res;
3783       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
3784       if (instr->src[0].src.ssa->bit_size == 16) {
3785          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
3786          if (ctx->program->gfx_level < GFX10)
3787             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
3788          else
3789             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
3790
3791          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
3792       } else if (instr->src[0].src.ssa->bit_size == 32) {
3793          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
3794       } else if (instr->src[0].src.ssa->bit_size == 64) {
3795          if (ctx->program->gfx_level < GFX8)
3796             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
3797          else
3798             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
3799
3800          res = emit_extract_vector(ctx, res, 0, v1);
3801          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
3802       } else {
3803          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3804       }
3805       bld.vopc(op, Definition(dst), Operand::c32(0), res);
3806       break;
3807    }
3808    case nir_op_fddx:
3809    case nir_op_fddy:
3810    case nir_op_fddx_fine:
3811    case nir_op_fddy_fine:
3812    case nir_op_fddx_coarse:
3813    case nir_op_fddy_coarse: {
3814       if (!nir_src_is_divergent(instr->src[0].src)) {
3815          /* Source is the same in all lanes, so the derivative is zero.
3816           * This also avoids emitting invalid IR.
3817           */
3818          bld.copy(Definition(dst), Operand::zero());
3819          break;
3820       }
3821
3822       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3823       uint16_t dpp_ctrl1, dpp_ctrl2;
3824       if (instr->op == nir_op_fddx_fine) {
3825          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3826          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3827       } else if (instr->op == nir_op_fddy_fine) {
3828          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3829          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3830       } else {
3831          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3832          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3833             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3834          else
3835             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3836       }
3837
3838       Temp tmp;
3839       if (ctx->program->gfx_level >= GFX8) {
3840          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3841          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3842       } else {
3843          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3844          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3845          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3846       }
3847       emit_wqm(bld, tmp, dst, true);
3848       break;
3849    }
3850    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3851    }
3852 }
3853
3854 void
3855 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3856 {
3857    Temp dst = get_ssa_temp(ctx, &instr->def);
3858
3859    // TODO: we really want to have the resulting type as this would allow for 64bit literals
3860    // which get truncated the lsb if double and msb if int
3861    // for now, we only use s_mov_b64 with 64bit inline constants
3862    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3863    assert(dst.type() == RegType::sgpr);
3864
3865    Builder bld(ctx->program, ctx->block);
3866
3867    if (instr->def.bit_size == 1) {
3868       assert(dst.regClass() == bld.lm);
3869       int val = instr->value[0].b ? -1 : 0;
3870       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3871       bld.copy(Definition(dst), op);
3872    } else if (instr->def.bit_size == 8) {
3873       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3874    } else if (instr->def.bit_size == 16) {
3875       /* sign-extend to use s_movk_i32 instead of a literal */
3876       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3877    } else if (dst.size() == 1) {
3878       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3879    } else {
3880       assert(dst.size() != 1);
3881       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3882          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3883       if (instr->def.bit_size == 64)
3884          for (unsigned i = 0; i < dst.size(); i++)
3885             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3886       else {
3887          for (unsigned i = 0; i < dst.size(); i++)
3888             vec->operands[i] = Operand::c32(instr->value[i].u32);
3889       }
3890       vec->definitions[0] = Definition(dst);
3891       ctx->block->instructions.emplace_back(std::move(vec));
3892    }
3893 }
3894
3895 bool
3896 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
3897                                    unsigned align_, bool support_12_byte)
3898 {
3899    /* Only use byte-align for 8/16-bit loads if we won't have to increase it's size and won't have
3900     * to use unsupported load sizes.
3901     */
3902    assert(util_is_power_of_two_nonzero(align_));
3903    if (align_ < 4) {
3904       assert(component_size < 4);
3905       unsigned load_size = num_components * component_size;
3906       uint32_t new_size = align(load_size + (4 - align_), 4);
3907       return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
3908    }
3909    return true;
3910 }
3911
3912 struct LoadEmitInfo {
3913    Operand offset;
3914    Temp dst;
3915    unsigned num_components;
3916    unsigned component_size;
3917    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
3918    Temp idx = Temp(0, v1);      /* buffer index */
3919    unsigned component_stride = 0;
3920    unsigned const_offset = 0;
3921    unsigned align_mul = 0;
3922    unsigned align_offset = 0;
3923    pipe_format format;
3924
3925    bool glc = false;
3926    bool slc = false;
3927    bool split_by_component_stride = true;
3928    unsigned swizzle_component_size = 0;
3929    memory_sync_info sync;
3930    Temp soffset = Temp(0, s1);
3931 };
3932
3933 struct EmitLoadParameters {
3934    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3935                              unsigned bytes_needed, unsigned align, unsigned const_offset,
3936                              Temp dst_hint);
3937
3938    Callback callback;
3939    bool byte_align_loads;
3940    bool supports_8bit_16bit_loads;
3941    unsigned max_const_offset_plus_one;
3942 };
3943
3944 void
3945 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3946           const EmitLoadParameters& params)
3947 {
3948    unsigned load_size = info.num_components * info.component_size;
3949    unsigned component_size = info.component_size;
3950
3951    unsigned num_vals = 0;
3952    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3953
3954    unsigned const_offset = info.const_offset;
3955
3956    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3957    unsigned align_offset = info.align_offset % align_mul;
3958
3959    unsigned bytes_read = 0;
3960    while (bytes_read < load_size) {
3961       unsigned bytes_needed = load_size - bytes_read;
3962
3963       /* add buffer for unaligned loads */
3964       int byte_align = 0;
3965       if (params.byte_align_loads) {
3966          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3967       }
3968
3969       if (byte_align) {
3970          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3971              !params.supports_8bit_16bit_loads) {
3972             if (info.component_stride) {
3973                assert(params.supports_8bit_16bit_loads && "unimplemented");
3974                bytes_needed = 2;
3975                byte_align = 0;
3976             } else {
3977                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3978                bytes_needed = align(bytes_needed, 4);
3979             }
3980          } else {
3981             byte_align = 0;
3982          }
3983       }
3984
3985       if (info.split_by_component_stride) {
3986          if (info.swizzle_component_size)
3987             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3988          if (info.component_stride)
3989             bytes_needed = MIN2(bytes_needed, info.component_size);
3990       }
3991
3992       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3993
3994       /* reduce constant offset */
3995       Operand offset = info.offset;
3996       unsigned reduced_const_offset = const_offset;
3997       bool remove_const_offset_completely = need_to_align_offset;
3998       if (const_offset &&
3999           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4000          unsigned to_add = const_offset;
4001          if (remove_const_offset_completely) {
4002             reduced_const_offset = 0;
4003          } else {
4004             to_add =
4005                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4006             reduced_const_offset %= params.max_const_offset_plus_one;
4007          }
4008          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4009          if (offset.isConstant()) {
4010             offset = Operand::c32(offset.constantValue() + to_add);
4011          } else if (offset.isUndefined()) {
4012             offset = Operand::c32(to_add);
4013          } else if (offset_tmp.regClass() == s1) {
4014             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4015                               Operand::c32(to_add));
4016          } else if (offset_tmp.regClass() == v1) {
4017             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4018          } else {
4019             Temp lo = bld.tmp(offset_tmp.type(), 1);
4020             Temp hi = bld.tmp(offset_tmp.type(), 1);
4021             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4022
4023             if (offset_tmp.regClass() == s2) {
4024                Temp carry = bld.tmp(s1);
4025                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4026                              Operand::c32(to_add));
4027                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4028                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4029             } else {
4030                Temp new_lo = bld.tmp(v1);
4031                Temp carry =
4032                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4033                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4034                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4035             }
4036          }
4037       }
4038
4039       /* align offset down if needed */
4040       Operand aligned_offset = offset;
4041       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4042       if (need_to_align_offset) {
4043          align = 4;
4044          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4045          if (offset.isConstant()) {
4046             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4047          } else if (offset.isUndefined()) {
4048             aligned_offset = Operand::zero();
4049          } else if (offset_tmp.regClass() == s1) {
4050             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4051                                       Operand::c32(0xfffffffcu), offset_tmp);
4052          } else if (offset_tmp.regClass() == s2) {
4053             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4054                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4055          } else if (offset_tmp.regClass() == v1) {
4056             aligned_offset =
4057                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4058          } else if (offset_tmp.regClass() == v2) {
4059             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4060             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4061             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4062             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4063          }
4064       }
4065       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4066                                 : aligned_offset.isConstant()
4067                                    ? bld.copy(bld.def(s1), aligned_offset)
4068                                    : Temp(0, s1);
4069
4070       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4071                                  reduced_const_offset, byte_align ? Temp() : info.dst);
4072
4073       /* the callback wrote directly to dst */
4074       if (val == info.dst) {
4075          assert(num_vals == 0);
4076          emit_split_vector(ctx, info.dst, info.num_components);
4077          return;
4078       }
4079
4080       /* shift result right if needed */
4081       if (params.byte_align_loads && info.component_size < 4) {
4082          Operand byte_align_off = Operand::c32(byte_align);
4083          if (byte_align == -1) {
4084             if (offset.isConstant())
4085                byte_align_off = Operand::c32(offset.constantValue() % 4u);
4086             else if (offset.isUndefined())
4087                byte_align_off = Operand::zero();
4088             else if (offset.size() == 2)
4089                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4090                                                             RegClass(offset.getTemp().type(), 1)));
4091             else
4092                byte_align_off = offset;
4093          }
4094
4095          assert(val.bytes() >= load_size && "unimplemented");
4096          if (val.type() == RegType::sgpr)
4097             byte_align_scalar(ctx, val, byte_align_off, info.dst);
4098          else
4099             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4100          return;
4101       }
4102
4103       /* add result to list and advance */
4104       if (info.component_stride) {
4105          assert(val.bytes() % info.component_size == 0);
4106          unsigned num_loaded_components = val.bytes() / info.component_size;
4107          unsigned advance_bytes = info.component_stride * num_loaded_components;
4108          const_offset += advance_bytes;
4109          align_offset = (align_offset + advance_bytes) % align_mul;
4110       } else {
4111          const_offset += val.bytes();
4112          align_offset = (align_offset + val.bytes()) % align_mul;
4113       }
4114       bytes_read += val.bytes();
4115       vals[num_vals++] = val;
4116    }
4117
4118    /* create array of components */
4119    unsigned components_split = 0;
4120    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4121    bool has_vgprs = false;
4122    for (unsigned i = 0; i < num_vals;) {
4123       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4124       unsigned num_tmps = 0;
4125       unsigned tmp_size = 0;
4126       RegType reg_type = RegType::sgpr;
4127       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4128          if (vals[i].type() == RegType::vgpr)
4129             reg_type = RegType::vgpr;
4130          tmp_size += vals[i].bytes();
4131          tmp[num_tmps++] = vals[i++];
4132       }
4133       if (num_tmps > 1) {
4134          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4135             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4136          for (unsigned j = 0; j < num_tmps; j++)
4137             vec->operands[j] = Operand(tmp[j]);
4138          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4139          vec->definitions[0] = Definition(tmp[0]);
4140          bld.insert(std::move(vec));
4141       }
4142
4143       if (tmp[0].bytes() % component_size) {
4144          /* trim tmp[0] */
4145          assert(i == num_vals);
4146          RegClass new_rc =
4147             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4148          tmp[0] =
4149             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4150       }
4151
4152       RegClass elem_rc = RegClass::get(reg_type, component_size);
4153
4154       unsigned start = components_split;
4155
4156       if (tmp_size == elem_rc.bytes()) {
4157          allocated_vec[components_split++] = tmp[0];
4158       } else {
4159          assert(tmp_size % elem_rc.bytes() == 0);
4160          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
4161             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
4162          for (auto& def : split->definitions) {
4163             Temp component = bld.tmp(elem_rc);
4164             allocated_vec[components_split++] = component;
4165             def = Definition(component);
4166          }
4167          split->operands[0] = Operand(tmp[0]);
4168          bld.insert(std::move(split));
4169       }
4170
4171       /* try to p_as_uniform early so we can create more optimizable code and
4172        * also update allocated_vec */
4173       for (unsigned j = start; j < components_split; j++) {
4174          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
4175             allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4176          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4177       }
4178    }
4179
4180    /* concatenate components and p_as_uniform() result if needed */
4181    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4182       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4183
4184    int padding_bytes =
4185       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4186
4187    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4188       aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
4189    for (unsigned i = 0; i < info.num_components; i++)
4190       vec->operands[i] = Operand(allocated_vec[i]);
4191    if (padding_bytes)
4192       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4193    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4194       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4195       vec->definitions[0] = Definition(tmp);
4196       bld.insert(std::move(vec));
4197       bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4198    } else {
4199       vec->definitions[0] = Definition(info.dst);
4200       bld.insert(std::move(vec));
4201    }
4202 }
4203
4204 Operand
4205 load_lds_size_m0(Builder& bld)
4206 {
4207    /* m0 does not need to be initialized on GFX9+ */
4208    if (bld.program->gfx_level >= GFX9)
4209       return Operand(s1);
4210
4211    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4212 }
4213
4214 Temp
4215 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4216                   unsigned align, unsigned const_offset, Temp dst_hint)
4217 {
4218    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4219
4220    Operand m = load_lds_size_m0(bld);
4221
4222    bool large_ds_read = bld.program->gfx_level >= GFX7;
4223    bool usable_read2 = bld.program->gfx_level >= GFX7;
4224
4225    bool read2 = false;
4226    unsigned size = 0;
4227    aco_opcode op;
4228    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4229       size = 16;
4230       op = aco_opcode::ds_read_b128;
4231    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4232       size = 16;
4233       read2 = true;
4234       op = aco_opcode::ds_read2_b64;
4235    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4236       size = 12;
4237       op = aco_opcode::ds_read_b96;
4238    } else if (bytes_needed >= 8 && align % 8 == 0) {
4239       size = 8;
4240       op = aco_opcode::ds_read_b64;
4241    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4242       size = 8;
4243       read2 = true;
4244       op = aco_opcode::ds_read2_b32;
4245    } else if (bytes_needed >= 4 && align % 4 == 0) {
4246       size = 4;
4247       op = aco_opcode::ds_read_b32;
4248    } else if (bytes_needed >= 2 && align % 2 == 0) {
4249       size = 2;
4250       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4251    } else {
4252       size = 1;
4253       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4254    }
4255
4256    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4257    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4258
4259    if (const_offset > (const_offset_range - const_offset_unit)) {
4260       unsigned excess = const_offset - (const_offset % const_offset_range);
4261       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4262       const_offset -= excess;
4263    }
4264
4265    const_offset /= const_offset_unit;
4266
4267    RegClass rc = RegClass::get(RegType::vgpr, size);
4268    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4269    Instruction* instr;
4270    if (read2)
4271       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4272    else
4273       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4274    instr->ds().sync = info.sync;
4275
4276    if (m.isUndefined())
4277       instr->operands.pop_back();
4278
4279    return val;
4280 }
4281
4282 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4283
4284 Temp
4285 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4286                    unsigned align, unsigned const_offset, Temp dst_hint)
4287 {
4288    assert(align >= 4u);
4289
4290    bld.program->has_smem_buffer_or_global_loads = true;
4291
4292    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4293    Temp addr = info.resource;
4294    if (!buffer && !addr.id()) {
4295       addr = offset;
4296       offset = Temp();
4297    }
4298
4299    bytes_needed = MIN2(bytes_needed, 64);
4300    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4301    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4302    /* Only round-up global loads if it's aligned so that it won't cross pages */
4303    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4304
4305    aco_opcode op;
4306    if (bytes_needed <= 4) {
4307       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4308    } else if (bytes_needed <= 8) {
4309       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4310    } else if (bytes_needed <= 16) {
4311       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4312    } else if (bytes_needed <= 32) {
4313       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4314    } else {
4315       assert(bytes_needed == 64);
4316       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4317    }
4318
4319    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4320    if (buffer) {
4321       if (const_offset)
4322          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4323                            Operand::c32(const_offset));
4324       load->operands[0] = Operand(info.resource);
4325       load->operands[1] = Operand(offset);
4326    } else {
4327       load->operands[0] = Operand(addr);
4328       if (offset.id() && const_offset)
4329          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4330                                       Operand::c32(const_offset));
4331       else if (offset.id())
4332          load->operands[1] = Operand(offset);
4333       else
4334          load->operands[1] = Operand::c32(const_offset);
4335    }
4336    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4337    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4338    load->definitions[0] = Definition(val);
4339    load->glc = info.glc;
4340    load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4341    load->sync = info.sync;
4342    bld.insert(std::move(load));
4343    return val;
4344 }
4345
4346 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4347
4348 Temp
4349 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4350                     unsigned align_, unsigned const_offset, Temp dst_hint)
4351 {
4352    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4353    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4354
4355    if (info.soffset.id()) {
4356       if (soffset.isTemp())
4357          vaddr = bld.copy(bld.def(v1), soffset);
4358       soffset = Operand(info.soffset);
4359    }
4360
4361    if (soffset.isUndefined())
4362       soffset = Operand::zero();
4363
4364    bool offen = !vaddr.isUndefined();
4365    bool idxen = info.idx.id();
4366
4367    if (offen && idxen)
4368       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4369    else if (idxen)
4370       vaddr = Operand(info.idx);
4371
4372    unsigned bytes_size = 0;
4373    aco_opcode op;
4374    if (bytes_needed == 1 || align_ % 2) {
4375       bytes_size = 1;
4376       op = aco_opcode::buffer_load_ubyte;
4377    } else if (bytes_needed == 2 || align_ % 4) {
4378       bytes_size = 2;
4379       op = aco_opcode::buffer_load_ushort;
4380    } else if (bytes_needed <= 4) {
4381       bytes_size = 4;
4382       op = aco_opcode::buffer_load_dword;
4383    } else if (bytes_needed <= 8) {
4384       bytes_size = 8;
4385       op = aco_opcode::buffer_load_dwordx2;
4386    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4387       bytes_size = 12;
4388       op = aco_opcode::buffer_load_dwordx3;
4389    } else {
4390       bytes_size = 16;
4391       op = aco_opcode::buffer_load_dwordx4;
4392    }
4393    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4394    mubuf->operands[0] = Operand(info.resource);
4395    mubuf->operands[1] = vaddr;
4396    mubuf->operands[2] = soffset;
4397    mubuf->offen = offen;
4398    mubuf->idxen = idxen;
4399    mubuf->glc = info.glc;
4400    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4401    mubuf->slc = info.slc;
4402    mubuf->sync = info.sync;
4403    mubuf->offset = const_offset;
4404    mubuf->swizzled = info.swizzle_component_size != 0;
4405    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4406    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4407    mubuf->definitions[0] = Definition(val);
4408    bld.insert(std::move(mubuf));
4409
4410    return val;
4411 }
4412
4413 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4414
4415 Temp
4416 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4417                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4418                            Temp dst_hint)
4419 {
4420    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4421    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4422
4423    if (info.soffset.id()) {
4424       if (soffset.isTemp())
4425          vaddr = bld.copy(bld.def(v1), soffset);
4426       soffset = Operand(info.soffset);
4427    }
4428
4429    if (soffset.isUndefined())
4430       soffset = Operand::zero();
4431
4432    bool offen = !vaddr.isUndefined();
4433    bool idxen = info.idx.id();
4434
4435    if (offen && idxen)
4436       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4437    else if (idxen)
4438       vaddr = Operand(info.idx);
4439
4440    aco_opcode op = aco_opcode::num_opcodes;
4441    if (info.component_size == 2) {
4442       switch (bytes_needed) {
4443       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4444       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4445       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4446       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4447       default: unreachable("invalid buffer load format size"); break;
4448       }
4449    } else {
4450       assert(info.component_size == 4);
4451       switch (bytes_needed) {
4452       case 4: op = aco_opcode::buffer_load_format_x; break;
4453       case 8: op = aco_opcode::buffer_load_format_xy; break;
4454       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4455       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4456       default: unreachable("invalid buffer load format size"); break;
4457       }
4458    }
4459
4460    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4461    mubuf->operands[0] = Operand(info.resource);
4462    mubuf->operands[1] = vaddr;
4463    mubuf->operands[2] = soffset;
4464    mubuf->offen = offen;
4465    mubuf->idxen = idxen;
4466    mubuf->glc = info.glc;
4467    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4468    mubuf->slc = info.slc;
4469    mubuf->sync = info.sync;
4470    mubuf->offset = const_offset;
4471    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4472    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4473    mubuf->definitions[0] = Definition(val);
4474    bld.insert(std::move(mubuf));
4475
4476    return val;
4477 }
4478
4479 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4480
4481 Temp
4482 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4483                       unsigned align_, unsigned const_offset, Temp dst_hint)
4484 {
4485    unsigned bytes_size = 0;
4486    aco_opcode op;
4487    if (bytes_needed == 1 || align_ % 2u) {
4488       bytes_size = 1;
4489       op = aco_opcode::scratch_load_ubyte;
4490    } else if (bytes_needed == 2 || align_ % 4u) {
4491       bytes_size = 2;
4492       op = aco_opcode::scratch_load_ushort;
4493    } else if (bytes_needed <= 4) {
4494       bytes_size = 4;
4495       op = aco_opcode::scratch_load_dword;
4496    } else if (bytes_needed <= 8) {
4497       bytes_size = 8;
4498       op = aco_opcode::scratch_load_dwordx2;
4499    } else if (bytes_needed <= 12) {
4500       bytes_size = 12;
4501       op = aco_opcode::scratch_load_dwordx3;
4502    } else {
4503       bytes_size = 16;
4504       op = aco_opcode::scratch_load_dwordx4;
4505    }
4506    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4507    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4508    aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
4509    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4510    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4511    flat->sync = info.sync;
4512    flat->offset = const_offset;
4513    flat->definitions[0] = Definition(val);
4514    bld.insert(std::move(flat));
4515
4516    return val;
4517 }
4518
4519 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4520 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4521
4522 Temp
4523 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4524 {
4525    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4526                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4527
4528    if (addr.type() == RegType::vgpr)
4529       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4530                         Operand::c32(-1u), Operand::c32(rsrc_conf));
4531    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4532                      Operand::c32(rsrc_conf));
4533 }
4534
4535 Temp
4536 add64_32(Builder& bld, Temp src0, Temp src1)
4537 {
4538    Temp src00 = bld.tmp(src0.type(), 1);
4539    Temp src01 = bld.tmp(src0.type(), 1);
4540    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4541
4542    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4543       Temp dst0 = bld.tmp(v1);
4544       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4545       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4546       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4547    } else {
4548       Temp carry = bld.tmp(s1);
4549       Temp dst0 =
4550          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4551       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4552       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4553    }
4554 }
4555
4556 void
4557 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4558                      uint32_t* const_offset_inout, Temp* offset_inout)
4559 {
4560    Temp address = *address_inout;
4561    uint64_t const_offset = *const_offset_inout + offset_in;
4562    Temp offset = *offset_inout;
4563
4564    uint64_t max_const_offset_plus_one =
4565       1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4566    if (bld.program->gfx_level >= GFX9)
4567       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4568    else if (bld.program->gfx_level == GFX6)
4569       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4570    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4571    const_offset %= max_const_offset_plus_one;
4572
4573    if (!offset.id()) {
4574       while (unlikely(excess_offset > UINT32_MAX)) {
4575          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4576          excess_offset -= UINT32_MAX;
4577       }
4578       if (excess_offset)
4579          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4580    } else {
4581       /* If we add to "offset", we would transform the indended
4582        * "address + u2u64(offset) + u2u64(const_offset)" into
4583        * "address + u2u64(offset + const_offset)", so add to the address.
4584        * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4585        * but that should be really rare.
4586        */
4587       while (excess_offset) {
4588          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4589          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4590          excess_offset -= src2;
4591       }
4592    }
4593
4594    if (bld.program->gfx_level == GFX6) {
4595       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4596       if (offset.type() != RegType::sgpr) {
4597          address = add64_32(bld, address, offset);
4598          offset = Temp();
4599       }
4600       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4601    } else if (bld.program->gfx_level <= GFX8) {
4602       /* GFX7,8 (FLAT): VGPR address */
4603       if (offset.id()) {
4604          address = add64_32(bld, address, offset);
4605          offset = Temp();
4606       }
4607       address = as_vgpr(bld, address);
4608    } else {
4609       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4610       if (address.type() == RegType::vgpr && offset.id()) {
4611          address = add64_32(bld, address, offset);
4612          offset = Temp();
4613       } else if (address.type() == RegType::sgpr && offset.id()) {
4614          offset = as_vgpr(bld, offset);
4615       }
4616       if (address.type() == RegType::sgpr && !offset.id())
4617          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4618    }
4619
4620    *address_inout = address;
4621    *const_offset_inout = const_offset;
4622    *offset_inout = offset;
4623 }
4624
4625 Temp
4626 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4627                      unsigned align_, unsigned const_offset, Temp dst_hint)
4628 {
4629    Temp addr = info.resource;
4630    if (!addr.id()) {
4631       addr = offset;
4632       offset = Temp();
4633    }
4634    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4635
4636    unsigned bytes_size = 0;
4637    bool use_mubuf = bld.program->gfx_level == GFX6;
4638    bool global = bld.program->gfx_level >= GFX9;
4639    aco_opcode op;
4640    if (bytes_needed == 1 || align_ % 2u) {
4641       bytes_size = 1;
4642       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4643            : global  ? aco_opcode::global_load_ubyte
4644                      : aco_opcode::flat_load_ubyte;
4645    } else if (bytes_needed == 2 || align_ % 4u) {
4646       bytes_size = 2;
4647       op = use_mubuf ? aco_opcode::buffer_load_ushort
4648            : global  ? aco_opcode::global_load_ushort
4649                      : aco_opcode::flat_load_ushort;
4650    } else if (bytes_needed <= 4) {
4651       bytes_size = 4;
4652       op = use_mubuf ? aco_opcode::buffer_load_dword
4653            : global  ? aco_opcode::global_load_dword
4654                      : aco_opcode::flat_load_dword;
4655    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4656       bytes_size = 8;
4657       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4658            : global  ? aco_opcode::global_load_dwordx2
4659                      : aco_opcode::flat_load_dwordx2;
4660    } else if (bytes_needed <= 12 && !use_mubuf) {
4661       bytes_size = 12;
4662       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4663    } else {
4664       bytes_size = 16;
4665       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4666            : global  ? aco_opcode::global_load_dwordx4
4667                      : aco_opcode::flat_load_dwordx4;
4668    }
4669    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4670    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4671    if (use_mubuf) {
4672       aco_ptr<MUBUF_instruction> mubuf{
4673          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4674       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4675       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4676       mubuf->operands[2] = Operand(offset);
4677       mubuf->glc = info.glc;
4678       mubuf->dlc = false;
4679       mubuf->offset = const_offset;
4680       mubuf->addr64 = addr.type() == RegType::vgpr;
4681       mubuf->disable_wqm = false;
4682       mubuf->sync = info.sync;
4683       mubuf->definitions[0] = Definition(val);
4684       bld.insert(std::move(mubuf));
4685    } else {
4686       aco_ptr<FLAT_instruction> flat{
4687          create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4688       if (addr.regClass() == s2) {
4689          assert(global && offset.id() && offset.type() == RegType::vgpr);
4690          flat->operands[0] = Operand(offset);
4691          flat->operands[1] = Operand(addr);
4692       } else {
4693          assert(addr.type() == RegType::vgpr && !offset.id());
4694          flat->operands[0] = Operand(addr);
4695          flat->operands[1] = Operand(s1);
4696       }
4697       flat->glc = info.glc;
4698       flat->dlc =
4699          info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4700       flat->sync = info.sync;
4701       assert(global || !const_offset);
4702       flat->offset = const_offset;
4703       flat->definitions[0] = Definition(val);
4704       bld.insert(std::move(flat));
4705    }
4706
4707    return val;
4708 }
4709
4710 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
4711
4712 Temp
4713 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4714          Temp address, unsigned base_offset, unsigned align)
4715 {
4716    assert(util_is_power_of_two_nonzero(align));
4717
4718    Builder bld(ctx->program, ctx->block);
4719
4720    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4721    info.align_mul = align;
4722    info.align_offset = 0;
4723    info.sync = memory_sync_info(storage_shared);
4724    info.const_offset = base_offset;
4725    emit_load(ctx, bld, info, lds_load_params);
4726
4727    return dst;
4728 }
4729
/* Splits `src` into `count` temporaries, written to dst[0..count).
 *
 * dst[i] ends up with bytes[i] bytes and register type `dst_type`.
 * Nothing is done when `count` is zero; with a single piece, `src` is only
 * converted to the requested register type. Otherwise `src` is broken into
 * equally sized elements (re-using the elements recorded in
 * ctx->allocated_vec when they are usable, else via p_split_vector) and each
 * destination is either one of those elements or a p_create_vector of
 * several consecutive ones.
 */
4730 void
4731 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4732                  Temp src)
4733 {
4734    if (!count)
4735       return;
4736
4737    Builder bld(ctx->program, ctx->block);
4738
4739    /* count == 1 fast path */
4740    if (count == 1) {
4741       if (dst_type == RegType::sgpr)
4742          dst[0] = bld.as_uniform(src);
4743       else
4744          dst[0] = as_vgpr(ctx, src);
4745       return;
4746    }
4747
4748    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
        /* The accumulation is seeded with 8, so the result is never larger than 8 bytes. */
4749    unsigned elem_size_bytes =
4750       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4751
        /* Note: despite ASSERTED, is_subdword is also read outside of asserts further down. */
4752    ASSERTED bool is_subdword = elem_size_bytes < 4;
4753    assert(!is_subdword || dst_type == RegType::vgpr);
4754
        /* Pre-allocate every destination; single-element destinations are overwritten below. */
4755    for (unsigned i = 0; i < count; i++)
4756       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4757
4758    std::vector<Temp> temps;
4759    /* use allocated_vec if possible */
4760    auto it = ctx->allocated_vec.find(src.id());
4761    if (it != ctx->allocated_vec.end()) {
           /* An element with id 0 means the recorded vector is unusable: fall back to splitting. */
4762       if (!it->second[0].id())
4763          goto split;
4764       unsigned elem_size = it->second[0].bytes();
4765       assert(src.bytes() % elem_size == 0);
4766
4767       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4768          if (!it->second[i].id())
4769             goto split;
4770       }
           /* The recorded elements must evenly divide the element size required above. */
4771       if (elem_size_bytes % elem_size)
4772          goto split;
4773
4774       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4775       elem_size_bytes = elem_size;
4776    }
4777
4778 split:
4779    /* split src if necessary */
4780    if (temps.empty()) {
4781       if (is_subdword && src.type() == RegType::sgpr)
4782          src = as_vgpr(ctx, src);
4783       if (dst_type == RegType::sgpr)
4784          src = bld.as_uniform(src);
4785
4786       unsigned num_elems = src.bytes() / elem_size_bytes;
4787       aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4788          aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4789       split->operands[0] = Operand(src);
4790       for (unsigned i = 0; i < num_elems; i++) {
4791          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4792          split->definitions[i] = Definition(temps.back());
4793       }
4794       bld.insert(std::move(split));
4795    }
4796
        /* Walk the elements in order, handing each destination as many as it needs. */
4797    unsigned idx = 0;
4798    for (unsigned i = 0; i < count; i++) {
4799       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4800       if (op_count == 1) {
              /* Exactly one element: use it directly (converted to dst_type) instead of copying. */
4801          if (dst_type == RegType::sgpr)
4802             dst[i] = bld.as_uniform(temps[idx++]);
4803          else
4804             dst[i] = as_vgpr(ctx, temps[idx++]);
4805          continue;
4806       }
4807
4808       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4809                                                                       Format::PSEUDO, op_count, 1)};
4810       for (unsigned j = 0; j < op_count; j++) {
4811          Temp tmp = temps[idx++];
4812          if (dst_type == RegType::sgpr)
4813             tmp = bld.as_uniform(tmp);
4814          vec->operands[j] = Operand(tmp);
4815       }
4816       vec->definitions[0] = Definition(dst[i]);
4817       bld.insert(std::move(vec));
4818    }
4819    return;
4820 }
4821
4822 bool
4823 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4824 {
4825    unsigned start_elem = ffs(todo_mask) - 1;
4826    bool skip = !(mask & (1 << start_elem));
4827    if (skip)
4828       mask = ~mask & todo_mask;
4829
4830    mask &= todo_mask;
4831
4832    u_bit_scan_consecutive_range(&mask, start, count);
4833
4834    return !skip;
4835 }
4836
4837 void
4838 advance_write_mask(uint32_t* todo_mask, int start, int count)
4839 {
4840    *todo_mask &= ~u_bit_consecutive(0, count) << start;
4841 }
4842
4843 void
4844 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4845           unsigned base_offset, unsigned align)
4846 {
4847    assert(util_is_power_of_two_nonzero(align));
4848    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4849
4850    Builder bld(ctx->program, ctx->block);
4851    bool large_ds_write = ctx->options->gfx_level >= GFX7;
4852    bool usable_write2 = ctx->options->gfx_level >= GFX7;
4853
4854    unsigned write_count = 0;
4855    Temp write_datas[32];
4856    unsigned offsets[32];
4857    unsigned bytes[32];
4858    aco_opcode opcodes[32];
4859
4860    wrmask = util_widen_mask(wrmask, elem_size_bytes);
4861
4862    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
4863    uint32_t todo = u_bit_consecutive(0, data.bytes());
4864
4865    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
4866       todo = MIN2(todo, wrmask);
4867
4868    while (todo) {
4869       int offset, byte;
4870       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4871          offsets[write_count] = offset;
4872          bytes[write_count] = byte;
4873          opcodes[write_count] = aco_opcode::num_opcodes;
4874          write_count++;
4875          advance_write_mask(&todo, offset, byte);
4876          continue;
4877       }
4878
4879       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4880       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4881       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4882       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4883
4884       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4885       aco_opcode op = aco_opcode::num_opcodes;
4886       if (byte >= 16 && aligned16 && large_ds_write) {
4887          op = aco_opcode::ds_write_b128;
4888          byte = 16;
4889       } else if (byte >= 12 && aligned16 && large_ds_write) {
4890          op = aco_opcode::ds_write_b96;
4891          byte = 12;
4892       } else if (byte >= 8 && aligned8) {
4893          op = aco_opcode::ds_write_b64;
4894          byte = 8;
4895       } else if (byte >= 4 && aligned4) {
4896          op = aco_opcode::ds_write_b32;
4897          byte = 4;
4898       } else if (byte >= 2 && aligned2) {
4899          op = aco_opcode::ds_write_b16;
4900          byte = 2;
4901       } else if (byte >= 1) {
4902          op = aco_opcode::ds_write_b8;
4903          byte = 1;
4904       } else {
4905          assert(false);
4906       }
4907
4908       offsets[write_count] = offset;
4909       bytes[write_count] = byte;
4910       opcodes[write_count] = op;
4911       write_count++;
4912       advance_write_mask(&todo, offset, byte);
4913    }
4914
4915    Operand m = load_lds_size_m0(bld);
4916
4917    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4918
4919    for (unsigned i = 0; i < write_count; i++) {
4920       aco_opcode op = opcodes[i];
4921       if (op == aco_opcode::num_opcodes)
4922          continue;
4923
4924       Temp split_data = write_datas[i];
4925
4926       unsigned second = write_count;
4927       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4928          for (second = i + 1; second < write_count; second++) {
4929             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4930                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4931                opcodes[second] = aco_opcode::num_opcodes;
4932                break;
4933             }
4934          }
4935       }
4936
4937       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4938       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4939
4940       unsigned inline_offset = base_offset + offsets[i];
4941       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4942       Temp address_offset = address;
4943       if (inline_offset > max_offset) {
4944          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4945          inline_offset = offsets[i];
4946       }
4947
4948       /* offsets[i] shouldn't be large enough for this to happen */
4949       assert(inline_offset <= max_offset);
4950
4951       Instruction* instr;
4952       if (write2) {
4953          Temp second_data = write_datas[second];
4954          inline_offset /= split_data.bytes();
4955          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4956                         inline_offset + write2_off);
4957       } else {
4958          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4959       }
4960       instr->ds().sync = memory_sync_info(storage_shared);
4961
4962       if (m.isUndefined())
4963          instr->operands.pop_back();
4964    }
4965 }
4966
4967 aco_opcode
4968 get_buffer_store_op(unsigned bytes)
4969 {
4970    switch (bytes) {
4971    case 1: return aco_opcode::buffer_store_byte;
4972    case 2: return aco_opcode::buffer_store_short;
4973    case 4: return aco_opcode::buffer_store_dword;
4974    case 8: return aco_opcode::buffer_store_dwordx2;
4975    case 12: return aco_opcode::buffer_store_dwordx3;
4976    case 16: return aco_opcode::buffer_store_dwordx4;
4977    }
4978    unreachable("Unexpected store size");
4979    return aco_opcode::num_opcodes;
4980 }
4981
4982 void
4983 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4984                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4985                    Temp* write_datas, unsigned* offsets)
4986 {
4987    unsigned write_count_with_skips = 0;
4988    bool skips[16];
4989    unsigned bytes[16];
4990
4991    /* determine how to split the data */
4992    unsigned todo = u_bit_consecutive(0, data.bytes());
4993    while (todo) {
4994       int offset, byte;
4995       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4996       offsets[write_count_with_skips] = offset;
4997       if (skips[write_count_with_skips]) {
4998          bytes[write_count_with_skips] = byte;
4999          advance_write_mask(&todo, offset, byte);
5000          write_count_with_skips++;
5001          continue;
5002       }
5003
5004       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
5005        * larger than swizzle_element_size */
5006       byte = MIN2(byte, swizzle_element_size);
5007       if (byte % 4)
5008          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5009
5010       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5011       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5012          byte = 8;
5013
5014       /* dword or larger stores have to be dword-aligned */
5015       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5016       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5017       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5018       if (!dword_aligned)
5019          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5020
5021       bytes[write_count_with_skips] = byte;
5022       advance_write_mask(&todo, offset, byte);
5023       write_count_with_skips++;
5024    }
5025
5026    /* actually split data */
5027    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5028
5029    /* remove skips */
5030    for (unsigned i = 0; i < write_count_with_skips; i++) {
5031       if (skips[i])
5032          continue;
5033       write_datas[*write_count] = write_datas[i];
5034       offsets[*write_count] = offsets[i];
5035       (*write_count)++;
5036    }
5037 }
5038
5039 Temp
5040 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5041                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5042 {
5043    Builder bld(ctx->program, ctx->block);
5044    unsigned dword_size = elem_size_bytes / 4;
5045
5046    if (!dst.id())
5047       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5048
5049    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5050    aco_ptr<Pseudo_instruction> instr{
5051       create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5052    instr->definitions[0] = Definition(dst);
5053
5054    for (unsigned i = 0; i < cnt; ++i) {
5055       if (arr[i].id()) {
5056          assert(arr[i].size() == dword_size);
5057          allocated_vec[i] = arr[i];
5058          instr->operands[i] = Operand(arr[i]);
5059       } else {
5060          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5061                               Operand::zero(dword_size == 2 ? 8 : 4));
5062          allocated_vec[i] = zero;
5063          instr->operands[i] = Operand(zero);
5064       }
5065    }
5066
5067    bld.insert(std::move(instr));
5068
5069    if (split_cnt)
5070       emit_split_vector(ctx, dst, split_cnt);
5071    else
5072       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5073
5074    return dst;
5075 }
5076
5077 inline unsigned
5078 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5079 {
5080    if (const_offset >= 4096) {
5081       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5082       const_offset %= 4096u;
5083
5084       if (!voffset.id())
5085          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5086       else if (unlikely(voffset.regClass() == s1))
5087          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5088                             Operand::c32(excess_const_offset), Operand(voffset));
5089       else if (likely(voffset.regClass() == v1))
5090          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5091       else
5092          unreachable("Unsupported register class of voffset");
5093    }
5094
5095    return const_offset;
5096 }
5097
5098 void
5099 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5100                         Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc,
5101                         bool slc, bool swizzled)
5102 {
5103    assert(vdata.id());
5104    assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5105    assert(vdata.size() >= 1 && vdata.size() <= 4);
5106
5107    Builder bld(ctx->program, ctx->block);
5108    aco_opcode op = get_buffer_store_op(vdata.bytes());
5109    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
5110
5111    bool offen = voffset.id();
5112    bool idxen = idx.id();
5113
5114    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
5115    glc &= ctx->program->gfx_level < GFX11;
5116
5117    Operand vaddr_op(v1);
5118    if (offen && idxen)
5119       vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
5120    else if (offen)
5121       vaddr_op = Operand(voffset);
5122    else if (idxen)
5123       vaddr_op = Operand(idx);
5124
5125    Builder::Result r =
5126       bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen,
5127                 swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
5128                 /* dlc*/ false, slc);
5129
5130    r->mubuf().sync = sync;
5131 }
5132
5133 void
5134 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5135                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
5136                  bool swizzled, memory_sync_info sync, bool glc, bool slc)
5137 {
5138    Builder bld(ctx->program, ctx->block);
5139    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
5140           elem_size_bytes == 8);
5141    assert(write_mask);
5142    write_mask = util_widen_mask(write_mask, elem_size_bytes);
5143
5144    unsigned write_count = 0;
5145    Temp write_datas[32];
5146    unsigned offsets[32];
5147    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
5148                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
5149                       write_datas, offsets);
5150
5151    for (unsigned i = 0; i < write_count; i++) {
5152       unsigned const_offset = offsets[i] + base_const_offset;
5153       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset,
5154                               sync, glc, slc, swizzled);
5155    }
5156 }
5157
5158 Temp
5159 wave_id_in_threadgroup(isel_context* ctx)
5160 {
5161    Builder bld(ctx->program, ctx->block);
5162    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5163                    get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5164 }
5165
5166 Temp
5167 thread_id_in_threadgroup(isel_context* ctx)
5168 {
5169    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5170
5171    Builder bld(ctx->program, ctx->block);
5172    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5173
5174    if (ctx->program->workgroup_size <= ctx->program->wave_size)
5175       return tid_in_wave;
5176
5177    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5178    Temp num_pre_threads =
5179       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5180                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5181    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5182 }
5183
5184 bool
5185 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5186 {
5187    unsigned write_mask = nir_intrinsic_write_mask(instr);
5188    unsigned component = nir_intrinsic_component(instr);
5189    nir_src offset = *nir_get_io_offset_src(instr);
5190
5191    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5192       return false;
5193
5194    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5195
5196    if (instr->src[0].ssa->bit_size == 64)
5197       write_mask = util_widen_mask(write_mask, 2);
5198
5199    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5200
5201    /* Use semantic location as index. radv already uses it as intrinsic base
5202     * but radeonsi does not. We need to make LS output and TCS input index
5203     * match each other, so need to use semantic location explicitly. Also for
5204     * TCS epilog to index tess factor temps using semantic location directly.
5205     */
5206    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5207    unsigned base = sem.location + sem.dual_source_blend_index;
5208    unsigned idx = base * 4u + component;
5209
5210    for (unsigned i = 0; i < 8; ++i) {
5211       if (write_mask & (1 << i)) {
5212          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5213          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5214       }
5215       idx++;
5216    }
5217
5218    if (ctx->stage == fragment_fs && ctx->program->info.has_epilog) {
5219       unsigned index = base - FRAG_RESULT_DATA0;
5220
5221       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5222          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5223       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5224          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5225       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5226          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5227       }
5228    }
5229
5230    return true;
5231 }
5232
5233 bool
5234 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5235 {
5236    /* Only TCS per-vertex inputs are supported by this function.
5237     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
5238     * is the same.
5239     */
5240    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5241       return false;
5242
5243    nir_src* off_src = nir_get_io_offset_src(instr);
5244    nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5245    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5246    bool can_use_temps =
5247       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5248       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5249
5250    if (!can_use_temps)
5251       return false;
5252
5253    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5254
5255    unsigned idx =
5256       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5257    Temp* src = &ctx->inputs.temps[idx];
5258    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5259
5260    return true;
5261 }
5262
5263 void
5264 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5265 {
5266    /* LS pass output to TCS by temp if they have same in/out patch size. */
5267    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5268                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5269
5270    bool tcs_need_output = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL &&
5271                           ctx->program->info.has_epilog &&
5272                           ctx->program->info.tcs.pass_tessfactors_by_reg;
5273
5274    bool ps_need_output = ctx->stage == fragment_fs;
5275
5276    if (ls_need_output || tcs_need_output || ps_need_output) {
5277       bool stored_to_temps = store_output_to_temps(ctx, instr);
5278       if (!stored_to_temps) {
5279          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5280          abort();
5281       }
5282    } else {
5283       unreachable("Shader stage not implemented");
5284    }
5285 }
5286
5287 bool
5288 in_exec_divergent_or_in_loop(isel_context* ctx)
5289 {
5290    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5291           ctx->cf_info.had_divergent_discard;
5292 }
5293
/* Emits the interpolation of one input component (GFX11 path of emit_interp_instr).
 *
 * idx/component: attribute and channel to interpolate.
 * src:           vector whose elements 0 and 1 are the two interpolation coordinates.
 * dst:           destination temporary (v2b selects the 16-bit instructions).
 * prim_mask:     value passed to the instructions through m0.
 *
 * Inside loops, divergent branches or after a divergent discard a single
 * p_interp_gfx11 pseudo instruction is emitted. Otherwise the parameter is
 * fetched with lds_param_load and interpolated with the in-register
 * v_interp_p10 / v_interp_p2 instructions.
 */
5294 void
5295 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5296                         Temp prim_mask)
5297 {
5298    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5299    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5300
5301    Builder bld(ctx->program, ctx->block);
5302
5303    if (in_exec_divergent_or_in_loop(ctx)) {
5304       Operand prim_mask_op = bld.m0(prim_mask);
5305       prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5306       Operand coord2_op(coord2);
5307       coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
5308       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5309                  Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op);
5310       return;
5311    }
5312
5313    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5314
        /* Two-step interpolation: the p10 result feeds the p2 instruction. */
5315    Temp res;
5316    if (dst.regClass() == v2b) {
5317       Temp p10 =
5318          bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
5319       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
5320    } else {
5321       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5322       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), p, coord2, p10);
5323    }
5324    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
        /* The 16-bit result lives in a v1 temporary, so it is extracted into dst afterwards. */
5325    if (dst.regClass() != v2b)
5326       emit_wqm(bld, res, dst, true);
5327    else
5328       emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
5329 }
5330
/* Emits the interpolation of one input component into `dst`.
 *
 * idx/component: attribute and channel to interpolate.
 * src:           vector whose elements 0 and 1 are the two interpolation coordinates.
 * dst:           destination temporary (v2b selects the 16-bit instructions).
 * prim_mask:     value passed to the instructions through m0.
 *
 * GFX11 and newer are handled by emit_interp_instr_gfx11(). Older chips use
 * the v_interp_p1* / v_interp_p2* instruction pairs, with variants chosen by
 * destination size, by whether the device has 16-bank LDS and by chip
 * generation.
 */
5331 void
5332 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5333                   Temp prim_mask)
5334 {
5335    if (ctx->options->gfx_level >= GFX11) {
5336       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask);
5337       return;
5338    }
5339
5340    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5341    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5342
5343    Builder bld(ctx->program, ctx->block);
5344
5345    if (dst.regClass() == v2b) {
5346       if (ctx->program->dev.has_16bank_lds) {
5347          assert(ctx->options->gfx_level <= GFX8);
              /* Three steps here: v_interp_mov_f32 first, whose result is an input of p1lv. */
5348          Builder::Result interp_p1 =
5349             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5350                        bld.m0(prim_mask), idx, component);
5351          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
5352                                 bld.m0(prim_mask), interp_p1, idx, component);
5353          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5354                     interp_p1, idx, component);
5355       } else {
5356          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5357
              /* GFX8 uses the legacy variant of the second instruction. */
5358          if (ctx->options->gfx_level == GFX8)
5359             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5360
5361          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5362                                                 bld.m0(prim_mask), idx, component);
5363          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5364                     component);
5365       }
5366    } else {
5367       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5368                                              bld.m0(prim_mask), idx, component);
5369
           /* With 16-bank LDS, the first operand (coord1) of the p1 instruction is late-killed. */
5370       if (ctx->program->dev.has_16bank_lds)
5371          interp_p1->operands[0].setLateKill(true);
5372
5373       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5374                  idx, component);
5375    }
5376 }
5377
5378 void
5379 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5380                       Temp dst, Temp prim_mask)
5381 {
5382    Builder bld(ctx->program, ctx->block);
5383    if (ctx->options->gfx_level >= GFX11) {
5384       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5385       if (in_exec_divergent_or_in_loop(ctx)) {
5386          Operand prim_mask_op = bld.m0(prim_mask);
5387          prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5388          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5389                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5390                     prim_mask_op);
5391       } else {
5392          Temp p =
5393             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5394          Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
5395
5396          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5397          if (dst.regClass() != v2b)
5398             emit_wqm(bld, res, dst, true);
5399          else
5400             emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
5401       }
5402    } else {
5403       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
5404                  bld.m0(prim_mask), idx, component);
5405    }
5406 }
5407
5408 void
5409 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5410 {
5411    Builder bld(ctx->program, ctx->block);
5412
5413    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5414       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5415    for (unsigned i = 0; i < num_components; i++) {
5416       if (ctx->args->frag_pos[i].used)
5417          vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5418       else
5419          vec->operands[i] = Operand(v1);
5420    }
5421    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5422       assert(num_components == 4);
5423       vec->operands[3] =
5424          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5425    }
5426
5427    for (Operand& op : vec->operands)
5428       op = op.isUndefined() ? Operand::zero() : op;
5429
5430    vec->definitions[0] = Definition(dst);
5431    ctx->block->instructions.emplace_back(std::move(vec));
5432    emit_split_vector(ctx, dst, num_components);
5433    return;
5434 }
5435
/* Computes the fragment shading rate into `dst`.
 *
 * Two 2-bit fields are extracted from the ancillary argument with v_bfe_u32.
 * Each field is compared against 1 and turned into a flag value (4 for X,
 * 1 for Y, 0 when the comparison fails, per the comments below), and the
 * two flags are OR'ed together.
 */
5436 void
5437 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5438 {
5439    Builder bld(ctx->program, ctx->block);
5440    Temp cond;
5441
5442    /* VRS Rate X = Ancillary[2:3]
5443     * VRS Rate Y = Ancillary[4:5]
5444     */
5445    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5446                           Operand::c32(2u), Operand::c32(2u));
5447    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5448                           Operand::c32(4u), Operand::c32(2u));
5449
5450    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5451    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5452    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5453                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5454
5455    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5456    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5457    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5458                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5459
        /* Combine both flags into the final value. */
5460    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5461 }
5462
5463 void
5464 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5465 {
5466    Temp dst = get_ssa_temp(ctx, &instr->def);
5467    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5468    unsigned idx = nir_intrinsic_base(instr);
5469    unsigned component = nir_intrinsic_component(instr);
5470    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5471
5472    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5473
5474    if (instr->def.num_components == 1) {
5475       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5476    } else {
5477       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5478          aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
5479       for (unsigned i = 0; i < instr->def.num_components; i++) {
5480          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5481          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5482          vec->operands[i] = Operand(tmp);
5483       }
5484       vec->definitions[0] = Definition(dst);
5485       ctx->block->instructions.emplace_back(std::move(vec));
5486    }
5487 }
5488
5489 Temp
5490 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5491                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5492 {
5493    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5494    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5495
5496    if (info.soffset.id()) {
5497       if (soffset.isTemp())
5498          vaddr = bld.copy(bld.def(v1), soffset);
5499       soffset = Operand(info.soffset);
5500    }
5501
5502    if (soffset.isUndefined())
5503       soffset = Operand::zero();
5504
5505    const bool offen = !vaddr.isUndefined();
5506    const bool idxen = info.idx.id();
5507
5508    if (offen && idxen)
5509       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5510    else if (idxen)
5511       vaddr = Operand(info.idx);
5512
5513    /* Determine number of fetched components.
5514     * Note, ACO IR works with GFX6-8 nfmt + dfmt fields, these are later converted for GFX10+.
5515     */
5516    const struct ac_vtx_format_info* vtx_info =
5517       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5518    /* The number of channels in the format determines the memory range. */
5519    const unsigned max_components = vtx_info->num_channels;
5520    /* Calculate maximum number of components loaded according to alignment. */
5521    unsigned max_fetched_components = bytes_needed / info.component_size;
5522    max_fetched_components =
5523       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5524                              alignment, max_fetched_components);
5525    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5526    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5527     * If a larger format is selected, it's still OK to load a smaller amount from it.
5528     */
5529    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5530    unsigned bytes_size = 0;
5531    const unsigned bit_size = info.component_size * 8;
5532    aco_opcode op = aco_opcode::num_opcodes;
5533
5534    if (bytes_needed == 2) {
5535       bytes_size = 2;
5536       op = aco_opcode::tbuffer_load_format_d16_x;
5537    } else if (bytes_needed <= 4) {
5538       bytes_size = 4;
5539       if (bit_size == 16)
5540          op = aco_opcode::tbuffer_load_format_d16_xy;
5541       else
5542          op = aco_opcode::tbuffer_load_format_x;
5543    } else if (bytes_needed <= 6) {
5544       bytes_size = 6;
5545       if (bit_size == 16)
5546          op = aco_opcode::tbuffer_load_format_d16_xyz;
5547       else
5548          op = aco_opcode::tbuffer_load_format_xy;
5549    } else if (bytes_needed <= 8) {
5550       bytes_size = 8;
5551       if (bit_size == 16)
5552          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5553       else
5554          op = aco_opcode::tbuffer_load_format_xy;
5555    } else if (bytes_needed <= 12) {
5556       bytes_size = 12;
5557       op = aco_opcode::tbuffer_load_format_xyz;
5558    } else {
5559       bytes_size = 16;
5560       op = aco_opcode::tbuffer_load_format_xyzw;
5561    }
5562
5563    /* Abort when suitable opcode wasn't found so we don't compile buggy shaders. */
5564    if (op == aco_opcode::num_opcodes) {
5565       aco_err(bld.program, "unsupported bit size for typed buffer load");
5566       abort();
5567    }
5568
5569    aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
5570    mtbuf->operands[0] = Operand(info.resource);
5571    mtbuf->operands[1] = vaddr;
5572    mtbuf->operands[2] = soffset;
5573    mtbuf->offen = offen;
5574    mtbuf->idxen = idxen;
5575    mtbuf->glc = info.glc;
5576    mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
5577    mtbuf->slc = info.slc;
5578    mtbuf->sync = info.sync;
5579    mtbuf->offset = const_offset;
5580    mtbuf->dfmt = fetch_fmt & 0xf;
5581    mtbuf->nfmt = fetch_fmt >> 4;
5582    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5583    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5584    mtbuf->definitions[0] = Definition(val);
5585    bld.insert(std::move(mtbuf));
5586
5587    return val;
5588 }
5589
/* Load parameters for typed-buffer (MTBUF) loads, built around mtbuf_load_callback.
 * NOTE(review): the meaning of the remaining initializers (false, true, 4096) is
 * defined by EmitLoadParameters, which is declared outside this chunk - confirm there.
 */
const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5591
5592 void
5593 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5594 {
5595    Builder bld(ctx->program, ctx->block);
5596    Temp dst = get_ssa_temp(ctx, &instr->def);
5597    nir_src offset = *nir_get_io_offset_src(instr);
5598
5599    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5600       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5601
5602    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5603
5604    unsigned idx = nir_intrinsic_base(instr);
5605    unsigned component = nir_intrinsic_component(instr);
5606    unsigned vertex_id = 0; /* P0 */
5607
5608    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5609       vertex_id = nir_src_as_uint(instr->src[0]);
5610
5611    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5612       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask);
5613    } else {
5614       unsigned num_components = instr->def.num_components;
5615       if (instr->def.bit_size == 64)
5616          num_components *= 2;
5617       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5618          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5619       for (unsigned i = 0; i < num_components; i++) {
5620          unsigned chan_component = (component + i) % 4;
5621          unsigned chan_idx = idx + (component + i) / 4;
5622          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5623          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5624                                prim_mask);
5625       }
5626       vec->definitions[0] = Definition(dst);
5627       bld.insert(std::move(vec));
5628    }
5629 }
5630
5631 void
5632 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5633 {
5634    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5635
5636    Builder bld(ctx->program, ctx->block);
5637    Temp dst = get_ssa_temp(ctx, &instr->def);
5638
5639    if (load_input_from_temps(ctx, instr, dst))
5640       return;
5641
5642    unreachable("LDS-based TCS input should have been lowered in NIR.");
5643 }
5644
5645 void
5646 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5647 {
5648    switch (ctx->shader->info.stage) {
5649    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5650    default: unreachable("Unimplemented shader stage");
5651    }
5652 }
5653
5654 void
5655 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5656 {
5657    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5658
5659    Builder bld(ctx->program, ctx->block);
5660    Temp dst = get_ssa_temp(ctx, &instr->def);
5661
5662    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5663    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5664    Operand tes_w = Operand::zero();
5665
5666    if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5667       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5668       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5669       tes_w = Operand(tmp);
5670    }
5671
5672    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5673    emit_split_vector(ctx, tess_coord, 3);
5674 }
5675
5676 void
5677 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5678             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5679             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5680 {
5681    Builder bld(ctx->program, ctx->block);
5682
5683    bool use_smem =
5684       dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5685    if (use_smem)
5686       offset = bld.as_uniform(offset);
5687    else {
5688       /* GFX6-7 are affected by a hw bug that prevents address clamping to
5689        * work correctly when the SGPR offset is used.
5690        */
5691       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5692          offset = as_vgpr(ctx, offset);
5693    }
5694
5695    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5696    info.glc = glc;
5697    info.sync = sync;
5698    info.align_mul = align_mul;
5699    info.align_offset = align_offset;
5700    if (use_smem)
5701       emit_load(ctx, bld, info, smem_load_params);
5702    else
5703       emit_load(ctx, bld, info, mubuf_load_params);
5704 }
5705
5706 void
5707 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5708 {
5709    Temp dst = get_ssa_temp(ctx, &instr->def);
5710    Builder bld(ctx->program, ctx->block);
5711    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5712
5713    unsigned size = instr->def.bit_size / 8;
5714    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5715                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5716 }
5717
/* Instruction selection for a push constant load.
 *
 * Fast path: when the index (src[0]) is constant, the value is at least 32 bits
 * wide and every requested dword is flagged in inline_push_const_mask, the result
 * is assembled directly from the inline_push_consts shader arguments.
 *
 * Otherwise the data is fetched with an s_load_dword* from the push_constants
 * pointer. 8/16-bit loads whose byte offset is not known to be dword-aligned are
 * loaded into a temporary and realigned with byte_align_scalar(); 3- and 6-dword
 * results are loaded as 4 / 8 dwords and trimmed afterwards.
 */
void
visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->def);
   unsigned offset = nir_intrinsic_base(instr);
   unsigned count = instr->def.num_components;
   nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);

   /* From here on, count is in dwords for 64-bit values. */
   if (instr->def.bit_size == 64)
      count *= 2;

   if (index_cv && instr->def.bit_size >= 32) {
      /* First dword touched by this load, and the mask of all dwords it covers. */
      unsigned start = (offset + index_cv->u32) / 4u;
      /* NOTE(review): the shift is evaluated before the `start + count` range check
       * below, so it relies on start being smaller than 64 - confirm callers bound it.
       */
      uint64_t mask = BITFIELD64_MASK(count) << start;
      if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
          start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
         std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
         /* Index of the first needed argument: the number of inlined dwords below start. */
         unsigned arg_index =
            util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
         for (unsigned i = 0; i < count; ++i) {
            elems[i] = get_arg(ctx, ctx->args->inline_push_consts[arg_index++]);
            vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         /* Record the individual elements for dst in ctx->allocated_vec. */
         ctx->allocated_vec.emplace(dst.id(), elems);
         return;
      }
   }

   /* Slow path: scalar memory load relative to the push constant pointer. */
   Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
   if (offset != 0) // TODO check if index != 0 as well
      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
                             Operand::c32(offset), index);
   Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->push_constants));
   Temp vec = dst;
   bool trim = false;
   bool aligned = true;

   /* Sub-dword loads count as aligned only when the index is constant and the
    * resulting byte offset is a multiple of 4. Unaligned loads go to a temporary
    * whose size is chosen from count (and, for 8-bit, from whether the bytes
    * provably fit in one dword).
    */
   if (instr->def.bit_size == 8) {
      aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
      bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
      if (!aligned)
         vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
   } else if (instr->def.bit_size == 16) {
      aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
      if (!aligned)
         vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
   }

   aco_opcode op;

   /* Pick the load width from the destination size in dwords. Sizes 3 and 6 are
    * rounded up to 4 and 8 and flagged for trimming.
    */
   switch (vec.size()) {
   case 1: op = aco_opcode::s_load_dword; break;
   case 2: op = aco_opcode::s_load_dwordx2; break;
   case 3:
      vec = bld.tmp(s4);
      trim = true;
      FALLTHROUGH;
   case 4: op = aco_opcode::s_load_dwordx4; break;
   case 6:
      vec = bld.tmp(s8);
      trim = true;
      FALLTHROUGH;
   case 8: op = aco_opcode::s_load_dwordx8; break;
   default: unreachable("unimplemented or forbidden load_push_constant.");
   }

   bld.smem(op, Definition(vec), ptr, index);

   if (!aligned) {
      /* Pass the byte offset within the dword (constant if known, otherwise the
       * index itself) to byte_align_scalar() together with the loaded data and dst.
       */
      Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
      byte_align_scalar(ctx, vec, byte_offset, dst);
      return;
   }

   if (trim) {
      /* Split the over-sized load into four equal pieces and keep the first three. */
      emit_split_vector(ctx, vec, 4);
      RegClass rc = dst.size() == 3 ? s1 : s2;
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
                 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
   }
   emit_split_vector(ctx, dst, instr->def.num_components);
}
5805
5806 void
5807 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5808 {
5809    Temp dst = get_ssa_temp(ctx, &instr->def);
5810
5811    Builder bld(ctx->program, ctx->block);
5812
5813    uint32_t desc_type =
5814       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5815       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5816    if (ctx->options->gfx_level >= GFX10) {
5817       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5818                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5819                    S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5820    } else {
5821       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5822                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5823    }
5824
5825    unsigned base = nir_intrinsic_base(instr);
5826    unsigned range = nir_intrinsic_range(instr);
5827
5828    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5829    if (base && offset.type() == RegType::sgpr)
5830       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5831                               Operand::c32(base));
5832    else if (base && offset.type() == RegType::vgpr)
5833       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5834
5835    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5836                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5837                                      Operand::c32(ctx->constant_data_offset)),
5838                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5839                           Operand::c32(desc_type));
5840    unsigned size = instr->def.bit_size / 8;
5841    // TODO: get alignment information for subdword constants
5842    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5843 }
5844
5845 /* Packs multiple Temps of different sizes in to a vector of v1 Temps.
5846  * The byte count of each input Temp must be a multiple of 2.
5847  */
5848 static std::vector<Temp>
5849 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5850 {
5851    Builder bld(ctx->program, ctx->block);
5852    std::vector<Temp> packed;
5853    Temp low = Temp();
5854    for (Temp tmp : unpacked) {
5855       assert(tmp.bytes() % 2 == 0);
5856       unsigned byte_idx = 0;
5857       while (byte_idx < tmp.bytes()) {
5858          if (low != Temp()) {
5859             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5860             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
5861             low = Temp();
5862             packed.push_back(dword);
5863             byte_idx += 2;
5864          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
5865             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5866             byte_idx += 4;
5867          } else {
5868             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5869             byte_idx += 2;
5870          }
5871       }
5872    }
5873    if (low != Temp()) {
5874       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
5875       packed.push_back(dword);
5876    }
5877    return packed;
5878 }
5879
5880 static bool
5881 should_declare_array(ac_image_dim dim)
5882 {
5883    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5884           dim == ac_image_2darraymsaa;
5885 }
5886
5887 static int
5888 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5889 {
5890    switch (dim) {
5891    case GLSL_SAMPLER_DIM_BUF: return 1;
5892    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5893    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5894    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
5895    case GLSL_SAMPLER_DIM_3D:
5896    case GLSL_SAMPLER_DIM_CUBE: return 3;
5897    case GLSL_SAMPLER_DIM_RECT:
5898    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5899    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
5900    default: break;
5901    }
5902    return 0;
5903 }
5904
/* Creates and inserts a MIMG instruction.
 *
 * Operands are laid out as: [0] rsrc, [1] samp, [2] vdata, [3..] address operands
 * taken from coords. The first nsa_size coords stay separate operands; any coords
 * beyond that are merged into a single VGPR vector operand.
 *
 * \param dst        Result temporary; an id of 0 means the instruction has no definition.
 * \param coords     Address components (taken by value, modified locally).
 *                   coords[0] is read unconditionally, so it must not be empty.
 * \param needs_wqm  If set, the result is written to a fresh temporary and passed
 *                   to emit_wqm() to produce dst (requires a dst).
 * \return Pointer to the inserted instruction so callers can set further fields.
 */
static MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
          bool needs_wqm = false, Operand vdata = Operand(v1))
{
   /* Number of leading coords kept as individual operands. Before GFX11, if the
    * coords do not all fit in max_nsa_vgprs, none are kept separate (nsa_size = 0).
    */
   size_t nsa_size = bld.program->dev.max_nsa_vgprs;
   nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;

   /* A linear-VGPR first coordinate selects strict WQM; all coords then stay separate. */
   const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
   if (strict_wqm)
      nsa_size = coords.size();

   /* Make sure every separately-passed coord is a VGPR (entries with id 0 are skipped). */
   for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
      if (!coords[i].id())
         continue;

      coords[i] = as_vgpr(bld, coords[i]);
   }

   if (nsa_size < coords.size()) {
      /* Collapse coords[nsa_size..] into one operand. */
      Temp coord = coords[nsa_size];
      if (coords.size() - nsa_size > 1) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};

         /* Total size of the merged vector, summed from the individual coord sizes. */
         unsigned coord_size = 0;
         for (unsigned i = nsa_size; i < coords.size(); i++) {
            vec->operands[i - nsa_size] = Operand(coords[i]);
            coord_size += coords[i].size();
         }

         coord = bld.tmp(RegType::vgpr, coord_size);
         vec->definitions[0] = Definition(coord);
         bld.insert(std::move(vec));
      } else {
         coord = as_vgpr(bld, coord);
      }

      coords[nsa_size] = coord;
      coords.resize(nsa_size + 1);
   }

   bool has_dst = dst.id() != 0;
   assert(!needs_wqm || has_dst);
   Temp tmp_dst = needs_wqm ? bld.tmp(dst.regClass()) : dst;

   aco_ptr<MIMG_instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
   if (has_dst)
      mimg->definitions[0] = Definition(tmp_dst);
   mimg->operands[0] = Operand(rsrc);
   mimg->operands[1] = samp;
   mimg->operands[2] = vdata;
   for (unsigned i = 0; i < coords.size(); i++)
      mimg->operands[3 + i] = Operand(coords[i]);
   mimg->strict_wqm = strict_wqm;

   /* Grab the raw pointer before ownership moves into the block. */
   MIMG_instruction* res = mimg.get();
   bld.insert(std::move(mimg));
   if (needs_wqm)
      emit_wqm(bld, tmp_dst, dst, true);
   return res;
}
5967
5968 void
5969 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5970 {
5971    Builder bld(ctx->program, ctx->block);
5972    Temp dst = get_ssa_temp(ctx, &instr->def);
5973    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5974    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5975    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5976    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5977    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5978    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5979
5980    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
5981     * There are five smaller vector groups:
5982     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
5983     * These directly match the NIR intrinsic sources.
5984     */
5985    std::vector<Temp> args = {
5986       node, tmax, origin, dir, inv_dir,
5987    };
5988
5989    if (bld.program->gfx_level == GFX10_3) {
5990       std::vector<Temp> scalar_args;
5991       for (Temp tmp : args) {
5992          for (unsigned i = 0; i < tmp.size(); i++)
5993             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
5994       }
5995       args = std::move(scalar_args);
5996    }
5997
5998    MIMG_instruction* mimg =
5999       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6000    mimg->dim = ac_image_1d;
6001    mimg->dmask = 0xf;
6002    mimg->unrm = true;
6003    mimg->r128 = true;
6004
6005    emit_split_vector(ctx, dst, instr->def.num_components);
6006 }
6007
/* Collects the address operands for an image intrinsic and returns them packed
 * into dwords by emit_pack_v1().
 *
 * The list is built in this order: the coordinates from src[1], an optional extra
 * layer operand (2D-view-of-3D workaround), the sample index from src[2] for
 * multisampled images, and the LOD when one is present and not a constant zero.
 * Coordinates are 16-bit (a16) when src[1] is 16 bits wide.
 */
static std::vector<Temp>
get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
{

   Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
   bool a16 = instr->src[1].ssa->bit_size == 16;
   RegClass rc = a16 ? v2b : v1;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);
   ASSERTED bool add_frag_pos =
      (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
   assert(!add_frag_pos && "Input attachments should be lowered.");
   bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
   bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
   int count = image_type_to_components_count(dim, is_array);
   std::vector<Temp> coords;
   Builder bld(ctx->program, ctx->block);

   if (gfx9_1d) {
      /* 1D images on GFX9: a zero is inserted as the second coordinate, and for
       * arrays the second source component follows as the third coordinate.
       */
      coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
      coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
      if (is_array)
         coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
   } else {
      for (int i = 0; i < count; i++)
         coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
   }

   bool has_lod = false;
   Temp lod;

   /* Only these intrinsics are checked for a LOD source: index 4 for stores,
    * index 3 for (sparse) loads. A constant-zero LOD is treated as "no LOD".
    */
   if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
       instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
       instr->intrinsic == nir_intrinsic_bindless_image_store) {
      int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
      assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
      has_lod =
         !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;

      if (has_lod)
         lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
   }

   if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
      /* The hw can't bind a slice of a 3D image as a 2D image, because it
       * ignores BASE_ARRAY if the target is 3D. The workaround is to read
       * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
       */
      assert(ctx->options->gfx_level == GFX9);
      Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
      Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
      /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
      Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
                                  Operand::c32(13u));

      if (has_lod) {
         /* If there's a lod parameter, it matters whether the image is 3D or 2D,
          * because the hw reads either the fourth or the third component as lod.
          * So detect 3D images, and place the lod at the third component otherwise.
          * For non-3D descriptors we effectively add lod twice to coords,
          * but the hw will only read the first one; the second is ignored.
          */
         Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
         Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
                              Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
         Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
                                   Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
         first_layer =
            bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
      }

      /* With 16-bit addresses only the low half of the computed dword is used. */
      if (a16)
         coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
      else
         coords.emplace_back(first_layer);
   }

   /* Multisampled images take the sample index from src[2], except for fragment mask loads. */
   if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
      assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
      coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
   }

   if (has_lod)
      coords.emplace_back(lod);

   return emit_pack_v1(ctx, coords);
}
6095
6096 memory_sync_info
6097 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6098 {
6099    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6100    if (semantics & semantic_atomicrmw)
6101       return memory_sync_info(storage, semantics);
6102
6103    unsigned access = nir_intrinsic_access(instr);
6104
6105    if (access & ACCESS_VOLATILE)
6106       semantics |= semantic_volatile;
6107    if (access & ACCESS_CAN_REORDER)
6108       semantics |= semantic_can_reorder | semantic_private;
6109
6110    return memory_sync_info(storage, semantics);
6111 }
6112
6113 Operand
6114 emit_tfe_init(Builder& bld, Temp dst)
6115 {
6116    Temp tmp = bld.tmp(dst.regClass());
6117
6118    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6119       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6120    for (unsigned i = 0; i < dst.size(); i++)
6121       vec->operands[i] = Operand::zero();
6122    vec->definitions[0] = Definition(tmp);
6123    /* Since this is fixed to an instruction's definition register, any CSE will
6124     * just create copies. Copying costs about the same as zero-initialization,
6125     * but these copies can break up clauses.
6126     */
6127    vec->definitions[0].setNoCSE(true);
6128    bld.insert(std::move(vec));
6129
6130    return Operand(tmp);
6131 }
6132
/* Instruction selection for image loads (regular, sparse and fragment mask loads).
 *
 * Buffer images (GLSL_SAMPLER_DIM_BUF) are loaded with a MUBUF buffer_load_format_*
 * instruction indexed by the first coordinate; all other dimensions go through
 * emit_mimg() with image_load or image_load_mip. Only the components selected by
 * dmask are fetched; expand_vector() then distributes them into dst.
 * Sparse loads enable TFE and carry one extra 32-bit residency component.
 */
void
visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);
   bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
   Temp dst = get_ssa_temp(ctx, &instr->def);

   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
   unsigned access = nir_intrinsic_access(instr);

   /* Number of data components, i.e. without the trailing residency code of sparse loads. */
   unsigned result_size = instr->def.num_components - is_sparse;
   /* Data components that are actually read by users of the result. */
   unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
   expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
   /* Buffer loads fetch a contiguous range: every component up to the last one read. */
   if (dim == GLSL_SAMPLER_DIM_BUF)
      expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
   unsigned dmask = expand_mask;
   if (instr->def.bit_size == 64) {
      expand_mask &= 0x9;
      /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
      dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
   }
   /* The residency code sits right after the data components. */
   if (is_sparse)
      expand_mask |= 1 << result_size;

   bool d16 = instr->def.bit_size == 16;
   assert(!d16 || !is_sparse);

   /* Bytes written by the load: one element per dmask bit, plus 4 for the residency code. */
   unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;

   /* Load straight into dst when it is a VGPR of exactly the right size;
    * otherwise load into a temporary and expand afterwards.
    */
   Temp tmp;
   if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
      tmp = dst;
   else
      tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));

   Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));

   if (dim == GLSL_SAMPLER_DIM_BUF) {
      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);

      /* The opcode is chosen by the number of fetched channels and the d16 flag. */
      aco_opcode opcode;
      if (!d16) {
         switch (util_bitcount(dmask)) {
         case 1: opcode = aco_opcode::buffer_load_format_x; break;
         case 2: opcode = aco_opcode::buffer_load_format_xy; break;
         case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
         case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
         default: unreachable(">4 channel buffer image load");
         }
      } else {
         switch (util_bitcount(dmask)) {
         case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
         case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
         case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
         case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
         default: unreachable(">4 channel buffer image load");
         }
      }
      /* Sparse loads take one extra operand: the zero-initialized TFE value. */
      aco_ptr<MUBUF_instruction> load{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
      load->operands[0] = Operand(resource);
      load->operands[1] = Operand(vindex);
      load->operands[2] = Operand::c32(0);
      load->definitions[0] = Definition(tmp);
      load->idxen = true;
      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
      load->dlc =
         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
      load->sync = sync;
      load->tfe = is_sparse;
      if (load->tfe)
         load->operands[3] = emit_tfe_init(bld, tmp);
      ctx->block->instructions.emplace_back(std::move(load));
   } else {
      std::vector<Temp> coords = get_image_coords(ctx, instr);

      /* Fragment mask loads always use image_load; other loads use image_load_mip
       * unless the LOD source (src[3]) is a constant zero.
       */
      aco_opcode opcode;
      if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
         opcode = aco_opcode::image_load;
      } else {
         bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
         opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
      }

      Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
      MIMG_instruction* load =
         emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, false, vdata);
      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
      load->dlc =
         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
      load->a16 = instr->src[1].ssa->bit_size == 16;
      load->d16 = d16;
      load->dmask = dmask;
      load->unrm = true;
      load->tfe = is_sparse;

      if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
         /* Fragment mask loads use a plain 2D / 2D-array dimension and default sync info. */
         load->dim = is_array ? ac_image_2darray : ac_image_2d;
         load->da = is_array;
         load->sync = memory_sync_info();
      } else {
         ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
         load->dim = sdim;
         load->da = should_declare_array(sdim);
         load->sync = sync;
      }
   }

   if (is_sparse && instr->def.bit_size == 64) {
      /* The result components are 64-bit but the sparse residency code is
       * 32-bit. So add a zero to the end so expand_vector() works correctly.
       */
      tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
                       Operand::zero());
   }

   expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
}
6253
6254 void
6255 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6256 {
6257    Builder bld(ctx->program, ctx->block);
6258    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6259    bool is_array = nir_intrinsic_image_array(instr);
6260    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6261    bool d16 = instr->src[3].ssa->bit_size == 16;
6262
6263    /* only R64_UINT and R64_SINT supported */
6264    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6265       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6266    data = as_vgpr(ctx, data);
6267
6268    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6269
6270    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6271    unsigned access = nir_intrinsic_access(instr);
6272    bool glc = ctx->options->gfx_level == GFX6 ||
6273               ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
6274
6275    if (dim == GLSL_SAMPLER_DIM_BUF) {
6276       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6277       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6278       aco_opcode opcode;
6279       if (!d16) {
6280          switch (num_components) {
6281          case 1: opcode = aco_opcode::buffer_store_format_x; break;
6282          case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6283          case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6284          case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6285          default: unreachable(">4 channel buffer image store");
6286          }
6287       } else {
6288          switch (num_components) {
6289          case 1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6290          case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6291          case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6292          case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6293          default: unreachable(">4 channel buffer image store");
6294          }
6295       }
6296       aco_ptr<MUBUF_instruction> store{
6297          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6298       store->operands[0] = Operand(rsrc);
6299       store->operands[1] = Operand(vindex);
6300       store->operands[2] = Operand::c32(0);
6301       store->operands[3] = Operand(data);
6302       store->idxen = true;
6303       store->glc = glc;
6304       store->dlc = false;
6305       store->disable_wqm = true;
6306       store->sync = sync;
6307       ctx->program->needs_exact = true;
6308       ctx->block->instructions.emplace_back(std::move(store));
6309       return;
6310    }
6311
6312    assert(data.type() == RegType::vgpr);
6313    std::vector<Temp> coords = get_image_coords(ctx, instr);
6314    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6315
6316    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6317    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6318
6319    uint32_t dmask = BITFIELD_MASK(num_components);
6320    /* remove zero/undef elements from data, components which aren't in dmask
6321     * are zeroed anyway
6322     */
6323    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6324       for (uint32_t i = 0; i < instr->num_components; i++) {
6325          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6326          if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
6327              nir_scalar_is_undef(comp))
6328             dmask &= ~BITFIELD_BIT(i);
6329       }
6330
6331       /* dmask cannot be 0, at least one vgpr is always read */
6332       if (dmask == 0)
6333          dmask = 1;
6334
6335       if (dmask != BITFIELD_MASK(num_components)) {
6336          uint32_t dmask_count = util_bitcount(dmask);
6337          RegClass rc = d16 ? v2b : v1;
6338          if (dmask_count == 1) {
6339             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6340          } else {
6341             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6342                aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6343             uint32_t index = 0;
6344             u_foreach_bit (bit, dmask) {
6345                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6346             }
6347             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6348             vec->definitions[0] = Definition(data);
6349             bld.insert(std::move(vec));
6350          }
6351       }
6352    }
6353
6354    MIMG_instruction* store =
6355       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, false, Operand(data));
6356    store->glc = glc;
6357    store->dlc = false;
6358    store->a16 = instr->src[1].ssa->bit_size == 16;
6359    store->d16 = d16;
6360    store->dmask = dmask;
6361    store->unrm = true;
6362    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6363    store->dim = sdim;
6364    store->da = should_declare_array(sdim);
6365    store->disable_wqm = true;
6366    store->sync = sync;
6367    ctx->program->needs_exact = true;
6368    return;
6369 }
6370
6371 void
6372 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6373                                  aco_opcode* image_op)
6374 {
6375    switch (op) {
6376    case nir_atomic_op_iadd:
6377       *buf_op = aco_opcode::buffer_atomic_add;
6378       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6379       *image_op = aco_opcode::image_atomic_add;
6380       break;
6381    case nir_atomic_op_umin:
6382       *buf_op = aco_opcode::buffer_atomic_umin;
6383       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6384       *image_op = aco_opcode::image_atomic_umin;
6385       break;
6386    case nir_atomic_op_imin:
6387       *buf_op = aco_opcode::buffer_atomic_smin;
6388       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6389       *image_op = aco_opcode::image_atomic_smin;
6390       break;
6391    case nir_atomic_op_umax:
6392       *buf_op = aco_opcode::buffer_atomic_umax;
6393       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6394       *image_op = aco_opcode::image_atomic_umax;
6395       break;
6396    case nir_atomic_op_imax:
6397       *buf_op = aco_opcode::buffer_atomic_smax;
6398       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6399       *image_op = aco_opcode::image_atomic_smax;
6400       break;
6401    case nir_atomic_op_iand:
6402       *buf_op = aco_opcode::buffer_atomic_and;
6403       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6404       *image_op = aco_opcode::image_atomic_and;
6405       break;
6406    case nir_atomic_op_ior:
6407       *buf_op = aco_opcode::buffer_atomic_or;
6408       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6409       *image_op = aco_opcode::image_atomic_or;
6410       break;
6411    case nir_atomic_op_ixor:
6412       *buf_op = aco_opcode::buffer_atomic_xor;
6413       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6414       *image_op = aco_opcode::image_atomic_xor;
6415       break;
6416    case nir_atomic_op_xchg:
6417       *buf_op = aco_opcode::buffer_atomic_swap;
6418       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6419       *image_op = aco_opcode::image_atomic_swap;
6420       break;
6421    case nir_atomic_op_cmpxchg:
6422       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6423       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6424       *image_op = aco_opcode::image_atomic_cmpswap;
6425       break;
6426    case nir_atomic_op_inc_wrap:
6427       *buf_op = aco_opcode::buffer_atomic_inc;
6428       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6429       *image_op = aco_opcode::image_atomic_inc;
6430       break;
6431    case nir_atomic_op_dec_wrap:
6432       *buf_op = aco_opcode::buffer_atomic_dec;
6433       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6434       *image_op = aco_opcode::image_atomic_dec;
6435       break;
6436    case nir_atomic_op_fadd:
6437       *buf_op = aco_opcode::buffer_atomic_add_f32;
6438       *buf_op64 = aco_opcode::num_opcodes;
6439       *image_op = aco_opcode::num_opcodes;
6440       break;
6441    case nir_atomic_op_fmin:
6442       *buf_op = aco_opcode::buffer_atomic_fmin;
6443       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6444       *image_op = aco_opcode::image_atomic_fmin;
6445       break;
6446    case nir_atomic_op_fmax:
6447       *buf_op = aco_opcode::buffer_atomic_fmax;
6448       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6449       *image_op = aco_opcode::image_atomic_fmax;
6450       break;
6451    default: unreachable("unsupported atomic operation");
6452    }
6453 }
6454
6455 void
6456 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6457 {
6458    bool return_previous = !nir_def_is_unused(&instr->def);
6459    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6460    bool is_array = nir_intrinsic_image_array(instr);
6461    Builder bld(ctx->program, ctx->block);
6462
6463    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6464    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6465
6466    aco_opcode buf_op, buf_op64, image_op;
6467    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6468
6469    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6470    bool is_64bit = data.bytes() == 8;
6471    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6472
6473    if (cmpswap)
6474       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6475                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6476
6477    Temp dst = get_ssa_temp(ctx, &instr->def);
6478    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6479
6480    if (dim == GLSL_SAMPLER_DIM_BUF) {
6481       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6482       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6483       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6484       // implemented.");
6485       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6486          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6487       mubuf->operands[0] = Operand(resource);
6488       mubuf->operands[1] = Operand(vindex);
6489       mubuf->operands[2] = Operand::c32(0);
6490       mubuf->operands[3] = Operand(data);
6491       Definition def =
6492          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6493       if (return_previous)
6494          mubuf->definitions[0] = def;
6495       mubuf->offset = 0;
6496       mubuf->idxen = true;
6497       mubuf->glc = return_previous;
6498       mubuf->dlc = false; /* Not needed for atomics */
6499       mubuf->disable_wqm = true;
6500       mubuf->sync = sync;
6501       ctx->program->needs_exact = true;
6502       ctx->block->instructions.emplace_back(std::move(mubuf));
6503       if (return_previous && cmpswap)
6504          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6505       return;
6506    }
6507
6508    std::vector<Temp> coords = get_image_coords(ctx, instr);
6509    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6510    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6511    MIMG_instruction* mimg =
6512       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, false, Operand(data));
6513    mimg->glc = return_previous;
6514    mimg->dlc = false; /* Not needed for atomics */
6515    mimg->dmask = (1 << data.size()) - 1;
6516    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6517    mimg->unrm = true;
6518    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6519    mimg->dim = sdim;
6520    mimg->da = should_declare_array(sdim);
6521    mimg->disable_wqm = true;
6522    mimg->sync = sync;
6523    ctx->program->needs_exact = true;
6524    if (return_previous && cmpswap)
6525       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6526    return;
6527 }
6528
6529 void
6530 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6531 {
6532    Builder bld(ctx->program, ctx->block);
6533    unsigned num_components = instr->num_components;
6534
6535    Temp dst = get_ssa_temp(ctx, &instr->def);
6536    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6537
6538    unsigned access = nir_intrinsic_access(instr);
6539    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6540    unsigned size = instr->def.bit_size / 8;
6541
6542    bool allow_smem = access & ACCESS_CAN_REORDER;
6543
6544    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6545                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6546                get_memory_sync_info(instr, storage_buffer, 0));
6547 }
6548
6549 void
6550 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6551 {
6552    Builder bld(ctx->program, ctx->block);
6553    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6554    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6555    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6556    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6557
6558    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6559
6560    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6561    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6562               ctx->program->gfx_level < GFX11;
6563
6564    unsigned write_count = 0;
6565    Temp write_datas[32];
6566    unsigned offsets[32];
6567    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6568                       write_datas, offsets);
6569
6570    /* GFX6-7 are affected by a hw bug that prevents address clamping to work
6571     * correctly when the SGPR offset is used.
6572     */
6573    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6574       offset = as_vgpr(ctx, offset);
6575
6576    for (unsigned i = 0; i < write_count; i++) {
6577       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6578
6579       aco_ptr<MUBUF_instruction> store{
6580          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6581       store->operands[0] = Operand(rsrc);
6582       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6583       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6584       store->operands[3] = Operand(write_datas[i]);
6585       store->offset = offsets[i];
6586       store->offen = (offset.type() == RegType::vgpr);
6587       store->glc = glc;
6588       store->dlc = false;
6589       store->disable_wqm = true;
6590       store->sync = sync;
6591       ctx->program->needs_exact = true;
6592       ctx->block->instructions.emplace_back(std::move(store));
6593    }
6594 }
6595
6596 void
6597 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6598 {
6599    Builder bld(ctx->program, ctx->block);
6600    bool return_previous = !nir_def_is_unused(&instr->def);
6601    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6602
6603    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6604    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6605
6606    aco_opcode op32, op64, image_op;
6607    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6608
6609    if (cmpswap)
6610       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6611                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6612
6613    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6614    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6615    Temp dst = get_ssa_temp(ctx, &instr->def);
6616
6617    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6618    aco_ptr<MUBUF_instruction> mubuf{
6619       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6620    mubuf->operands[0] = Operand(rsrc);
6621    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6622    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6623    mubuf->operands[3] = Operand(data);
6624    Definition def =
6625       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6626    if (return_previous)
6627       mubuf->definitions[0] = def;
6628    mubuf->offset = 0;
6629    mubuf->offen = (offset.type() == RegType::vgpr);
6630    mubuf->glc = return_previous;
6631    mubuf->dlc = false; /* Not needed for atomics */
6632    mubuf->disable_wqm = true;
6633    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6634    ctx->program->needs_exact = true;
6635    ctx->block->instructions.emplace_back(std::move(mubuf));
6636    if (return_previous && cmpswap)
6637       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6638 }
6639
6640 void
6641 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6642              Temp* offset)
6643 {
6644    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6645    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6646
6647    *const_offset = nir_intrinsic_base(intrin);
6648
6649    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6650    nir_src offset_src = intrin->src[num_src - 1];
6651    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6652       *offset = get_ssa_temp(ctx, offset_src.ssa);
6653    else
6654       *offset = Temp();
6655 }
6656
6657 void
6658 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6659 {
6660    Builder bld(ctx->program, ctx->block);
6661    unsigned num_components = instr->num_components;
6662    unsigned component_size = instr->def.bit_size / 8;
6663
6664    Temp addr, offset;
6665    uint32_t const_offset;
6666    parse_global(ctx, instr, &addr, &const_offset, &offset);
6667
6668    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6669                         component_size};
6670    if (offset.id()) {
6671       info.resource = addr;
6672       info.offset = Operand(offset);
6673    }
6674    info.const_offset = const_offset;
6675    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6676    info.align_mul = nir_intrinsic_align_mul(instr);
6677    info.align_offset = nir_intrinsic_align_offset(instr);
6678    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6679
6680    /* Don't expand global loads when they use MUBUF or SMEM.
6681     * Global loads don't have the bounds checking that buffer loads have that
6682     * makes this safe.
6683     */
6684    unsigned align = nir_intrinsic_align(instr);
6685    bool byte_align_for_smem_mubuf =
6686       can_use_byte_align_for_global_load(num_components, component_size, align, false);
6687
6688    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6689     * it's safe to use SMEM */
6690    bool can_use_smem =
6691       (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6692    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6693        !can_use_smem) {
6694       EmitLoadParameters params = global_load_params;
6695       params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6696       emit_load(ctx, bld, info, params);
6697    } else {
6698       if (info.resource.id())
6699          info.resource = bld.as_uniform(info.resource);
6700       info.offset = Operand(bld.as_uniform(info.offset));
6701       emit_load(ctx, bld, info, smem_load_params);
6702    }
6703 }
6704
6705 void
6706 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6707 {
6708    Builder bld(ctx->program, ctx->block);
6709    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6710    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6711
6712    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6713    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6714    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6715               ctx->program->gfx_level < GFX11;
6716
6717    unsigned write_count = 0;
6718    Temp write_datas[32];
6719    unsigned offsets[32];
6720    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6721                       write_datas, offsets);
6722
6723    Temp addr, offset;
6724    uint32_t const_offset;
6725    parse_global(ctx, instr, &addr, &const_offset, &offset);
6726
6727    for (unsigned i = 0; i < write_count; i++) {
6728       Temp write_address = addr;
6729       uint32_t write_const_offset = const_offset;
6730       Temp write_offset = offset;
6731       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6732
6733       if (ctx->options->gfx_level >= GFX7) {
6734          bool global = ctx->options->gfx_level >= GFX9;
6735          aco_opcode op;
6736          switch (write_datas[i].bytes()) {
6737          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6738          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6739          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6740          case 8:
6741             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6742             break;
6743          case 12:
6744             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6745             break;
6746          case 16:
6747             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6748             break;
6749          default: unreachable("store_global not implemented for this size.");
6750          }
6751
6752          aco_ptr<FLAT_instruction> flat{
6753             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6754          if (write_address.regClass() == s2) {
6755             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6756             flat->operands[0] = Operand(write_offset);
6757             flat->operands[1] = Operand(write_address);
6758          } else {
6759             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6760             flat->operands[0] = Operand(write_address);
6761             flat->operands[1] = Operand(s1);
6762          }
6763          flat->operands[2] = Operand(write_datas[i]);
6764          flat->glc = glc;
6765          flat->dlc = false;
6766          assert(global || !write_const_offset);
6767          flat->offset = write_const_offset;
6768          flat->disable_wqm = true;
6769          flat->sync = sync;
6770          ctx->program->needs_exact = true;
6771          ctx->block->instructions.emplace_back(std::move(flat));
6772       } else {
6773          assert(ctx->options->gfx_level == GFX6);
6774
6775          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6776
6777          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6778
6779          aco_ptr<MUBUF_instruction> mubuf{
6780             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6781          mubuf->operands[0] = Operand(rsrc);
6782          mubuf->operands[1] =
6783             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6784          mubuf->operands[2] = Operand(write_offset);
6785          mubuf->operands[3] = Operand(write_datas[i]);
6786          mubuf->glc = glc;
6787          mubuf->dlc = false;
6788          mubuf->offset = write_const_offset;
6789          mubuf->addr64 = write_address.type() == RegType::vgpr;
6790          mubuf->disable_wqm = true;
6791          mubuf->sync = sync;
6792          ctx->program->needs_exact = true;
6793          ctx->block->instructions.emplace_back(std::move(mubuf));
6794       }
6795    }
6796 }
6797
6798 void
6799 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6800 {
6801    Builder bld(ctx->program, ctx->block);
6802    bool return_previous = !nir_def_is_unused(&instr->def);
6803    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6804
6805    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6806    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6807
6808    if (cmpswap)
6809       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6810                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6811
6812    Temp dst = get_ssa_temp(ctx, &instr->def);
6813
6814    aco_opcode op32, op64;
6815
6816    Temp addr, offset;
6817    uint32_t const_offset;
6818    parse_global(ctx, instr, &addr, &const_offset, &offset);
6819    lower_global_address(bld, 0, &addr, &const_offset, &offset);
6820
6821    if (ctx->options->gfx_level >= GFX7) {
6822       bool global = ctx->options->gfx_level >= GFX9;
6823       switch (nir_op) {
6824       case nir_atomic_op_iadd:
6825          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6826          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6827          break;
6828       case nir_atomic_op_imin:
6829          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6830          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6831          break;
6832       case nir_atomic_op_umin:
6833          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6834          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6835          break;
6836       case nir_atomic_op_imax:
6837          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6838          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6839          break;
6840       case nir_atomic_op_umax:
6841          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6842          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6843          break;
6844       case nir_atomic_op_iand:
6845          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6846          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6847          break;
6848       case nir_atomic_op_ior:
6849          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6850          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6851          break;
6852       case nir_atomic_op_ixor:
6853          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6854          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6855          break;
6856       case nir_atomic_op_xchg:
6857          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6858          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6859          break;
6860       case nir_atomic_op_cmpxchg:
6861          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6862          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6863          break;
6864       case nir_atomic_op_fadd:
6865          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
6866          op64 = aco_opcode::num_opcodes;
6867          break;
6868       case nir_atomic_op_fmin:
6869          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6870          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6871          break;
6872       case nir_atomic_op_fmax:
6873          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6874          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6875          break;
6876       default: unreachable("unsupported atomic operation");
6877       }
6878
6879       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6880       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6881          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6882       if (addr.regClass() == s2) {
6883          assert(global && offset.id() && offset.type() == RegType::vgpr);
6884          flat->operands[0] = Operand(offset);
6885          flat->operands[1] = Operand(addr);
6886       } else {
6887          assert(addr.type() == RegType::vgpr && !offset.id());
6888          flat->operands[0] = Operand(addr);
6889          flat->operands[1] = Operand(s1);
6890       }
6891       flat->operands[2] = Operand(data);
6892       if (return_previous)
6893          flat->definitions[0] = Definition(dst);
6894       flat->glc = return_previous;
6895       flat->dlc = false; /* Not needed for atomics */
6896       assert(global || !const_offset);
6897       flat->offset = const_offset;
6898       flat->disable_wqm = true;
6899       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6900       ctx->program->needs_exact = true;
6901       ctx->block->instructions.emplace_back(std::move(flat));
6902    } else {
6903       assert(ctx->options->gfx_level == GFX6);
6904
6905       UNUSED aco_opcode image_op;
6906       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6907
6908       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6909
6910       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6911
6912       aco_ptr<MUBUF_instruction> mubuf{
6913          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6914       mubuf->operands[0] = Operand(rsrc);
6915       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6916       mubuf->operands[2] = Operand(offset);
6917       mubuf->operands[3] = Operand(data);
6918       Definition def =
6919          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6920       if (return_previous)
6921          mubuf->definitions[0] = def;
6922       mubuf->glc = return_previous;
6923       mubuf->dlc = false;
6924       mubuf->offset = const_offset;
6925       mubuf->addr64 = addr.type() == RegType::vgpr;
6926       mubuf->disable_wqm = true;
6927       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6928       ctx->program->needs_exact = true;
6929       ctx->block->instructions.emplace_back(std::move(mubuf));
6930       if (return_previous && cmpswap)
6931          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6932    }
6933 }
6934
6935 unsigned
6936 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
6937 {
6938    unsigned storage = storage_none;
6939
6940    if (mem_mode & nir_var_shader_out)
6941       storage |= storage_vmem_output;
6942    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
6943       storage |= storage_buffer;
6944    if (mem_mode & nir_var_mem_task_payload)
6945       storage |= storage_task_payload;
6946    if (mem_mode & nir_var_mem_shared)
6947       storage |= storage_shared;
6948    if (mem_mode & nir_var_image)
6949       storage |= storage_image;
6950
6951    return storage;
6952 }
6953
6954 void
6955 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6956 {
6957    Builder bld(ctx->program, ctx->block);
6958
6959    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6960    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6961    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6962                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
6963    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
6964    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
6965
6966    Temp dst = get_ssa_temp(ctx, &intrin->def);
6967    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
6968    Temp v_offset =
6969       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
6970    Temp s_offset =
6971       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
6972    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
6973
6974    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
6975    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
6976
6977    unsigned const_offset = nir_intrinsic_base(intrin);
6978    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
6979    unsigned num_components = intrin->def.num_components;
6980
6981    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6982    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
6983
6984    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
6985    info.idx = idx;
6986    info.glc = glc;
6987    info.slc = slc;
6988    info.soffset = s_offset;
6989    info.const_offset = const_offset;
6990    info.sync = sync;
6991
6992    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
6993       const pipe_format format = nir_intrinsic_format(intrin);
6994       const struct ac_vtx_format_info* vtx_info =
6995          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
6996       const struct util_format_description* f = util_format_description(format);
6997       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
6998       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
6999
7000       /* Avoid splitting:
7001        * - non-array formats because that would result in incorrect code
7002        * - when element size is same as component size (to reduce instruction count)
7003        */
7004       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7005
7006       info.align_mul = align_mul;
7007       info.align_offset = align_offset;
7008       info.format = format;
7009       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7010       info.split_by_component_stride = false;
7011
7012       emit_load(ctx, bld, info, mtbuf_load_params);
7013    } else {
7014       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7015
7016       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7017          assert(!swizzled);
7018
7019          emit_load(ctx, bld, info, mubuf_load_format_params);
7020       } else {
7021          const unsigned swizzle_element_size =
7022             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7023
7024          info.component_stride = swizzle_element_size;
7025          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7026          info.align_mul = MIN2(elem_size_bytes, 4);
7027          info.align_offset = 0;
7028
7029          emit_load(ctx, bld, info, mubuf_load_params);
7030       }
7031    }
7032 }
7033
7034 void
7035 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7036 {
7037    Builder bld(ctx->program, ctx->block);
7038
7039    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7040    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7041    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7042                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7043    bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7044    bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
7045
7046    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7047    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7048    Temp v_offset =
7049       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
7050    Temp s_offset =
7051       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7052    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7053
7054    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7055    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7056
7057    unsigned const_offset = nir_intrinsic_base(intrin);
7058    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7059    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7060
7061    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7062    /* GS outputs are only written once. */
7063    const bool written_once =
7064       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7065    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7066                          written_once ? semantic_can_reorder : semantic_none);
7067
7068    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset,
7069                     elem_size_bytes, write_mask, swizzled, sync, glc, slc);
7070 }
7071
7072 void
7073 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7074 {
7075    Builder bld(ctx->program, ctx->block);
7076    Temp dst = get_ssa_temp(ctx, &instr->def);
7077    Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7078    Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7079
7080    /* If base address is 32bit, convert to 64bit with the high 32bit part. */
7081    if (base.bytes() == 4) {
7082       base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7083                         Operand::c32(ctx->options->address32_hi));
7084    }
7085
7086    aco_opcode opcode = aco_opcode::s_load_dword;
7087    unsigned size = 1;
7088
7089    assert(dst.bytes() <= 64);
7090
7091    if (dst.bytes() > 32) {
7092       opcode = aco_opcode::s_load_dwordx16;
7093       size = 16;
7094    } else if (dst.bytes() > 16) {
7095       opcode = aco_opcode::s_load_dwordx8;
7096       size = 8;
7097    } else if (dst.bytes() > 8) {
7098       opcode = aco_opcode::s_load_dwordx4;
7099       size = 4;
7100    } else if (dst.bytes() > 4) {
7101       opcode = aco_opcode::s_load_dwordx2;
7102       size = 2;
7103    }
7104
7105    if (dst.size() != size) {
7106       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7107                  bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7108    } else {
7109       bld.smem(opcode, Definition(dst), base, offset);
7110    }
7111    emit_split_vector(ctx, dst, instr->def.num_components);
7112 }
7113
7114 sync_scope
7115 translate_nir_scope(mesa_scope scope)
7116 {
7117    switch (scope) {
7118    case SCOPE_NONE:
7119    case SCOPE_INVOCATION: return scope_invocation;
7120    case SCOPE_SUBGROUP: return scope_subgroup;
7121    case SCOPE_WORKGROUP: return scope_workgroup;
7122    case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7123    case SCOPE_DEVICE: return scope_device;
7124    case SCOPE_SHADER_CALL: return scope_invocation;
7125    }
7126    unreachable("invalid scope");
7127 }
7128
7129 void
7130 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7131 {
7132    Builder bld(ctx->program, ctx->block);
7133
7134    unsigned storage_allowed = storage_buffer | storage_image;
7135    unsigned semantics = 0;
7136    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7137    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7138
7139    /* We use shared storage for the following:
7140     * - compute shaders expose it in their API
7141     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7142     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7143     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7144     */
7145    bool shared_storage_used =
7146       ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7147       ctx->stage.hw == AC_HW_HULL_SHADER ||
7148       (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7149       ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7150
7151    if (shared_storage_used)
7152       storage_allowed |= storage_shared;
7153
7154    /* Task payload: Task Shader output, Mesh Shader input */
7155    if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7156       storage_allowed |= storage_task_payload;
7157
7158    /* Allow VMEM output for all stages that can have outputs. */
7159    if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7160        ctx->stage.has(SWStage::TS))
7161       storage_allowed |= storage_vmem_output;
7162
7163    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7164     * They are allowed in CS, TCS, and in any NGG shader.
7165     */
7166    ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7167                                            ctx->stage.hw == AC_HW_HULL_SHADER ||
7168                                            ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7169
7170    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7171    unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7172    storage &= storage_allowed;
7173
7174    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7175    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7176       semantics |= semantic_acquire | semantic_release;
7177    if (nir_semantics & NIR_MEMORY_RELEASE)
7178       semantics |= semantic_acquire | semantic_release;
7179
7180    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7181    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7182
7183    bld.barrier(aco_opcode::p_barrier,
7184                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7185                exec_scope);
7186 }
7187
7188 void
7189 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7190 {
7191    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7192    Temp dst = get_ssa_temp(ctx, &instr->def);
7193    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7194    Builder bld(ctx->program, ctx->block);
7195
7196    unsigned elem_size_bytes = instr->def.bit_size / 8;
7197    unsigned num_components = instr->def.num_components;
7198    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7199    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7200 }
7201
7202 void
7203 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7204 {
7205    unsigned writemask = nir_intrinsic_write_mask(instr);
7206    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7207    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7208    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7209
7210    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7211    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7212 }
7213
7214 void
7215 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7216 {
7217    unsigned offset = nir_intrinsic_base(instr);
7218    Builder bld(ctx->program, ctx->block);
7219    Operand m = load_lds_size_m0(bld);
7220    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7221    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7222
7223    unsigned num_operands = 3;
7224    aco_opcode op32, op64, op32_rtn, op64_rtn;
7225    switch (nir_intrinsic_atomic_op(instr)) {
7226    case nir_atomic_op_iadd:
7227       op32 = aco_opcode::ds_add_u32;
7228       op64 = aco_opcode::ds_add_u64;
7229       op32_rtn = aco_opcode::ds_add_rtn_u32;
7230       op64_rtn = aco_opcode::ds_add_rtn_u64;
7231       break;
7232    case nir_atomic_op_imin:
7233       op32 = aco_opcode::ds_min_i32;
7234       op64 = aco_opcode::ds_min_i64;
7235       op32_rtn = aco_opcode::ds_min_rtn_i32;
7236       op64_rtn = aco_opcode::ds_min_rtn_i64;
7237       break;
7238    case nir_atomic_op_umin:
7239       op32 = aco_opcode::ds_min_u32;
7240       op64 = aco_opcode::ds_min_u64;
7241       op32_rtn = aco_opcode::ds_min_rtn_u32;
7242       op64_rtn = aco_opcode::ds_min_rtn_u64;
7243       break;
7244    case nir_atomic_op_imax:
7245       op32 = aco_opcode::ds_max_i32;
7246       op64 = aco_opcode::ds_max_i64;
7247       op32_rtn = aco_opcode::ds_max_rtn_i32;
7248       op64_rtn = aco_opcode::ds_max_rtn_i64;
7249       break;
7250    case nir_atomic_op_umax:
7251       op32 = aco_opcode::ds_max_u32;
7252       op64 = aco_opcode::ds_max_u64;
7253       op32_rtn = aco_opcode::ds_max_rtn_u32;
7254       op64_rtn = aco_opcode::ds_max_rtn_u64;
7255       break;
7256    case nir_atomic_op_iand:
7257       op32 = aco_opcode::ds_and_b32;
7258       op64 = aco_opcode::ds_and_b64;
7259       op32_rtn = aco_opcode::ds_and_rtn_b32;
7260       op64_rtn = aco_opcode::ds_and_rtn_b64;
7261       break;
7262    case nir_atomic_op_ior:
7263       op32 = aco_opcode::ds_or_b32;
7264       op64 = aco_opcode::ds_or_b64;
7265       op32_rtn = aco_opcode::ds_or_rtn_b32;
7266       op64_rtn = aco_opcode::ds_or_rtn_b64;
7267       break;
7268    case nir_atomic_op_ixor:
7269       op32 = aco_opcode::ds_xor_b32;
7270       op64 = aco_opcode::ds_xor_b64;
7271       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7272       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7273       break;
7274    case nir_atomic_op_xchg:
7275       op32 = aco_opcode::ds_write_b32;
7276       op64 = aco_opcode::ds_write_b64;
7277       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7278       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7279       break;
7280    case nir_atomic_op_cmpxchg:
7281       op32 = aco_opcode::ds_cmpst_b32;
7282       op64 = aco_opcode::ds_cmpst_b64;
7283       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7284       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7285       num_operands = 4;
7286       break;
7287    case nir_atomic_op_fadd:
7288       op32 = aco_opcode::ds_add_f32;
7289       op32_rtn = aco_opcode::ds_add_rtn_f32;
7290       op64 = aco_opcode::num_opcodes;
7291       op64_rtn = aco_opcode::num_opcodes;
7292       break;
7293    case nir_atomic_op_fmin:
7294       op32 = aco_opcode::ds_min_f32;
7295       op32_rtn = aco_opcode::ds_min_rtn_f32;
7296       op64 = aco_opcode::ds_min_f64;
7297       op64_rtn = aco_opcode::ds_min_rtn_f64;
7298       break;
7299    case nir_atomic_op_fmax:
7300       op32 = aco_opcode::ds_max_f32;
7301       op32_rtn = aco_opcode::ds_max_rtn_f32;
7302       op64 = aco_opcode::ds_max_f64;
7303       op64_rtn = aco_opcode::ds_max_rtn_f64;
7304       break;
7305    default: unreachable("Unhandled shared atomic intrinsic");
7306    }
7307
7308    bool return_previous = !nir_def_is_unused(&instr->def);
7309
7310    aco_opcode op;
7311    if (data.size() == 1) {
7312       assert(instr->def.bit_size == 32);
7313       op = return_previous ? op32_rtn : op32;
7314    } else {
7315       assert(instr->def.bit_size == 64);
7316       op = return_previous ? op64_rtn : op64;
7317    }
7318
7319    if (offset > 65535) {
7320       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7321       offset = 0;
7322    }
7323
7324    aco_ptr<DS_instruction> ds;
7325    ds.reset(
7326       create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7327    ds->operands[0] = Operand(address);
7328    ds->operands[1] = Operand(data);
7329    if (num_operands == 4) {
7330       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7331       ds->operands[2] = Operand(data2);
7332       if (bld.program->gfx_level >= GFX11)
7333          std::swap(ds->operands[1], ds->operands[2]);
7334    }
7335    ds->operands[num_operands - 1] = m;
7336    ds->offset0 = offset;
7337    if (return_previous)
7338       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7339    ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7340
7341    if (m.isUndefined())
7342       ds->operands.pop_back();
7343
7344    ctx->block->instructions.emplace_back(std::move(ds));
7345 }
7346
7347 void
7348 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7349 {
7350    bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7351    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7352    Builder bld(ctx->program, ctx->block);
7353
7354    assert(bld.program->gfx_level >= GFX7);
7355
7356    bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7357    uint8_t offset0 = nir_intrinsic_offset0(instr);
7358    uint8_t offset1 = nir_intrinsic_offset1(instr);
7359    bool st64 = nir_intrinsic_st64(instr);
7360
7361    Operand m = load_lds_size_m0(bld);
7362    Instruction* ds;
7363    if (is_store) {
7364       aco_opcode op = st64
7365                          ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7366                          : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7367       Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7368       RegClass comp_rc = is64bit ? v2 : v1;
7369       Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7370       Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7371       ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7372    } else {
7373       Temp dst = get_ssa_temp(ctx, &instr->def);
7374       Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7375       aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7376                            : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7377       ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7378    }
7379    ds->ds().sync = memory_sync_info(storage_shared);
7380    if (m.isUndefined())
7381       ds->operands.pop_back();
7382
7383    if (!is_store) {
7384       Temp dst = get_ssa_temp(ctx, &instr->def);
7385       if (dst.type() == RegType::sgpr) {
7386          emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7387          Temp comp[4];
7388          /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7389          for (unsigned i = 0; i < dst.size(); i++)
7390             comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7391          if (is64bit) {
7392             Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7393             Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7394             ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7395             ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7396             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7397             ctx->allocated_vec[dst.id()] = {comp0, comp1};
7398          } else {
7399             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7400          }
7401       }
7402
7403       emit_split_vector(ctx, dst, 2);
7404    }
7405 }
7406
7407 Temp
7408 get_scratch_resource(isel_context* ctx)
7409 {
7410    Builder bld(ctx->program, ctx->block);
7411    Temp scratch_addr = ctx->program->private_segment_buffer;
7412    if (!scratch_addr.bytes()) {
7413       Temp addr_lo =
7414          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7415       Temp addr_hi =
7416          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7417       scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7418    } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7419       scratch_addr =
7420          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7421    }
7422
7423    uint32_t rsrc_conf =
7424       S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7425
7426    if (ctx->program->gfx_level >= GFX10) {
7427       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7428                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
7429                    S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
7430    } else if (ctx->program->gfx_level <=
7431               GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7432       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7433                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7434    }
7435
7436    /* older generations need element size = 4 bytes. element size removed in GFX9 */
7437    if (ctx->program->gfx_level <= GFX8)
7438       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7439
7440    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7441                      Operand::c32(rsrc_conf));
7442 }
7443
7444 void
7445 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7446 {
7447    Builder bld(ctx->program, ctx->block);
7448    Temp dst = get_ssa_temp(ctx, &instr->def);
7449
7450    LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7451    info.align_mul = nir_intrinsic_align_mul(instr);
7452    info.align_offset = nir_intrinsic_align_offset(instr);
7453    info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7454    info.sync = memory_sync_info(storage_scratch, semantic_private);
7455    if (ctx->program->gfx_level >= GFX9) {
7456       if (nir_src_is_const(instr->src[0])) {
7457          uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7458          info.offset =
7459             bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7460          info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7461       } else {
7462          info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7463       }
7464       EmitLoadParameters params = scratch_flat_load_params;
7465       params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7466       emit_load(ctx, bld, info, params);
7467    } else {
7468       info.resource = get_scratch_resource(ctx);
7469       info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7470       info.soffset = ctx->program->scratch_offset;
7471       emit_load(ctx, bld, info, scratch_mubuf_load_params);
7472    }
7473 }
7474
7475 void
7476 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7477 {
7478    Builder bld(ctx->program, ctx->block);
7479    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7480    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7481
7482    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7483    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7484
7485    unsigned write_count = 0;
7486    Temp write_datas[32];
7487    unsigned offsets[32];
7488    unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7489    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7490                       &write_count, write_datas, offsets);
7491
7492    if (ctx->program->gfx_level >= GFX9) {
7493       uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7494       offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7495       uint32_t base_const_offset =
7496          nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7497
7498       for (unsigned i = 0; i < write_count; i++) {
7499          aco_opcode op;
7500          switch (write_datas[i].bytes()) {
7501          case 1: op = aco_opcode::scratch_store_byte; break;
7502          case 2: op = aco_opcode::scratch_store_short; break;
7503          case 4: op = aco_opcode::scratch_store_dword; break;
7504          case 8: op = aco_opcode::scratch_store_dwordx2; break;
7505          case 12: op = aco_opcode::scratch_store_dwordx3; break;
7506          case 16: op = aco_opcode::scratch_store_dwordx4; break;
7507          default: unreachable("Unexpected store size");
7508          }
7509
7510          uint32_t const_offset = base_const_offset + offsets[i];
7511          assert(const_offset < max || offset.id() == 0);
7512
7513          Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7514          Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7515          if (offset.id() == 0)
7516             saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7517
7518          bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7519                      memory_sync_info(storage_scratch, semantic_private));
7520       }
7521    } else {
7522       Temp rsrc = get_scratch_resource(ctx);
7523       offset = as_vgpr(ctx, offset);
7524       for (unsigned i = 0; i < write_count; i++) {
7525          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7526          Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7527                                         write_datas[i], offsets[i], true, true);
7528          mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7529       }
7530    }
7531 }
7532
7533 Temp
7534 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7535 {
7536    Builder bld(ctx->program, ctx->block);
7537
7538    if (cluster_size == 1) {
7539       return src;
7540    }
7541    if (op == nir_op_iand && cluster_size == 4) {
7542       /* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
7543       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7544       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
7545       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7546                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7547    } else if (op == nir_op_ior && cluster_size == 4) {
7548       /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7549       return bld.sop1(
7550          Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7551          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7552    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7553       /* subgroupAnd(val) -> (~val & exec) == 0 */
7554       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7555       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
7556                .def(1)
7557                .getTemp();
7558       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7559       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7560    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7561       /* subgroupOr(val) -> (val & exec) != 0 */
7562       Temp tmp =
7563          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7564             .def(1)
7565             .getTemp();
7566       return bool_to_vector_condition(ctx, tmp);
7567    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7568       /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7569       Temp tmp =
7570          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7571       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7572       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7573                .def(1)
7574                .getTemp();
7575       return bool_to_vector_condition(ctx, tmp);
7576    } else {
7577       /* subgroupClustered{And,Or,Xor}(val, n):
7578        *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7579        *   cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)
7580        * subgroupClusteredAnd():
7581        *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7582        * subgroupClusteredOr():
7583        *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7584        * subgroupClusteredXor():
7585        *   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7586        */
7587       Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7588       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7589                                      Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7590
7591       Temp tmp;
7592       if (op == nir_op_iand)
7593          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7594                         Operand(exec, bld.lm));
7595       else
7596          tmp =
7597             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7598
7599       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7600
7601       if (ctx->program->gfx_level <= GFX7)
7602          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7603       else if (ctx->program->wave_size == 64)
7604          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7605       else
7606          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7607       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7608       if (cluster_mask != 0xffffffff)
7609          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7610
7611       if (op == nir_op_iand) {
7612          return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::c32(cluster_mask),
7613                          tmp);
7614       } else if (op == nir_op_ior) {
7615          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
7616       } else if (op == nir_op_ixor) {
7617          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7618                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7619          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
7620       }
7621       assert(false);
7622       return Temp();
7623    }
7624 }
7625
7626 Temp
7627 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7628 {
7629    Builder bld(ctx->program, ctx->block);
7630    assert(src.regClass() == bld.lm);
7631
7632    /* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0
7633     * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7634     * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7635     */
7636    if (op == nir_op_iand)
7637       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7638
7639    Temp tmp =
7640       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7641
7642    Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7643
7644    if (op == nir_op_iand)
7645       return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
7646    else if (op == nir_op_ior)
7647       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
7648    else if (op == nir_op_ixor)
7649       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(),
7650                       bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7651
7652    assert(false);
7653    return Temp();
7654 }
7655
7656 Temp
7657 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7658 {
7659    Builder bld(ctx->program, ctx->block);
7660
7661    /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7662     * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7663     * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7664     */
7665    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7666    if (op == nir_op_iand)
7667       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7668    else if (op == nir_op_ior)
7669       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7670    else if (op == nir_op_ixor)
7671       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7672
7673    assert(false);
7674    return Temp();
7675 }
7676
7677 ReduceOp
7678 get_reduce_op(nir_op op, unsigned bit_size)
7679 {
7680    switch (op) {
7681 #define CASEI(name)                                                                                \
7682    case nir_op_##name:                                                                             \
7683       return (bit_size == 32)   ? name##32                                                         \
7684              : (bit_size == 16) ? name##16                                                         \
7685              : (bit_size == 8)  ? name##8                                                          \
7686                                 : name##64;
7687 #define CASEF(name)                                                                                \
7688    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7689       CASEI(iadd)
7690       CASEI(imul)
7691       CASEI(imin)
7692       CASEI(umin)
7693       CASEI(imax)
7694       CASEI(umax)
7695       CASEI(iand)
7696       CASEI(ior)
7697       CASEI(ixor)
7698       CASEF(fadd)
7699       CASEF(fmul)
7700       CASEF(fmin)
7701       CASEF(fmax)
7702    default: unreachable("unknown reduction op");
7703 #undef CASEI
7704 #undef CASEF
7705    }
7706 }
7707
7708 void
7709 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7710 {
7711    Builder bld(ctx->program, ctx->block);
7712    Definition dst(get_ssa_temp(ctx, &instr->def));
7713    assert(dst.regClass().type() != RegType::vgpr);
7714    if (src.regClass().type() == RegType::vgpr)
7715       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7716    else
7717       bld.copy(dst, src);
7718 }
7719
7720 void
7721 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7722 {
7723    Builder bld(ctx->program, ctx->block);
7724    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7725
7726    if (op == nir_op_fadd) {
7727       src_tmp = as_vgpr(ctx, src_tmp);
7728       Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7729                                       : dst.getTemp();
7730
7731       if (src.ssa->bit_size == 16) {
7732          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7733          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7734       } else {
7735          assert(src.ssa->bit_size == 32);
7736          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7737          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7738       }
7739
7740       if (tmp != dst.getTemp())
7741          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7742
7743       return;
7744    }
7745
7746    if (dst.regClass() == s1)
7747       src_tmp = bld.as_uniform(src_tmp);
7748
7749    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7750       count =
7751          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7752    else if (op == nir_op_ixor)
7753       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7754
7755    assert(dst.getTemp().type() == count.type());
7756
7757    if (nir_src_is_const(src)) {
7758       if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7759          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7760       else if (nir_src_as_uint(src) == 1)
7761          bld.copy(dst, count);
7762       else if (nir_src_as_uint(src) == 0)
7763          bld.copy(dst, Operand::zero(dst.bytes()));
7764       else if (count.type() == RegType::vgpr)
7765          bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7766       else
7767          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7768    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7769       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7770    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7771       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7772    } else if (dst.getTemp().type() == RegType::vgpr) {
7773       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7774    } else {
7775       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7776    }
7777 }
7778
7779 bool
7780 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7781 {
7782    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7783    if (op == nir_op_imul || op == nir_op_fmul)
7784       return false;
7785
7786    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7787       Builder bld(ctx->program, ctx->block);
7788       Definition dst(get_ssa_temp(ctx, &instr->def));
7789       unsigned bit_size = instr->src[0].ssa->bit_size;
7790       if (bit_size > 32)
7791          return false;
7792
7793       Temp thread_count =
7794          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7795       thread_count = emit_wqm(bld, thread_count, Temp(0, s1), nir_intrinsic_include_helpers(instr));
7796
7797       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7798    } else {
7799       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7800    }
7801
7802    return true;
7803 }
7804
7805 bool
7806 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7807 {
7808    Builder bld(ctx->program, ctx->block);
7809    Definition dst(get_ssa_temp(ctx, &instr->def));
7810    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7811    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7812
7813    if (op == nir_op_imul || op == nir_op_fmul)
7814       return false;
7815
7816    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7817       if (instr->src[0].ssa->bit_size > 32)
7818          return false;
7819
7820       Temp packed_tid;
7821       if (inc)
7822          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7823       else
7824          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7825       packed_tid = emit_wqm(bld, packed_tid);
7826
7827       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7828       return true;
7829    }
7830
7831    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7832           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7833
7834    if (inc) {
7835       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7836       return true;
7837    }
7838
7839    /* Copy the source and write the reduction operation identity to the first lane. */
7840    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7841    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7842    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7843    if (dst.bytes() == 8) {
7844       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7845       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7846       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7847       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7848
7849       lo =
7850          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7851       hi =
7852          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7853       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7854    } else {
7855       uint32_t identity = get_reduction_identity(reduce_op, 0);
7856       bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7857                     as_vgpr(ctx, src));
7858    }
7859
7860    return true;
7861 }
7862
7863 Temp
7864 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7865                      Definition dst, Temp src)
7866 {
7867    assert(src.bytes() <= 8);
7868    assert(src.type() == RegType::vgpr);
7869
7870    Builder bld(ctx->program, ctx->block);
7871
7872    unsigned num_defs = 0;
7873    Definition defs[5];
7874    defs[num_defs++] = dst;
7875    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7876
7877    /* scalar identity temporary */
7878    bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7879                      aco_op != aco_opcode::p_reduce;
7880    if (aco_op == aco_opcode::p_exclusive_scan) {
7881       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7882                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7883                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7884                      op == fmul64);
7885    }
7886    if (need_sitmp)
7887       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7888
7889    /* scc clobber */
7890    defs[num_defs++] = bld.def(s1, scc);
7891
7892    /* vcc clobber */
7893    bool clobber_vcc = false;
7894    if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7895       clobber_vcc = true;
7896    if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7897       clobber_vcc = true;
7898    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7899       clobber_vcc = true;
7900
7901    if (clobber_vcc)
7902       defs[num_defs++] = bld.def(bld.lm, vcc);
7903
7904    Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7905       aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7906    reduce->operands[0] = Operand(src);
7907    /* setup_reduce_temp will update these undef operands if needed */
7908    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7909    reduce->operands[2] = Operand(v1.as_linear());
7910    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7911
7912    reduce->reduce_op = op;
7913    reduce->cluster_size = cluster_size;
7914    bld.insert(std::move(reduce));
7915
7916    return dst.getTemp();
7917 }
7918
7919 void
7920 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7921 {
7922    Builder bld(ctx->program, ctx->block);
7923    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7924    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7925
7926    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7927    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7928    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7929    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7930
7931    /* Build DD X/Y */
7932    if (ctx->program->gfx_level >= GFX8) {
7933       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7934       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7935       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7936       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7937       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7938       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7939    } else {
7940       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7941       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7942       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7943       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7944       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
7945
7946       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7947       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7948       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
7949       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7950       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7951    }
7952
7953    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7954    aco_opcode mad =
7955       ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7956    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7957    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7958    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7959    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7960    Temp wqm1 = bld.tmp(v1);
7961    emit_wqm(bld, tmp1, wqm1, true);
7962    Temp wqm2 = bld.tmp(v1);
7963    emit_wqm(bld, tmp2, wqm2, true);
7964    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7965    return;
7966 }
7967
/* Forward declarations of helpers whose definitions are not in this part of the file. */
7968 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7969 Temp lanecount_to_mask(isel_context* ctx, Temp count);
7970 void pops_await_overlapped_waves(isel_context* ctx);
7971
7972 Temp
7973 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
7974 {
7975    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
7976    if (intrin == nir_intrinsic_load_barycentric_pixel ||
7977        intrin == nir_intrinsic_load_barycentric_at_offset) {
7978       return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
7979    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
7980       return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
7981    } else {
7982       assert(intrin == nir_intrinsic_load_barycentric_sample);
7983       return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
7984    }
7985 }
7986
7987 void
7988 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
7989                          unsigned wave_done, unsigned* offset0, unsigned* offset1)
7990 {
7991    unsigned ordered_count_index = index_operand & 0x3f;
7992    unsigned count_dword = (index_operand >> 24) & 0xf;
7993
7994    assert(ctx->options->gfx_level >= GFX10);
7995    assert(count_dword >= 1 && count_dword <= 4);
7996
7997    *offset0 = ordered_count_index << 2;
7998    *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
7999
8000    if (ctx->options->gfx_level < GFX11)
8001       *offset1 |= 3 /* GS shader type */ << 2;
8002 }
8003
/* Description of one MRT export as consumed by the fragment shader export helpers. */
8004 struct aco_export_mrt {
8005    Operand out[4]; /* one operand per channel index 0..3 */
8006    unsigned enabled_channels; /* channel bitmask; its popcount sizes the dual-source temporaries */
8007    unsigned target; /* NOTE(review): presumably the hardware export target - confirm */
8008    bool compr; /* NOTE(review): presumably marks a compressed export - confirm */
8009 };
8010
8011 static void
8012 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8013                                 const struct aco_export_mrt* mrt1)
8014 {
8015    Builder bld(ctx->program, ctx->block);
8016
8017    aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
8018       aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8019    for (unsigned i = 0; i < 4; i++) {
8020       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8021       exp->operands[i].setLateKill(true);
8022       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8023       exp->operands[i + 4].setLateKill(true);
8024    }
8025
8026    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8027    exp->definitions[0] = bld.def(type); /* mrt0 */
8028    exp->definitions[1] = bld.def(type); /* mrt1 */
8029    exp->definitions[2] = bld.def(v1);
8030    exp->definitions[3] = bld.def(bld.lm);
8031    exp->definitions[4] = bld.def(bld.lm, vcc);
8032    exp->definitions[5] = bld.def(s1, scc);
8033    ctx->block->instructions.emplace_back(std::move(exp));
8034
8035    ctx->program->has_color_exports = true;
8036 }
8037
8038 void
8039 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8040 {
8041    Builder bld(ctx->program, ctx->block);
8042    switch (instr->intrinsic) {
8043    case nir_intrinsic_load_barycentric_sample:
8044    case nir_intrinsic_load_barycentric_pixel:
8045    case nir_intrinsic_load_barycentric_centroid: {
8046       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8047       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8048       assert(bary.size() == 2);
8049       Temp dst = get_ssa_temp(ctx, &instr->def);
8050       bld.copy(Definition(dst), bary);
8051       emit_split_vector(ctx, dst, 2);
8052       break;
8053    }
8054    case nir_intrinsic_load_barycentric_model: {
8055       Temp model = get_arg(ctx, ctx->args->pull_model);
8056       assert(model.size() == 3);
8057       Temp dst = get_ssa_temp(ctx, &instr->def);
8058       bld.copy(Definition(dst), model);
8059       emit_split_vector(ctx, dst, 3);
8060       break;
8061    }
8062    case nir_intrinsic_load_barycentric_at_offset: {
8063       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8064       RegClass rc = RegClass(offset.type(), 1);
8065       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8066       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8067       Temp bary = get_interp_param(ctx, instr->intrinsic,
8068                                    (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8069       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8070       break;
8071    }
8072    case nir_intrinsic_load_front_face: {
8073       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8074                Operand::zero(), get_arg(ctx, ctx->args->front_face));
8075       break;
8076    }
8077    case nir_intrinsic_load_view_index: {
8078       Temp dst = get_ssa_temp(ctx, &instr->def);
8079       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8080       break;
8081    }
8082    case nir_intrinsic_load_frag_coord: {
8083       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8084       break;
8085    }
8086    case nir_intrinsic_load_frag_shading_rate:
8087       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8088       break;
8089    case nir_intrinsic_load_sample_pos: {
8090       Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8091       Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8092       bld.pseudo(
8093          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8094          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8095          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8096       break;
8097    }
8098    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8099    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8100    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8101    case nir_intrinsic_load_input:
8102    case nir_intrinsic_load_input_vertex:
8103       if (ctx->program->stage == fragment_fs)
8104          visit_load_fs_input(ctx, instr);
8105       else
8106          isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8107       break;
8108    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8109    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8110    case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8111    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8112    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8113    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8114    case nir_intrinsic_shared_atomic:
8115    case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8116    case nir_intrinsic_load_shared2_amd:
8117    case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8118    case nir_intrinsic_bindless_image_load:
8119    case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8120    case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8121    case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8122    case nir_intrinsic_bindless_image_atomic:
8123    case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8124    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8125    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8126    case nir_intrinsic_load_typed_buffer_amd:
8127    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8128    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8129    case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8130    case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8131    case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8132    case nir_intrinsic_global_atomic_amd:
8133    case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8134    case nir_intrinsic_ssbo_atomic:
8135    case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8136    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8137    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8138    case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8139    case nir_intrinsic_load_num_workgroups: {
8140       Temp dst = get_ssa_temp(ctx, &instr->def);
8141       if (ctx->options->load_grid_size_from_user_sgpr) {
8142          bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8143       } else {
8144          Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8145          assert(addr.regClass() == s2);
8146          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8147                     bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8148                     bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8149       }
8150       emit_split_vector(ctx, dst, 3);
8151       break;
8152    }
8153    case nir_intrinsic_load_ray_launch_size: {
8154       Temp dst = get_ssa_temp(ctx, &instr->def);
8155       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_size)));
8156       emit_split_vector(ctx, dst, 3);
8157       break;
8158    }
8159    case nir_intrinsic_load_ray_launch_id: {
8160       Temp dst = get_ssa_temp(ctx, &instr->def);
8161       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_id)));
8162       emit_split_vector(ctx, dst, 3);
8163       break;
8164    }
8165    case nir_intrinsic_load_ray_launch_size_addr_amd: {
8166       Temp dst = get_ssa_temp(ctx, &instr->def);
8167       Temp addr = get_arg(ctx, ctx->args->rt.launch_size_addr);
8168       assert(addr.regClass() == s2);
8169       bld.copy(Definition(dst), Operand(addr));
8170       break;
8171    }
8172    case nir_intrinsic_load_local_invocation_id: {
8173       Temp dst = get_ssa_temp(ctx, &instr->def);
8174       if (ctx->options->gfx_level >= GFX11) {
8175          Temp local_ids[3];
8176
8177          /* Thread IDs are packed in VGPR0, 10 bits per component. */
8178          for (uint32_t i = 0; i < 3; i++) {
8179             local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8180                                     get_arg(ctx, ctx->args->local_invocation_ids),
8181                                     Operand::c32(i * 10u), Operand::c32(10u));
8182          }
8183
8184          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8185                     local_ids[2]);
8186       } else {
8187          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8188       }
8189       emit_split_vector(ctx, dst, 3);
8190       break;
8191    }
8192    case nir_intrinsic_load_workgroup_id: {
8193       Temp dst = get_ssa_temp(ctx, &instr->def);
8194       if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8195          const struct ac_arg* ids = ctx->args->workgroup_ids;
8196          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8197                     ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
8198                     ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
8199                     ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
8200          emit_split_vector(ctx, dst, 3);
8201       } else {
8202          isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8203       }
8204       break;
8205    }
8206    case nir_intrinsic_load_local_invocation_index: {
8207       if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8208          if (ctx->options->gfx_level >= GFX11) {
8209             /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
8210             Temp wave_id =
8211                bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8212                         get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8213
8214             Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8215                                  Operand::c32(ctx->program->wave_size));
8216             emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8217          } else {
8218             bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8219                      get_arg(ctx, ctx->args->vs_rel_patch_id));
8220          }
8221          break;
8222       } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8223                  ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8224          bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8225          break;
8226       } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8227          emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8228          break;
8229       }
8230
8231       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8232
8233       /* The tg_size bits [6:11] contain the subgroup id,
8234        * we need this multiplied by the wave size, and then OR the thread id to it.
8235        */
8236       if (ctx->program->wave_size == 64) {
8237          /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8238           * feed that to v_or */
8239          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8240                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8241          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8242       } else {
8243          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8244          Temp tg_num =
8245             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8246                      get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8247          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8248                   Operand::c32(0x5u), id);
8249       }
8250       break;
8251    }
8252    case nir_intrinsic_load_subgroup_invocation: {
8253       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8254       break;
8255    }
8256    case nir_intrinsic_ballot: {
8257       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8258       Temp dst = get_ssa_temp(ctx, &instr->def);
8259
8260       if (instr->src[0].ssa->bit_size == 1) {
8261          assert(src.regClass() == bld.lm);
8262       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8263          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8264       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8265          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8266       } else {
8267          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8268       }
8269
8270       /* Make sure that all inactive lanes return zero.
8271        * Value-numbering might remove the comparison above */
8272       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8273       if (dst.size() != bld.lm.size()) {
8274          /* Wave32 with ballot size set to 64 */
8275          src =
8276             bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8277       }
8278
8279       emit_wqm(bld, src, dst);
8280       break;
8281    }
8282    case nir_intrinsic_shuffle:
8283    case nir_intrinsic_read_invocation: {
8284       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8285       if (!nir_src_is_divergent(instr->src[0])) {
8286          emit_uniform_subgroup(ctx, instr, src);
8287       } else {
8288          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8289          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8290              !nir_src_is_divergent(instr->src[1]))
8291             tid = bld.as_uniform(tid);
8292          Temp dst = get_ssa_temp(ctx, &instr->def);
8293
8294          if (instr->def.bit_size != 1)
8295             src = as_vgpr(ctx, src);
8296
8297          if (src.regClass() == v1b || src.regClass() == v2b) {
8298             Temp tmp = bld.tmp(v1);
8299             tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8300             if (dst.type() == RegType::vgpr)
8301                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8302                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8303             else
8304                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8305          } else if (src.regClass() == v1) {
8306             emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8307          } else if (src.regClass() == v2) {
8308             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8309             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8310             lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8311             hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8312             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8313             emit_split_vector(ctx, dst, 2);
8314          } else if (instr->def.bit_size == 1 && tid.regClass() == s1) {
8315             assert(src.regClass() == bld.lm);
8316             Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8317             bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8318          } else if (instr->def.bit_size == 1 && tid.regClass() == v1) {
8319             assert(src.regClass() == bld.lm);
8320             Temp tmp;
8321             if (ctx->program->gfx_level <= GFX7)
8322                tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8323             else if (ctx->program->wave_size == 64)
8324                tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8325             else
8326                tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8327             tmp = emit_extract_vector(ctx, tmp, 0, v1);
8328             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8329             emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8330                      dst);
8331          } else {
8332             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8333          }
8334       }
8335       break;
8336    }
8337    case nir_intrinsic_load_sample_id: {
8338       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8339                get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8340       break;
8341    }
8342    case nir_intrinsic_read_first_invocation: {
8343       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8344       Temp dst = get_ssa_temp(ctx, &instr->def);
8345       if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8346          emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8347       } else if (src.regClass() == v2) {
8348          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8349          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8350          lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8351          hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8352          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8353          emit_split_vector(ctx, dst, 2);
8354       } else if (instr->def.bit_size == 1) {
8355          assert(src.regClass() == bld.lm);
8356          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8357                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8358          bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8359       } else {
8360          bld.copy(Definition(dst), src);
8361       }
8362       break;
8363    }
8364    case nir_intrinsic_vote_all: {
8365       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8366       Temp dst = get_ssa_temp(ctx, &instr->def);
8367       assert(src.regClass() == bld.lm);
8368       assert(dst.regClass() == bld.lm);
8369
8370       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8371       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8372                .def(1)
8373                .getTemp();
8374       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8375       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8376       break;
8377    }
8378    case nir_intrinsic_vote_any: {
8379       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8380       Temp dst = get_ssa_temp(ctx, &instr->def);
8381       assert(src.regClass() == bld.lm);
8382       assert(dst.regClass() == bld.lm);
8383
8384       Temp tmp = bool_to_scalar_condition(ctx, src);
8385       bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8386       break;
8387    }
8388    case nir_intrinsic_reduce:
8389    case nir_intrinsic_inclusive_scan:
8390    case nir_intrinsic_exclusive_scan: {
8391       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8392       Temp dst = get_ssa_temp(ctx, &instr->def);
8393       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8394       unsigned cluster_size =
8395          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8396       cluster_size = util_next_power_of_two(
8397          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8398       bool create_helpers =
8399          instr->intrinsic == nir_intrinsic_reduce && nir_intrinsic_include_helpers(instr);
8400
8401       if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8402           instr->def.bit_size != 1) {
8403          /* We use divergence analysis to assign the regclass, so check if it's
8404           * working as expected */
8405          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8406          if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8407             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8408          assert(instr->def.divergent == expected_divergent);
8409
8410          if (instr->intrinsic == nir_intrinsic_reduce) {
8411             if (emit_uniform_reduce(ctx, instr))
8412                break;
8413          } else if (emit_uniform_scan(ctx, instr)) {
8414             break;
8415          }
8416       }
8417
8418       if (instr->def.bit_size == 1) {
8419          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8420             op = nir_op_iand;
8421          else if (op == nir_op_iadd)
8422             op = nir_op_ixor;
8423          else if (op == nir_op_umax || op == nir_op_imax)
8424             op = nir_op_ior;
8425          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8426
8427          switch (instr->intrinsic) {
8428          case nir_intrinsic_reduce:
8429             emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst, create_helpers);
8430             break;
8431          case nir_intrinsic_exclusive_scan:
8432             emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8433             break;
8434          case nir_intrinsic_inclusive_scan:
8435             emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8436             break;
8437          default: assert(false);
8438          }
8439       } else if (cluster_size == 1) {
8440          bld.copy(Definition(dst), src);
8441       } else {
8442          unsigned bit_size = instr->src[0].ssa->bit_size;
8443
8444          src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8445
8446          ReduceOp reduce_op = get_reduce_op(op, bit_size);
8447
8448          aco_opcode aco_op;
8449          switch (instr->intrinsic) {
8450          case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8451          case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8452          case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8453          default: unreachable("unknown reduce intrinsic");
8454          }
8455
8456          Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8457                                              bld.def(dst.regClass()), src);
8458          emit_wqm(bld, tmp_dst, dst, create_helpers);
8459       }
8460       break;
8461    }
8462    case nir_intrinsic_quad_broadcast:
8463    case nir_intrinsic_quad_swap_horizontal:
8464    case nir_intrinsic_quad_swap_vertical:
8465    case nir_intrinsic_quad_swap_diagonal:
8466    case nir_intrinsic_quad_swizzle_amd: {
8467       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8468
8469       if (!instr->def.divergent) {
8470          emit_uniform_subgroup(ctx, instr, src);
8471          break;
8472       }
8473
8474       /* Quad broadcast lane. */
8475       unsigned lane = 0;
8476       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8477       bool bool_use_valu = instr->def.bit_size == 1;
8478
8479       uint16_t dpp_ctrl = 0;
8480
8481       switch (instr->intrinsic) {
8482       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8483       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8484       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8485       case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8486       case nir_intrinsic_quad_broadcast:
8487          lane = nir_src_as_const_value(instr->src[1])->u32;
8488          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8489          bool_use_valu = false;
8490          break;
8491       default: break;
8492       }
8493
8494       Temp dst = get_ssa_temp(ctx, &instr->def);
8495       Temp tmp(dst);
8496
8497       /* Setup source. */
8498       if (bool_use_valu)
8499          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8500                             Operand::c32(-1), src);
8501       else if (instr->def.bit_size != 1)
8502          src = as_vgpr(ctx, src);
8503
8504       /* Setup temporary destination. */
8505       if (bool_use_valu)
8506          tmp = bld.tmp(v1);
8507       else if (ctx->program->stage == fragment_fs)
8508          tmp = bld.tmp(dst.regClass());
8509
8510       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8511          /* Special case for quad broadcast using SALU only. */
8512          assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
8513
8514          uint32_t half_mask = 0x11111111u << lane;
8515          Operand mask_tmp = bld.lm.bytes() == 4
8516                                ? Operand::c32(half_mask)
8517                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8518                                             Operand::c32(half_mask), Operand::c32(half_mask));
8519
8520          src =
8521             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8522          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8523          bld.sop1(Builder::s_wqm, Definition(tmp), src);
8524       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8525          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8526          Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
8527
8528          if (ctx->program->gfx_level >= GFX8)
8529             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
8530          else
8531             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8532
8533          if (excess_bytes)
8534             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
8535                        bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
8536       } else if (instr->def.bit_size == 64) {
8537          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8538          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8539
8540          if (ctx->program->gfx_level >= GFX8) {
8541             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
8542             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
8543          } else {
8544             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8545             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8546          }
8547
8548          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
8549          emit_split_vector(ctx, tmp, 2);
8550       } else {
8551          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8552       }
8553
8554       if (tmp.id() != dst.id()) {
8555          if (bool_use_valu)
8556             tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
8557
8558          /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8559          emit_wqm(bld, tmp, dst, true);
8560       }
8561
8562       break;
8563    }
8564    case nir_intrinsic_masked_swizzle_amd: {
8565       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8566       if (!instr->def.divergent) {
8567          emit_uniform_subgroup(ctx, instr, src);
8568          break;
8569       }
8570       Temp dst = get_ssa_temp(ctx, &instr->def);
8571       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8572
8573       if (instr->def.bit_size != 1)
8574          src = as_vgpr(ctx, src);
8575
8576       if (instr->def.bit_size == 1) {
8577          assert(src.regClass() == bld.lm);
8578          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8579                             Operand::c32(-1), src);
8580          src = emit_masked_swizzle(ctx, bld, src, mask);
8581          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8582          emit_wqm(bld, tmp, dst);
8583       } else if (dst.regClass() == v1b) {
8584          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8585          emit_extract_vector(ctx, tmp, 0, dst);
8586       } else if (dst.regClass() == v2b) {
8587          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8588          emit_extract_vector(ctx, tmp, 0, dst);
8589       } else if (dst.regClass() == v1) {
8590          emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8591       } else if (dst.regClass() == v2) {
8592          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8593          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8594          lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8595          hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8596          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8597          emit_split_vector(ctx, dst, 2);
8598       } else {
8599          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8600       }
8601       break;
8602    }
8603    case nir_intrinsic_write_invocation_amd: {
8604       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8605       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8606       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8607       Temp dst = get_ssa_temp(ctx, &instr->def);
8608       if (dst.regClass() == v1) {
8609          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8610          emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8611       } else if (dst.regClass() == v2) {
8612          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8613          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8614          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8615          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8616          Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
8617          Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8618          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8619          emit_split_vector(ctx, dst, 2);
8620       } else {
8621          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8622       }
8623       break;
8624    }
8625    case nir_intrinsic_mbcnt_amd: {
8626       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8627       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8628       Temp dst = get_ssa_temp(ctx, &instr->def);
8629       /* Fit 64-bit mask for wave32 */
8630       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8631       Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8632       emit_wqm(bld, wqm_tmp, dst);
8633       break;
8634    }
8635    case nir_intrinsic_lane_permute_16_amd: {
8636       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8637       Temp dst = get_ssa_temp(ctx, &instr->def);
8638       assert(ctx->program->gfx_level >= GFX10);
8639
8640       if (src.regClass() == s1) {
8641          bld.copy(Definition(dst), src);
8642       } else if (dst.regClass() == v1 && src.regClass() == v1) {
8643          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8644                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8645                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8646       } else {
8647          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8648       }
8649       break;
8650    }
8651    case nir_intrinsic_load_helper_invocation:
8652    case nir_intrinsic_is_helper_invocation: {
8653       /* load_helper() after demote() get lowered to is_helper().
8654        * Otherwise, these two behave the same. */
8655       Temp dst = get_ssa_temp(ctx, &instr->def);
8656       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8657       ctx->block->kind |= block_kind_needs_lowering;
8658       ctx->program->needs_exact = true;
8659       break;
8660    }
8661    case nir_intrinsic_demote:
8662    case nir_intrinsic_demote_if: {
8663       Operand cond = Operand::c32(-1u);
8664       if (instr->intrinsic == nir_intrinsic_demote_if) {
8665          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8666          assert(src.regClass() == bld.lm);
8667          cond =
8668             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8669       }
8670
8671       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8672
8673       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8674          ctx->cf_info.exec_potentially_empty_discard = true;
8675
8676       ctx->block->kind |= block_kind_uses_discard;
8677       ctx->program->needs_exact = true;
8678       break;
8679    }
8680    case nir_intrinsic_terminate:
8681    case nir_intrinsic_terminate_if:
8682    case nir_intrinsic_discard:
8683    case nir_intrinsic_discard_if: {
8684       Operand cond = Operand::c32(-1u);
8685       if (instr->intrinsic == nir_intrinsic_discard_if ||
8686           instr->intrinsic == nir_intrinsic_terminate_if) {
8687          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8688          assert(src.regClass() == bld.lm);
8689          cond =
8690             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8691
8692          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
8693       }
8694
8695       bld.pseudo(aco_opcode::p_discard_if, cond);
8696
8697       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8698          ctx->cf_info.exec_potentially_empty_discard = true;
8699       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8700       ctx->block->kind |= block_kind_uses_discard;
8701       ctx->program->needs_exact = true;
8702       break;
8703    }
8704    case nir_intrinsic_first_invocation: {
8705       emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8706                get_ssa_temp(ctx, &instr->def));
8707       break;
8708    }
8709    case nir_intrinsic_last_invocation: {
8710       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8711       Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8712                            Operand::c32(ctx->program->wave_size - 1u), flbit);
8713       emit_wqm(bld, last, get_ssa_temp(ctx, &instr->def));
8714       break;
8715    }
8716    case nir_intrinsic_elect: {
8717       /* p_elect is lowered in aco_insert_exec_mask.
8718        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8719        * two p_elect with different exec masks as the same.
8720        */
8721       Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
8722       emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->def));
8723       ctx->block->kind |= block_kind_needs_lowering;
8724       break;
8725    }
8726    case nir_intrinsic_shader_clock: {
8727       Temp dst = get_ssa_temp(ctx, &instr->def);
8728       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8729           ctx->options->gfx_level >= GFX10_3) {
8730          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8731          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8732          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8733       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8734                  ctx->options->gfx_level >= GFX11) {
8735          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8736                   Operand::c32(sendmsg_rtn_get_realtime));
8737       } else {
8738          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8739                                 ? aco_opcode::s_memrealtime
8740                                 : aco_opcode::s_memtime;
8741          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8742       }
8743       emit_split_vector(ctx, dst, 2);
8744       break;
8745    }
8746    case nir_intrinsic_load_vertex_id_zero_base: {
8747       Temp dst = get_ssa_temp(ctx, &instr->def);
8748       bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
8749       break;
8750    }
8751    case nir_intrinsic_load_first_vertex: {
8752       Temp dst = get_ssa_temp(ctx, &instr->def);
8753       bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
8754       break;
8755    }
8756    case nir_intrinsic_load_base_instance: {
8757       Temp dst = get_ssa_temp(ctx, &instr->def);
8758       bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
8759       break;
8760    }
8761    case nir_intrinsic_load_instance_id: {
8762       Temp dst = get_ssa_temp(ctx, &instr->def);
8763       bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
8764       break;
8765    }
8766    case nir_intrinsic_load_draw_id: {
8767       Temp dst = get_ssa_temp(ctx, &instr->def);
8768       bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
8769       break;
8770    }
8771    case nir_intrinsic_load_invocation_id: {
8772       Temp dst = get_ssa_temp(ctx, &instr->def);
8773
8774       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8775          if (ctx->options->gfx_level >= GFX10)
8776             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8777                          get_arg(ctx, ctx->args->gs_invocation_id));
8778          else
8779             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
8780       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8781          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
8782                   Operand::c32(8u), Operand::c32(5u));
8783       } else {
8784          unreachable("Unsupported stage for load_invocation_id");
8785       }
8786
8787       break;
8788    }
8789    case nir_intrinsic_load_primitive_id: {
8790       Temp dst = get_ssa_temp(ctx, &instr->def);
8791
8792       switch (ctx->shader->info.stage) {
8793       case MESA_SHADER_GEOMETRY:
8794          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8795          break;
8796       case MESA_SHADER_TESS_CTRL:
8797          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
8798          break;
8799       case MESA_SHADER_TESS_EVAL:
8800          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
8801          break;
8802       default:
8803          if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
8804             /* In case of NGG, the GS threads always have the primitive ID
8805              * even if there is no SW GS. */
8806             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8807             break;
8808          } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
8809             bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
8810             break;
8811          }
8812          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8813       }
8814
8815       break;
8816    }
8817    case nir_intrinsic_sendmsg_amd: {
8818       unsigned imm = nir_intrinsic_base(instr);
8819       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8820       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), -1, imm);
8821       break;
8822    }
8823    case nir_intrinsic_load_gs_wave_id_amd: {
8824       Temp dst = get_ssa_temp(ctx, &instr->def);
8825       if (ctx->args->merged_wave_info.used)
8826          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
8827                     get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
8828                     Operand::zero());
8829       else if (ctx->args->gs_wave_id.used)
8830          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
8831       else
8832          unreachable("Shader doesn't have GS wave ID.");
8833       break;
8834    }
8835    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
8836       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8837       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
8838       break;
8839    }
8840    case nir_intrinsic_gds_atomic_add_amd: {
8841       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8842       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8843       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8844       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8845       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8846              true);
8847       break;
8848    }
8849    case nir_intrinsic_load_sbt_base_amd: {
8850       Temp dst = get_ssa_temp(ctx, &instr->def);
8851       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
8852       assert(addr.regClass() == s2);
8853       bld.copy(Definition(dst), Operand(addr));
8854       break;
8855    }
8856    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8857    case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
8858       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8859                get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
8860       break;
8861    case nir_intrinsic_load_resume_shader_address_amd: {
8862       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
8863                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
8864       break;
8865    }
8866    case nir_intrinsic_overwrite_vs_arguments_amd: {
8867       ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8868       ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8869       break;
8870    }
8871    case nir_intrinsic_overwrite_tes_arguments_amd: {
8872       ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8873       ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8874       ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
8875       ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
8876       break;
8877    }
8878    case nir_intrinsic_load_scalar_arg_amd:
8879    case nir_intrinsic_load_vector_arg_amd: {
8880       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
8881       Temp dst = get_ssa_temp(ctx, &instr->def);
8882       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
8883       assert(src.id());
8884       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
8885                                                                                   : RegType::vgpr));
8886       bld.copy(Definition(dst), src);
8887       emit_split_vector(ctx, dst, dst.size());
8888       break;
8889    }
8890    case nir_intrinsic_ordered_xfb_counter_add_amd: {
8891       Temp dst = get_ssa_temp(ctx, &instr->def);
8892       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
8893       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
8894
8895       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8896       unsigned offset0, offset1;
8897       Instruction* ds_instr;
8898       Operand m;
8899
8900       /* Lock a GDS mutex. */
8901       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
8902       m = bld.m0(bld.as_uniform(ordered_id));
8903       ds_instr =
8904          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8905       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8906
8907       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
8908          aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
8909       unsigned write_mask = nir_intrinsic_write_mask(instr);
8910
8911       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
8912
8913       for (unsigned i = 0; i < instr->num_components; i++) {
8914          if (write_mask & (1 << i)) {
8915             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8916
8917             if (use_gds_registers) {
8918                ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(),
8919                                  chan_counter, i * 4, 0u, true);
8920             } else {
8921                m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
8922
8923                ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
8924                                  i * 4, 0u, true);
8925             }
8926             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8927
8928             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
8929          } else {
8930             vec->operands[i] = Operand::zero();
8931          }
8932       }
8933
8934       vec->definitions[0] = Definition(dst);
8935       ctx->block->instructions.emplace_back(std::move(vec));
8936
8937       /* Unlock a GDS mutex. */
8938       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
8939       m = bld.m0(bld.as_uniform(ordered_id));
8940       ds_instr =
8941          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8942       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8943
8944       emit_split_vector(ctx, dst, instr->num_components);
8945       break;
8946    }
8947    case nir_intrinsic_xfb_counter_sub_amd: {
8948       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
8949
8950       unsigned write_mask = nir_intrinsic_write_mask(instr);
8951       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
8952       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8953
8954       u_foreach_bit (i, write_mask) {
8955          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8956          Instruction* ds_instr;
8957
8958          if (use_gds_registers) {
8959             ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
8960                               i * 4, 0u, true);
8961          } else {
8962             Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
8963
8964             ds_instr = bld.ds(aco_opcode::ds_sub_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
8965                               i * 4, 0u, true);
8966          }
8967          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8968       }
8969       break;
8970    }
8971    case nir_intrinsic_export_amd: {
8972       unsigned flags = nir_intrinsic_flags(instr);
8973       unsigned target = nir_intrinsic_base(instr);
8974       unsigned write_mask = nir_intrinsic_write_mask(instr);
8975
8976       /* Mark vertex export block. */
8977       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
8978          ctx->block->kind |= block_kind_export_end;
8979
8980       if (target < V_008DFC_SQ_EXP_MRTZ)
8981          ctx->program->has_color_exports = true;
8982
8983       aco_ptr<Export_instruction> exp{
8984          create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
8985
8986       exp->dest = target;
8987       exp->enabled_mask = write_mask;
8988       exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
8989
8990       /* ACO may reorder position/mrt export instructions, then mark done for last
8991        * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
8992        * exports here and leave it to ACO.
8993        */
8994       if (target == V_008DFC_SQ_EXP_PRIM)
8995          exp->done = flags & AC_EXP_FLAG_DONE;
8996       else
8997          exp->done = false;
8998
8999       /* ACO may reorder mrt export instructions, then mark valid mask for last
9000        * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
9001        * exports here and leave it to ACO.
9002        */
9003       if (target > V_008DFC_SQ_EXP_NULL)
9004          exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9005       else
9006          exp->valid_mask = false;
9007
9008       /* Compressed export uses two bits for a channel. */
9009       uint32_t channel_mask =
9010          exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
9011
9012       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9013       for (unsigned i = 0; i < 4; i++) {
9014          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9015                                ? Operand(emit_extract_vector(ctx, value, i, v1))
9016                                : Operand(v1);
9017       }
9018
9019       ctx->block->instructions.emplace_back(std::move(exp));
9020       break;
9021    }
9022    case nir_intrinsic_export_dual_src_blend_amd: {
9023       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9024       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9025       unsigned write_mask = nir_intrinsic_write_mask(instr);
9026
9027       struct aco_export_mrt mrt0, mrt1;
9028       for (unsigned i = 0; i < 4; i++) {
9029          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9030                                                     : Operand(v1);
9031
9032          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9033                                                     : Operand(v1);
9034       }
9035       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9036
9037       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9038
9039       ctx->block->kind |= block_kind_export_end;
9040       break;
9041    }
9042    case nir_intrinsic_strict_wqm_coord_amd: {
9043       Temp dst = get_ssa_temp(ctx, &instr->def);
9044       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9045       Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
9046       unsigned begin_size = nir_intrinsic_base(instr);
9047
9048       unsigned num_src = 1;
9049       auto it = ctx->allocated_vec.find(src.id());
9050       if (it != ctx->allocated_vec.end())
9051          num_src = src.bytes() / it->second[0].bytes();
9052
9053       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9054          aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
9055
9056       if (begin_size)
9057          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9058       for (unsigned i = 0; i < num_src; i++) {
9059          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9060          vec->operands[i + !!begin_size] = Operand(comp);
9061       }
9062
9063       vec->definitions[0] = Definition(tmp);
9064       ctx->block->instructions.emplace_back(std::move(vec));
9065
9066       bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
9067       break;
9068    }
9069    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9070       Temp dst = get_ssa_temp(ctx, &instr->def);
9071       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9072                Operand::c32(aco_symbol_lds_ngg_scratch_base));
9073       break;
9074    }
9075    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9076       Temp dst = get_ssa_temp(ctx, &instr->def);
9077       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9078                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9079       break;
9080    }
9081    case nir_intrinsic_store_scalar_arg_amd: {
9082       ctx->arg_temps[nir_intrinsic_base(instr)] =
9083          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9084       break;
9085    }
9086    case nir_intrinsic_store_vector_arg_amd: {
9087       ctx->arg_temps[nir_intrinsic_base(instr)] =
9088          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9089       break;
9090    }
9091    case nir_intrinsic_begin_invocation_interlock: {
9092       pops_await_overlapped_waves(ctx);
9093       break;
9094    }
9095    case nir_intrinsic_end_invocation_interlock: {
9096       if (ctx->options->gfx_level < GFX11)
9097          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9098       break;
9099    }
9100    default:
9101       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9102       abort();
9103
9104       break;
9105    }
9106 }
9107
9108 void
9109 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9110 {
9111    if (vec->parent_instr->type != nir_instr_type_alu)
9112       return;
9113    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9114    if (vec_instr->op != nir_op_vec(vec->num_components))
9115       return;
9116
9117    for (unsigned i = 0; i < vec->num_components; i++) {
9118       cv[i] =
9119          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9120    }
9121 }
9122
9123 void
9124 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9125 {
9126    assert(instr->op != nir_texop_samples_identical);
9127
9128    Builder bld(ctx->program, ctx->block);
9129    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9130         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9131         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9132    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9133                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9134                            coord = Temp(), wqm_coord = Temp();
9135    std::vector<Temp> coords;
9136    std::vector<Temp> derivs;
9137    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9138
9139    for (unsigned i = 0; i < instr->num_srcs; i++) {
9140       switch (instr->src[i].src_type) {
9141       case nir_tex_src_texture_handle:
9142          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9143          break;
9144       case nir_tex_src_sampler_handle:
9145          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9146          break;
9147       default: break;
9148       }
9149    }
9150
9151    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9152                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9153    bool tg4_integer_cube_workaround =
9154       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9155
9156    bool a16 = false, g16 = false;
9157
9158    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9159    if (coord_idx > 0)
9160       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9161
9162    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9163    if (ddx_idx > 0)
9164       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9165
9166    for (unsigned i = 0; i < instr->num_srcs; i++) {
9167       switch (instr->src[i].src_type) {
9168       case nir_tex_src_coord: {
9169          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9170          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9171          break;
9172       }
9173       case nir_tex_src_backend1: {
9174          assert(instr->src[i].src.ssa->bit_size == 32);
9175          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9176          has_wqm_coord = true;
9177          break;
9178       }
9179       case nir_tex_src_bias:
9180          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9181          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9182          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9183          has_bias = true;
9184          break;
9185       case nir_tex_src_lod: {
9186          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9187             level_zero = true;
9188          } else {
9189             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9190             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9191             has_lod = true;
9192          }
9193          break;
9194       }
9195       case nir_tex_src_min_lod:
9196          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9197          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9198          has_clamped_lod = true;
9199          break;
9200       case nir_tex_src_comparator:
9201          if (instr->is_shadow) {
9202             assert(instr->src[i].src.ssa->bit_size == 32);
9203             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9204             has_compare = true;
9205          }
9206          break;
9207       case nir_tex_src_offset:
9208       case nir_tex_src_backend2:
9209          assert(instr->src[i].src.ssa->bit_size == 32);
9210          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9211          get_const_vec(instr->src[i].src.ssa, const_offset);
9212          has_offset = true;
9213          break;
9214       case nir_tex_src_ddx:
9215          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9216          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9217          has_ddx = true;
9218          break;
9219       case nir_tex_src_ddy:
9220          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9221          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9222          has_ddy = true;
9223          break;
9224       case nir_tex_src_ms_index:
9225          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9226          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9227          has_sample_index = true;
9228          break;
9229       case nir_tex_src_texture_offset:
9230       case nir_tex_src_sampler_offset:
9231       default: break;
9232       }
9233    }
9234
9235    if (has_wqm_coord) {
9236       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9237              instr->op == nir_texop_lod);
9238       assert(wqm_coord.regClass().is_linear_vgpr());
9239       assert(!a16 && !g16);
9240    }
9241
9242    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9243       level_zero = true;
9244
9245    if (has_offset) {
9246       assert(instr->op != nir_texop_txf);
9247
9248       aco_ptr<Instruction> tmp_instr;
9249       Temp acc, pack = Temp();
9250
9251       uint32_t pack_const = 0;
9252       for (unsigned i = 0; i < offset.size(); i++) {
9253          if (!const_offset[i])
9254             continue;
9255          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9256       }
9257
9258       if (offset.type() == RegType::sgpr) {
9259          for (unsigned i = 0; i < offset.size(); i++) {
9260             if (const_offset[i])
9261                continue;
9262
9263             acc = emit_extract_vector(ctx, offset, i, s1);
9264             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9265                            Operand::c32(0x3Fu));
9266
9267             if (i) {
9268                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9269                               Operand::c32(8u * i));
9270             }
9271
9272             if (pack == Temp()) {
9273                pack = acc;
9274             } else {
9275                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9276             }
9277          }
9278
9279          if (pack_const && pack != Temp())
9280             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9281                             Operand::c32(pack_const), pack);
9282       } else {
9283          for (unsigned i = 0; i < offset.size(); i++) {
9284             if (const_offset[i])
9285                continue;
9286
9287             acc = emit_extract_vector(ctx, offset, i, v1);
9288             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9289
9290             if (i) {
9291                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9292             }
9293
9294             if (pack == Temp()) {
9295                pack = acc;
9296             } else {
9297                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9298             }
9299          }
9300
9301          if (pack_const && pack != Temp())
9302             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9303       }
9304       if (pack_const && pack == Temp())
9305          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9306       else if (pack == Temp())
9307          has_offset = false;
9308       else
9309          offset = pack;
9310    }
9311
9312    std::vector<Temp> unpacked_coord;
9313    if (coord != Temp())
9314       unpacked_coord.push_back(coord);
9315    if (has_sample_index)
9316       unpacked_coord.push_back(sample_index);
9317    if (has_lod)
9318       unpacked_coord.push_back(lod);
9319    if (has_clamped_lod)
9320       unpacked_coord.push_back(clamped_lod);
9321
9322    coords = emit_pack_v1(ctx, unpacked_coord);
9323
9324    /* pack derivatives */
9325    if (has_ddx || has_ddy) {
9326       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9327       std::array<Temp, 2> ddxddy = {ddx, ddy};
9328       for (Temp tmp : ddxddy) {
9329          if (tmp == Temp())
9330             continue;
9331          std::vector<Temp> unpacked = {tmp};
9332          for (Temp derv : emit_pack_v1(ctx, unpacked))
9333             derivs.push_back(derv);
9334       }
9335       has_derivs = true;
9336    }
9337
9338    unsigned dim = 0;
9339    bool da = false;
9340    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9341       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9342       da = should_declare_array((ac_image_dim)dim);
9343    }
9344
9345    /* Build tex instruction */
9346    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9347    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9348       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9349    if (instr->is_sparse)
9350       dmask = MAX2(dmask, 1) | 0x10;
9351    bool d16 = instr->def.bit_size == 16;
9352    Temp dst = get_ssa_temp(ctx, &instr->def);
9353    Temp tmp_dst = dst;
9354
9355    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9356    if (instr->op == nir_texop_tg4) {
9357       assert(instr->def.num_components == (4 + instr->is_sparse));
9358       if (instr->is_shadow)
9359          dmask = 1;
9360       else
9361          dmask = 1 << instr->component;
9362       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9363          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9364    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9365       tmp_dst = bld.tmp(v1);
9366    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9367       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9368       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9369    }
9370
9371    Temp tg4_compare_cube_wa64 = Temp();
9372
9373    if (tg4_integer_workarounds) {
9374       Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9375       Temp size = bld.tmp(v2);
9376       MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9377                                         Operand(s4), std::vector<Temp>{tg4_lod});
9378       tex->dim = dim;
9379       tex->dmask = 0x3;
9380       tex->da = da;
9381       emit_split_vector(ctx, size, size.size());
9382
9383       Temp half_texel[2];
9384       for (unsigned i = 0; i < 2; i++) {
9385          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9386          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9387          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9388          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9389                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9390       }
9391
9392       if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9393          /* In vulkan, whether the sampler uses unnormalized
9394           * coordinates or not is a dynamic property of the
9395           * sampler. Hence, to figure out whether or not we
9396           * need to divide by the texture size, we need to test
9397           * the sampler at runtime. This tests the bit set by
9398           * radv_init_sampler().
9399           */
9400          unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9401          Temp not_needed =
9402             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9403
9404          not_needed = bool_to_vector_condition(ctx, not_needed);
9405          half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9406                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9407          half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9408                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9409       }
9410
9411       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9412                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9413
9414       if (tg4_integer_cube_workaround) {
9415          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9416          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9417          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9418             aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9419          split->operands[0] = Operand(resource);
9420          for (unsigned i = 0; i < resource.size(); i++) {
9421             desc[i] = bld.tmp(s1);
9422             split->definitions[i] = Definition(desc[i]);
9423          }
9424          ctx->block->instructions.emplace_back(std::move(split));
9425
9426          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9427                               Operand::c32(20u | (6u << 16)));
9428          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9429                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9430
9431          Temp nfmt;
9432          if (instr->dest_type & nir_type_uint) {
9433             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9434                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9435                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9436          } else {
9437             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9438                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9439                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9440          }
9441          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9442          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9443
9444          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9445                          Operand::c32(26u));
9446
9447          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9448                             Operand::c32(C_008F14_NUM_FORMAT));
9449          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9450
9451          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9452             aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9453          for (unsigned i = 0; i < resource.size(); i++)
9454             vec->operands[i] = Operand(desc[i]);
9455          resource = bld.tmp(resource.regClass());
9456          vec->definitions[0] = Definition(resource);
9457          ctx->block->instructions.emplace_back(std::move(vec));
9458
9459          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9460                                   tg4_compare_cube_wa64);
9461          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9462                                   tg4_compare_cube_wa64);
9463       }
9464       coords[0] = new_coords[0];
9465       coords[1] = new_coords[1];
9466    }
9467
9468    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9469       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9470       // ac_build_buffer_load_format_gfx9_safe()
9471
9472       assert(coords.size() == 1);
9473       aco_opcode op;
9474       if (d16) {
9475          switch (util_last_bit(dmask & 0xf)) {
9476          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9477          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9478          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9479          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9480          default: unreachable("Tex instruction loads more than 4 components.");
9481          }
9482       } else {
9483          switch (util_last_bit(dmask & 0xf)) {
9484          case 1: op = aco_opcode::buffer_load_format_x; break;
9485          case 2: op = aco_opcode::buffer_load_format_xy; break;
9486          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9487          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9488          default: unreachable("Tex instruction loads more than 4 components.");
9489          }
9490       }
9491
9492       aco_ptr<MUBUF_instruction> mubuf{
9493          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9494       mubuf->operands[0] = Operand(resource);
9495       mubuf->operands[1] = Operand(coords[0]);
9496       mubuf->operands[2] = Operand::c32(0);
9497       mubuf->definitions[0] = Definition(tmp_dst);
9498       mubuf->idxen = true;
9499       mubuf->tfe = instr->is_sparse;
9500       if (mubuf->tfe)
9501          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9502       ctx->block->instructions.emplace_back(std::move(mubuf));
9503
9504       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9505       return;
9506    }
9507
9508    /* gather MIMG address components */
9509    std::vector<Temp> args;
9510    if (has_wqm_coord) {
9511       args.emplace_back(wqm_coord);
9512       if (!(ctx->block->kind & block_kind_top_level))
9513          ctx->unended_linear_vgprs.push_back(wqm_coord);
9514    }
9515    if (has_offset)
9516       args.emplace_back(offset);
9517    if (has_bias)
9518       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9519    if (has_compare)
9520       args.emplace_back(compare);
9521    if (has_derivs)
9522       args.insert(args.end(), derivs.begin(), derivs.end());
9523
9524    args.insert(args.end(), coords.begin(), coords.end());
9525
9526    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9527        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9528       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9529                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9530                          ? aco_opcode::image_load
9531                          : aco_opcode::image_load_mip;
9532       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9533       MIMG_instruction* tex =
9534          emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, false, vdata);
9535       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9536          tex->dim = da ? ac_image_2darray : ac_image_2d;
9537       else
9538          tex->dim = dim;
9539       tex->dmask = dmask & 0xf;
9540       tex->unrm = true;
9541       tex->da = da;
9542       tex->tfe = instr->is_sparse;
9543       tex->d16 = d16;
9544       tex->a16 = a16;
9545
9546       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9547          /* Use 0x76543210 if the image doesn't have FMASK. */
9548          assert(dmask == 1 && dst.bytes() == 4);
9549          assert(dst.id() != tmp_dst.id());
9550
9551          if (dst.regClass() == s1) {
9552             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9553                                         emit_extract_vector(ctx, resource, 1, s1));
9554             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9555                      Operand::c32(0x76543210), bld.scc(is_not_null));
9556          } else {
9557             Temp is_not_null = bld.tmp(bld.lm);
9558             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9559                          emit_extract_vector(ctx, resource, 1, s1));
9560             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9561                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9562          }
9563       } else {
9564          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9565       }
9566       return;
9567    }
9568
9569    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9570
9571    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9572    aco_opcode opcode = aco_opcode::image_sample;
9573    if (has_offset) { /* image_sample_*_o */
9574       if (has_clamped_lod) {
9575          if (has_compare) {
9576             opcode = aco_opcode::image_sample_c_cl_o;
9577             if (separate_g16)
9578                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9579             else if (has_derivs)
9580                opcode = aco_opcode::image_sample_c_d_cl_o;
9581             if (has_bias)
9582                opcode = aco_opcode::image_sample_c_b_cl_o;
9583          } else {
9584             opcode = aco_opcode::image_sample_cl_o;
9585             if (separate_g16)
9586                opcode = aco_opcode::image_sample_d_cl_o_g16;
9587             else if (has_derivs)
9588                opcode = aco_opcode::image_sample_d_cl_o;
9589             if (has_bias)
9590                opcode = aco_opcode::image_sample_b_cl_o;
9591          }
9592       } else if (has_compare) {
9593          opcode = aco_opcode::image_sample_c_o;
9594          if (separate_g16)
9595             opcode = aco_opcode::image_sample_c_d_o_g16;
9596          else if (has_derivs)
9597             opcode = aco_opcode::image_sample_c_d_o;
9598          if (has_bias)
9599             opcode = aco_opcode::image_sample_c_b_o;
9600          if (level_zero)
9601             opcode = aco_opcode::image_sample_c_lz_o;
9602          if (has_lod)
9603             opcode = aco_opcode::image_sample_c_l_o;
9604       } else {
9605          opcode = aco_opcode::image_sample_o;
9606          if (separate_g16)
9607             opcode = aco_opcode::image_sample_d_o_g16;
9608          else if (has_derivs)
9609             opcode = aco_opcode::image_sample_d_o;
9610          if (has_bias)
9611             opcode = aco_opcode::image_sample_b_o;
9612          if (level_zero)
9613             opcode = aco_opcode::image_sample_lz_o;
9614          if (has_lod)
9615             opcode = aco_opcode::image_sample_l_o;
9616       }
9617    } else if (has_clamped_lod) { /* image_sample_*_cl */
9618       if (has_compare) {
9619          opcode = aco_opcode::image_sample_c_cl;
9620          if (separate_g16)
9621             opcode = aco_opcode::image_sample_c_d_cl_g16;
9622          else if (has_derivs)
9623             opcode = aco_opcode::image_sample_c_d_cl;
9624          if (has_bias)
9625             opcode = aco_opcode::image_sample_c_b_cl;
9626       } else {
9627          opcode = aco_opcode::image_sample_cl;
9628          if (separate_g16)
9629             opcode = aco_opcode::image_sample_d_cl_g16;
9630          else if (has_derivs)
9631             opcode = aco_opcode::image_sample_d_cl;
9632          if (has_bias)
9633             opcode = aco_opcode::image_sample_b_cl;
9634       }
9635    } else { /* no offset */
9636       if (has_compare) {
9637          opcode = aco_opcode::image_sample_c;
9638          if (separate_g16)
9639             opcode = aco_opcode::image_sample_c_d_g16;
9640          else if (has_derivs)
9641             opcode = aco_opcode::image_sample_c_d;
9642          if (has_bias)
9643             opcode = aco_opcode::image_sample_c_b;
9644          if (level_zero)
9645             opcode = aco_opcode::image_sample_c_lz;
9646          if (has_lod)
9647             opcode = aco_opcode::image_sample_c_l;
9648       } else {
9649          opcode = aco_opcode::image_sample;
9650          if (separate_g16)
9651             opcode = aco_opcode::image_sample_d_g16;
9652          else if (has_derivs)
9653             opcode = aco_opcode::image_sample_d;
9654          if (has_bias)
9655             opcode = aco_opcode::image_sample_b;
9656          if (level_zero)
9657             opcode = aco_opcode::image_sample_lz;
9658          if (has_lod)
9659             opcode = aco_opcode::image_sample_l;
9660       }
9661    }
9662
9663    if (instr->op == nir_texop_tg4) {
9664       /* GFX11 supports implicit LOD, but the extension is unsupported. */
9665       assert(level_zero || ctx->options->gfx_level < GFX11);
9666
9667       if (has_offset) { /* image_gather4_*_o */
9668          if (has_compare) {
9669             opcode = aco_opcode::image_gather4_c_o;
9670             if (level_zero)
9671                opcode = aco_opcode::image_gather4_c_lz_o;
9672             if (has_lod)
9673                opcode = aco_opcode::image_gather4_c_l_o;
9674             if (has_bias)
9675                opcode = aco_opcode::image_gather4_c_b_o;
9676          } else {
9677             opcode = aco_opcode::image_gather4_o;
9678             if (level_zero)
9679                opcode = aco_opcode::image_gather4_lz_o;
9680             if (has_lod)
9681                opcode = aco_opcode::image_gather4_l_o;
9682             if (has_bias)
9683                opcode = aco_opcode::image_gather4_b_o;
9684          }
9685       } else {
9686          if (has_compare) {
9687             opcode = aco_opcode::image_gather4_c;
9688             if (level_zero)
9689                opcode = aco_opcode::image_gather4_c_lz;
9690             if (has_lod)
9691                opcode = aco_opcode::image_gather4_c_l;
9692             if (has_bias)
9693                opcode = aco_opcode::image_gather4_c_b;
9694          } else {
9695             opcode = aco_opcode::image_gather4;
9696             if (level_zero)
9697                opcode = aco_opcode::image_gather4_lz;
9698             if (has_lod)
9699                opcode = aco_opcode::image_gather4_l;
9700             if (has_bias)
9701                opcode = aco_opcode::image_gather4_b;
9702          }
9703       }
9704    } else if (instr->op == nir_texop_lod) {
9705       opcode = aco_opcode::image_get_lod;
9706    }
9707
9708    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9709                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9710                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9711
9712    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9713    MIMG_instruction* tex =
9714       emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, implicit_derivs, vdata);
9715    tex->dim = dim;
9716    tex->dmask = dmask & 0xf;
9717    tex->da = da;
9718    tex->tfe = instr->is_sparse;
9719    tex->d16 = d16;
9720    tex->a16 = a16;
9721
9722    if (tg4_integer_cube_workaround) {
9723       assert(tmp_dst.id() != dst.id());
9724       assert(tmp_dst.size() == dst.size());
9725
9726       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9727       Temp val[4];
9728       for (unsigned i = 0; i < 4; i++) {
9729          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9730          Temp cvt_val;
9731          if (instr->dest_type & nir_type_uint)
9732             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9733          else
9734             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9735          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9736                            tg4_compare_cube_wa64);
9737       }
9738
9739       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9740       if (instr->is_sparse)
9741          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9742                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9743       else
9744          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9745                               val[3]);
9746    }
9747    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9748    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9749 }
9750
9751 Operand
9752 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
9753 {
9754    Temp tmp = get_ssa_temp(ctx, ssa);
9755    if (ssa->parent_instr->type == nir_instr_type_undef) {
9756       return Operand(rc);
9757    } else if (logical && ssa->bit_size == 1 &&
9758               ssa->parent_instr->type == nir_instr_type_load_const) {
9759       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9760       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9761    } else {
9762       return Operand(tmp);
9763    }
9764 }
9765
/* Translates a NIR phi into an ACO p_phi / p_linear_phi inserted at the start of a block.
 *
 * The phi is treated as "logical" when the destination is not linear, the NIR def is
 * divergent, or the current block is a merge block; otherwise a linear phi is built.
 * NIR sources are matched against the chosen predecessor list (logical_preds or
 * linear_preds). Predecessors that have no matching NIR source receive an undefined
 * operand of the destination's register class.
 */
9766 void
9767 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9768 {
9769    aco_ptr<Pseudo_instruction> phi;
9770    Temp dst = get_ssa_temp(ctx, &instr->def);
9771    assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9772
9773    bool logical = !dst.is_linear() || instr->def.divergent;
9774    logical |= (ctx->block->kind & block_kind_merge) != 0;
9775    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9776
9777    /* we want a sorted list of sources, since the predecessor list is also sorted */
9778    std::map<unsigned, nir_def*> phi_src;
9779    nir_foreach_phi_src (src, instr)
9780       phi_src[src->pred->index] = src->src.ssa;
9781
9782    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9783    unsigned num_operands = 0;
        /* Scratch operand list. NOTE(review): the "+ 1" presumably reserves room for the extra
         * continue operand appended for linear loop-header phis further down -- confirm. */
9784    Operand* const operands = (Operand*)alloca(
9785       (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
        /* num_defined counts operands from NIR sources that are not undefined. */
9786    unsigned num_defined = 0;
9787    unsigned cur_pred_idx = 0;
9788    for (std::pair<unsigned, nir_def*> src : phi_src) {
9789       if (cur_pred_idx < preds.size()) {
9790          /* handle missing preds (IF merges with discard/break) and extra preds
9791           * (loop exit with discard) */
9792          unsigned block = ctx->cf_info.nir_to_aco[src.first];
9793          unsigned skipped = 0;
9794          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9795             skipped++;
9796          if (cur_pred_idx + skipped < preds.size()) {
              /* The source's block was found later in preds: pad the skipped predecessors
               * with undefined operands. */
9797             for (unsigned i = 0; i < skipped; i++)
9798                operands[num_operands++] = Operand(dst.regClass());
9799             cur_pred_idx += skipped;
9800          } else {
              /* The source's block is not among the remaining predecessors: drop the source. */
9801             continue;
9802          }
9803       }
9804       /* Handle missing predecessors at the end. This shouldn't happen with loop
9805        * headers and we can't ignore these sources for loop header phis. */
9806       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9807          continue;
9808       cur_pred_idx++;
9809       Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9810       operands[num_operands++] = op;
9811       num_defined += !op.isUndefined();
9812    }
9813    /* handle block_kind_continue_or_break at loop exit blocks */
9814    while (cur_pred_idx++ < preds.size())
9815       operands[num_operands++] = Operand(dst.regClass());
9816
9817    /* If the loop ends with a break, still add a linear continue edge in case
9818     * that break is divergent or continue_or_break is used. We'll either remove
9819     * this operand later in visit_loop() if it's not necessary or replace the
9820     * undef with something correct. */
9821    if (!logical && ctx->block->kind & block_kind_loop_header) {
9822       nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9823       nir_block* last = nir_loop_last_block(loop);
9824       if (last->successors[0] != instr->instr.block)
9825          operands[num_operands++] = Operand(RegClass());
9826    }
9827
9828    /* we can use a linear phi in some cases if one src is undef */
9829    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9830       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9831                                                        num_operands, 1));
9832
9833       Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9834       Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9835       assert(invert->kind & block_kind_invert);
9836
9837       unsigned then_block = invert->linear_preds[0];
9838
           /* Place the phi in the invert block when the defined value comes from the then
            * side, otherwise in the current (merge) block. */
9839       Block* insert_block = NULL;
9840       for (unsigned i = 0; i < num_operands; i++) {
9841          Operand op = operands[i];
9842          if (op.isUndefined())
9843             continue;
9844          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9845          phi->operands[0] = op;
9846          break;
9847       }
9848       assert(insert_block); /* num_defined == 1, so one defined operand must have been found */
9849       phi->operands[1] = Operand(dst.regClass());
9850       phi->definitions[0] = Definition(dst);
9851       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9852       return;
9853    }
9854
        /* General case: emit the phi with all collected operands at the start of this block. */
9855    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9856    for (unsigned i = 0; i < num_operands; i++)
9857       phi->operands[i] = operands[i];
9858    phi->definitions[0] = Definition(dst);
9859    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9860 }
9861
9862 void
9863 visit_undef(isel_context* ctx, nir_undef_instr* instr)
9864 {
9865    Temp dst = get_ssa_temp(ctx, &instr->def);
9866
9867    assert(dst.type() == RegType::sgpr);
9868
9869    if (dst.size() == 1) {
9870       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9871    } else {
9872       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9873          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9874       for (unsigned i = 0; i < dst.size(); i++)
9875          vec->operands[i] = Operand::zero();
9876       vec->definitions[0] = Definition(dst);
9877       ctx->block->instructions.emplace_back(std::move(vec));
9878    }
9879 }
9880
/* Opens a loop in the ACO CFG.
 *
 * Ends the current block as the loop preheader (with an unconditional branch), creates the
 * loop header block, links it from the preheader and makes it the current block. The
 * enclosing loop/if control-flow state is saved into lc and reset for the loop body;
 * end_loop() restores it. lc->loop_exit is only tagged here; it is inserted into the
 * program by end_loop().
 */
9881 void
9882 begin_loop(isel_context* ctx, loop_context* lc)
9883 {
9884    // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9885    append_logical_end(ctx->block);
9886    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9887    Builder bld(ctx->program, ctx->block);
9888    bld.branch(aco_opcode::p_branch, bld.def(s2));
        /* Remember the index rather than the pointer: the block is referenced again after a
         * new block has been created. */
9889    unsigned loop_preheader_idx = ctx->block->index;
9890
        /* The exit block inherits the top-level flag of the preheader. */
9891    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9892
9893    ctx->program->next_loop_depth++;
9894
9895    Block* loop_header = ctx->program->create_and_insert_block();
9896    loop_header->kind |= block_kind_loop_header;
9897    add_edge(loop_preheader_idx, loop_header);
9898    ctx->block = loop_header;
9899
9900    append_logical_start(ctx->block);
9901
        /* Save the outer control-flow state and install fresh state for this loop. */
9902    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9903    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9904    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9905    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9906    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9907 }
9908
/* Closes the loop opened by begin_loop().
 *
 * If the loop body did not already end with a branch, the current block is terminated with
 * the back-edge to the loop header (optionally via continue_or_break helper blocks when exec
 * may be empty). Afterwards the loop exit block stored in lc is inserted and becomes the
 * current block, and the control-flow state saved by begin_loop() is restored.
 */
9909 void
9910 end_loop(isel_context* ctx, loop_context* lc)
9911 {
9912    // TODO: what if a loop ends with a unconditional or uniformly branched continue
9913    //       and this branch is never taken?
9914    if (!ctx->cf_info.has_branch) {
9915       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9916       Builder bld(ctx->program, ctx->block);
9917       append_logical_end(ctx->block);
9918
9919       if (ctx->cf_info.exec_potentially_empty_discard ||
9920           ctx->cf_info.exec_potentially_empty_break) {
9921          /* Discards can result in code running with an empty exec mask.
9922           * This would result in divergent breaks not ever being taken. As a
9923           * workaround, break the loop when the loop mask is empty instead of
9924           * always continuing. */
9925          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
              /* Keep the index: ctx->block is re-fetched by index after the helper blocks
               * below have been created. */
9926          unsigned block_idx = ctx->block->index;
9927
9928          /* create helper blocks to avoid critical edges */
9929          Block* break_block = ctx->program->create_and_insert_block();
9930          break_block->kind = block_kind_uniform;
9931          bld.reset(break_block);
9932          bld.branch(aco_opcode::p_branch, bld.def(s2));
9933          add_linear_edge(block_idx, break_block);
9934          add_linear_edge(break_block->index, &lc->loop_exit);
9935
9936          Block* continue_block = ctx->program->create_and_insert_block();
9937          continue_block->kind = block_kind_uniform;
9938          bld.reset(continue_block);
9939          bld.branch(aco_opcode::p_branch, bld.def(s2));
9940          add_linear_edge(block_idx, continue_block);
9941          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9942
              /* The logical back-edge is only added when no divergent branch was recorded. */
9943          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9944             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9945          ctx->block = &ctx->program->blocks[block_idx];
9946       } else {
              /* Plain continue: linear+logical edge, or linear only after a divergent branch. */
9947          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9948          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9949             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9950          else
9951             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9952       }
9953
9954       bld.reset(ctx->block);
9955       bld.branch(aco_opcode::p_branch, bld.def(s2));
9956    }
9957
9958    ctx->cf_info.has_branch = false;
9959    ctx->program->next_loop_depth--;
9960
9961    // TODO: if the loop has not a single exit, we must add one °°
9962    /* emit loop successor block */
9963    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
9964    append_logical_start(ctx->block);
9965
     /* Disabled code. NOTE(review): it refers to `loop_entry`, which is not declared anywhere
      * in this function, so it would not compile if enabled as-is. */
9966 #if 0
9967    // TODO: check if it is beneficial to not branch on continues
9968    /* trim linear phis in loop header */
9969    for (auto&& instr : loop_entry->instructions) {
9970       if (instr->opcode == aco_opcode::p_linear_phi) {
9971          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9972          new_phi->definitions[0] = instr->definitions[0];
9973          for (unsigned i = 0; i < new_phi->operands.size(); i++)
9974             new_phi->operands[i] = instr->operands[i];
9975          /* check that the remaining operands are all the same */
9976          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9977             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9978          instr.swap(new_phi);
9979       } else if (instr->opcode == aco_opcode::p_phi) {
9980          continue;
9981       } else {
9982          break;
9983       }
9984    }
9985 #endif
9986
        /* Restore the control-flow state that begin_loop() saved in lc. */
9987    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
9988    ctx->cf_info.parent_loop.exit = lc->exit_old;
9989    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
9990    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
9991    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
        /* Outside of all loops and divergent ifs the discard flag is cleared. */
9992    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
9993       ctx->cf_info.exec_potentially_empty_discard = false;
9994 }
9995
/* Emits a loop break (is_break == true) or continue (is_break == false) for the innermost
 * loop recorded in ctx->cf_info.parent_loop.
 *
 * A logical edge to the loop exit / loop header is always added. Uniform jumps also get the
 * linear edge directly, set cf_info.has_branch and return. Divergent jumps route the linear
 * edge through a helper block and continue code emission in a newly created block, which
 * becomes ctx->block.
 */
9996 void
9997 emit_loop_jump(isel_context* ctx, bool is_break)
9998 {
9999    Builder bld(ctx->program, ctx->block);
10000    Block* logical_target;
10001    append_logical_end(ctx->block);
10002    unsigned idx = ctx->block->index;
10003
10004    if (is_break) {
10005       logical_target = ctx->cf_info.parent_loop.exit;
10006       add_logical_edge(idx, logical_target);
10007       ctx->block->kind |= block_kind_break;
10008
            /* A break is only uniform if neither an enclosing divergent if nor an earlier
             * divergent continue in this loop was recorded. */
10009       if (!ctx->cf_info.parent_if.is_divergent &&
10010           !ctx->cf_info.parent_loop.has_divergent_continue) {
10011          /* uniform break - directly jump out of the loop */
10012          ctx->block->kind |= block_kind_uniform;
10013          ctx->cf_info.has_branch = true;
10014          bld.branch(aco_opcode::p_branch, bld.def(s2));
10015          add_linear_edge(idx, logical_target);
10016          return;
10017       }
10018       ctx->cf_info.parent_loop.has_divergent_branch = true;
10019    } else {
10020       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10021       add_logical_edge(idx, logical_target);
10022       ctx->block->kind |= block_kind_continue;
10023
10024       if (!ctx->cf_info.parent_if.is_divergent) {
10025          /* uniform continue - directly jump to the loop header */
10026          ctx->block->kind |= block_kind_uniform;
10027          ctx->cf_info.has_branch = true;
10028          bld.branch(aco_opcode::p_branch, bld.def(s2));
10029          add_linear_edge(idx, logical_target);
10030          return;
10031       }
10032
10033       /* for potential uniform breaks after this continue,
10034          we must ensure that they are handled correctly */
10035       ctx->cf_info.parent_loop.has_divergent_continue = true;
10036       ctx->cf_info.parent_loop.has_divergent_branch = true;
10037    }
10038
         /* Record (once) the loop depth at which exec may become empty because of this jump. */
10039    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10040       ctx->cf_info.exec_potentially_empty_break = true;
10041       ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10042    }
10043
10044    /* remove critical edges from linear CFG */
10045    bld.branch(aco_opcode::p_branch, bld.def(s2));
10046    Block* break_block = ctx->program->create_and_insert_block();
10047    break_block->kind |= block_kind_uniform;
10048    add_linear_edge(idx, break_block);
10049    /* the loop_header pointer might be invalidated by this point */
10050    if (!is_break)
10051       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10052    add_linear_edge(break_block->index, logical_target);
10053    bld.reset(break_block);
10054    bld.branch(aco_opcode::p_branch, bld.def(s2));
10055
         /* Code following the jump is emitted into a fresh block reached by a linear edge. */
10056    Block* continue_block = ctx->program->create_and_insert_block();
10057    add_linear_edge(idx, continue_block);
10058    append_logical_start(continue_block);
10059    ctx->block = continue_block;
10060 }
10061
/* Emits a break out of the innermost loop; forwards to emit_loop_jump() with is_break = true. */
10062 void
10063 emit_loop_break(isel_context* ctx)
10064 {
10065    emit_loop_jump(ctx, true);
10066 }
10067
/* Emits a continue of the innermost loop; forwards to emit_loop_jump() with is_break = false. */
10068 void
10069 emit_loop_continue(isel_context* ctx)
10070 {
10071    emit_loop_jump(ctx, false);
10072 }
10073
/* Lowers a NIR jump instruction. Only break and continue are supported; any other jump
 * type is reported through isel_err() and aborts.
 */
10074 void
10075 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10076 {
10077    /* Normally visit_block() records this mapping, but divergent jumps update ctx->block. */
10078    ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10079    if (instr->type == nir_jump_break)
10080       return emit_loop_break(ctx);
10081    if (instr->type == nir_jump_continue)
10082       return emit_loop_continue(ctx);
10083    isel_err(&instr->instr, "Unknown NIR jump instr");
10084    abort();
10085 }
10086
/* Selects instructions for every NIR instruction of one NIR block.
 *
 * In a top-level block, all pending linear VGPRs are first ended with p_end_linear_vgpr.
 * Each NIR instruction is then dispatched to its visit_*() handler by type. Note that the
 * handlers may replace ctx->block (see visit_jump()), so ctx->block must not be cached
 * across the loop.
 */
10087 void
10088 visit_block(isel_context* ctx, nir_block* block)
10089 {
10090    if (ctx->block->kind & block_kind_top_level) {
10091       Builder bld(ctx->program, ctx->block);
10092       for (Temp tmp : ctx->unended_linear_vgprs)
10093          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10094       ctx->unended_linear_vgprs.clear();
10095    }
10096
         /* Pre-size the instruction list: room for two ACO instructions per NIR instruction. */
10097    ctx->block->instructions.reserve(ctx->block->instructions.size() +
10098                                     exec_list_length(&block->instr_list) * 2);
10099    nir_foreach_instr (instr, block) {
10100       switch (instr->type) {
10101       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10102       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10103       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10104       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10105       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10106       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10107       case nir_instr_type_deref: break;
10108       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10109       default: isel_err(instr, "Unknown NIR instr type");
10110       }
10111    }
10112
         /* Record which ACO block this NIR block ended in, unless a divergent branch was
          * recorded (visit_jump() stores the mapping itself before emitting a jump). */
10113    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10114       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10115 }
10116
/* Computes the value of a loop-header linear phi at the end of block `last`, walking the
 * blocks first..last in index order and inserting p_linear_phi instructions wherever the
 * linear predecessors of a block carry different values.
 *
 * first       index of the loop header block
 * last        index of the block whose outgoing value is requested
 * header_phi  the loop header phi being fixed up
 * vals        caller-provided scratch array indexed by (block index - first); it must hold
 *             at least (last - first + 1) entries
 *
 * Returns vals[last - first].
 */
10117 static Operand
10118 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10119                      aco_ptr<Instruction>& header_phi, Operand* vals)
10120 {
         /* In the header itself the value is the phi's own definition. */
10121    vals[0] = Operand(header_phi->definitions[0].getTemp());
10122    RegClass rc = vals[0].regClass();
10123
10124    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10125
         /* Index of the next header-phi operand to hand out to a continue block. */
10126    unsigned next_pred = 1;
10127
10128    for (unsigned idx = first + 1; idx <= last; idx++) {
10129       Block& block = ctx->program->blocks[idx];
            /* Blocks at a different loop depth simply inherit the previous block's value. */
10130       if (block.loop_nest_depth != loop_nest_depth) {
10131          vals[idx - first] = vals[idx - 1 - first];
10132          continue;
10133       }
10134
            /* A continue block other than `last` takes the next operand of the header phi. */
10135       if ((block.kind & block_kind_continue) && block.index != last) {
10136          vals[idx - first] = header_phi->operands[next_pred];
10137          next_pred++;
10138          continue;
10139       }
10140
10141       bool all_same = true;
10142       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10143          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10144
10145       Operand val;
10146       if (all_same) {
10147          val = vals[block.linear_preds[0] - first];
10148       } else {
               /* Predecessors disagree: merge their values with a new linear phi in this block. */
10149          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10150             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10151          for (unsigned i = 0; i < block.linear_preds.size(); i++)
10152             phi->operands[i] = vals[block.linear_preds[i] - first];
10153          val = Operand(ctx->program->allocateTmp(rc));
10154          phi->definitions[0] = Definition(val.getTemp());
10155          block.instructions.emplace(block.instructions.begin(), std::move(phi));
10156       }
10157       vals[idx - first] = val;
10158    }
10159
10160    return vals[last - first];
10161 }
10162
/* Forward declarations of the uniform-if helpers so that visit_loop() below can call them. */
10163 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10164 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10165 static void end_uniform_if(isel_context* ctx, if_context* ic);
10166
/* Selects instructions for a NIR loop (loops with a continue construct are not supported).
 *
 * The body is visited between begin_loop() and end_loop(). Before closing the loop, the
 * phis in the loop header are fixed up:
 *  - if the end of the body is unreachable, the last operand of the affected phis is removed;
 *  - if the last NIR block does not fall through to the loop start, the placeholder continue
 *    operand of each linear phi is either removed or replaced via create_continue_phis().
 * A dummy, never-taken break is emitted when the block after the loop has no predecessors.
 */
10167 static void
10168 visit_loop(isel_context* ctx, nir_loop* loop)
10169 {
10170    assert(!nir_loop_has_continue_construct(loop));
10171    loop_context lc;
10172    begin_loop(ctx, &lc);
10173
10174    bool unreachable = visit_cf_list(ctx, &loop->body);
10175
10176    unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10177
10178    /* Fixup phis in loop header from unreachable blocks.
10179     * has_branch/has_divergent_branch also indicates if the loop ends with a
10180     * break/continue instruction, but we don't emit those if unreachable=true */
10181    if (unreachable) {
10182       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10183       bool linear = ctx->cf_info.has_branch;
10184       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10185       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10186          if ((logical && instr->opcode == aco_opcode::p_phi) ||
10187              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10188             /* the last operand should be the one that needs to be removed */
10189             instr->operands.pop_back();
10190          } else if (!is_phi(instr)) {
                  /* Stop at the first non-phi instruction of the header. */
10191             break;
10192          }
10193       }
10194    }
10195
10196    /* Fixup linear phis in loop header from expecting a continue. Both this fixup
10197     * and the previous one shouldn't both happen at once because a break in the
10198     * merge block would get CSE'd */
10199    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
            /* Scratch storage for create_continue_phis(): one entry per block from the loop
             * header up to the current block (unused when has_branch is set). */
10200       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10201       Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10202       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10203          if (instr->opcode == aco_opcode::p_linear_phi) {
10204             if (ctx->cf_info.has_branch)
10205                instr->operands.pop_back();
10206             else
10207                instr->operands.back() =
10208                   create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10209          } else if (!is_phi(instr)) {
10210             break;
10211          }
10212       }
10213    }
10214
10215    /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10216     * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10217     */
10218    if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10219       Builder bld(ctx->program, ctx->block);
            /* The condition is a constant zero copied into scc. */
10220       Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10221       if_context ic;
10222       begin_uniform_if_then(ctx, &ic, cond);
10223       emit_loop_break(ctx);
10224       begin_uniform_if_else(ctx, &ic);
10225       end_uniform_if(ctx, &ic);
10226    }
10227
10228    end_loop(ctx, &lc);
10229 }
10230
/* Starts the "then" side of a divergent if.
 *
 * Terminates the current block with a p_cbranch_z on cond (which must be a lane mask),
 * prepares the invert and endif blocks inside ic, saves the exec_potentially_empty and
 * divergence state into ic and resets it for the branch, then creates the logical then
 * block and makes it the current block.
 *
 * sel_ctrl: when it is nir_selection_control_flatten or
 * nir_selection_control_divergent_always_taken, the branch is flagged with
 * selection_control_remove.
 */
10231 static void
10232 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10233                         nir_selection_control sel_ctrl = nir_selection_control_none)
10234 {
10235    ic->cond = cond;
10236
10237    append_logical_end(ctx->block);
10238    ctx->block->kind |= block_kind_branch;
10239
10240    /* branch to linear then block */
10241    assert(cond.regClass() == ctx->program->lane_mask);
10242    aco_ptr<Pseudo_branch_instruction> branch;
10243    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10244                                                               Format::PSEUDO_BRANCH, 1, 1));
10245    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10246    branch->operands[0] = Operand(cond);
10247    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10248                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10249    ctx->block->instructions.push_back(std::move(branch));
10250
10251    ic->BB_if_idx = ctx->block->index;
10252    ic->BB_invert = Block();
10253    /* Invert blocks are intentionally not marked as top level because they
10254     * are not part of the logical cfg. */
10255    ic->BB_invert.kind |= block_kind_invert;
10256    ic->BB_endif = Block();
10257    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10258
         /* Save the state of the enclosing control flow; it is merged/restored later. */
10259    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10260    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10261    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10262    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10263    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10264    ctx->cf_info.parent_if.is_divergent = true;
10265
10266    /* divergent branches use cbranch_execz */
10267    ctx->cf_info.exec_potentially_empty_discard = false;
10268    ctx->cf_info.exec_potentially_empty_break = false;
10269    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10270
10271    /** emit logical then block */
10272    ctx->program->next_divergent_if_logical_depth++;
10273    Block* BB_then_logical = ctx->program->create_and_insert_block();
10274    add_edge(ic->BB_if_idx, BB_then_logical);
10275    ctx->block = BB_then_logical;
10276    append_logical_start(BB_then_logical);
10277 }
10278
/* Ends the "then" side of a divergent if and starts the "else" side.
 *
 * Terminates the logical then block, emits the linear then block, inserts the invert block
 * prepared in ic (ending it with a branch) and finally creates the logical else block, which
 * becomes the current block. The exec_potentially_empty state gathered in the then side is
 * merged into the saved values in ic and reset for the else side.
 *
 * sel_ctrl: when it is nir_selection_control_flatten or
 * nir_selection_control_divergent_always_taken, the invert block's branch is flagged with
 * selection_control_remove.
 */
10279 static void
10280 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10281                         nir_selection_control sel_ctrl = nir_selection_control_none)
10282 {
10283    Block* BB_then_logical = ctx->block;
10284    append_logical_end(BB_then_logical);
10285    /* branch from logical then block to invert block */
10286    aco_ptr<Pseudo_branch_instruction> branch;
10287    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10288                                                               Format::PSEUDO_BRANCH, 0, 1));
10289    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10290    BB_then_logical->instructions.emplace_back(std::move(branch));
10291    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
         /* The logical edge to the endif block is skipped after a divergent branch. */
10292    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10293       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10294    BB_then_logical->kind |= block_kind_uniform;
10295    assert(!ctx->cf_info.has_branch);
         /* Remember whether the then side had a divergent branch; end_divergent_if() combines
          * it with the else side. */
10296    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10297    ctx->cf_info.parent_loop.has_divergent_branch = false;
10298    ctx->program->next_divergent_if_logical_depth--;
10299
10300    /** emit linear then block */
10301    Block* BB_then_linear = ctx->program->create_and_insert_block();
10302    BB_then_linear->kind |= block_kind_uniform;
10303    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10304    /* branch from linear then block to invert block */
10305    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10306                                                               Format::PSEUDO_BRANCH, 0, 1));
10307    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10308    BB_then_linear->instructions.emplace_back(std::move(branch));
10309    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10310
10311    /** emit invert merge block */
10312    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10313    ic->invert_idx = ctx->block->index;
10314
10315    /* branch to linear else block (skip else) */
10316    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10317                                                               Format::PSEUDO_BRANCH, 0, 1));
10318    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10319    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10320                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10321    ctx->block->instructions.push_back(std::move(branch));
10322
         /* Fold the then side's exec_potentially_empty state into the saved outer state. */
10323    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10324    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10325    ic->exec_potentially_empty_break_depth_old = std::min(
10326       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10327    /* divergent branches use cbranch_execz */
10328    ctx->cf_info.exec_potentially_empty_discard = false;
10329    ctx->cf_info.exec_potentially_empty_break = false;
10330    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10331
         /* Keep the then side's discard flag in ic and start the else side from the value
          * saved before the if. */
10332    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10333    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10334
10335    /** emit logical else block */
10336    ctx->program->next_divergent_if_logical_depth++;
10337    Block* BB_else_logical = ctx->program->create_and_insert_block();
10338    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10339    add_linear_edge(ic->invert_idx, BB_else_logical);
10340    ctx->block = BB_else_logical;
10341    append_logical_start(BB_else_logical);
10342 }
10343
10344 static void
10345 end_divergent_if(isel_context* ctx, if_context* ic)
10346 {
10347    Block* BB_else_logical = ctx->block;
10348    append_logical_end(BB_else_logical);
10349
10350    /* branch from logical else block to endif block */
10351    aco_ptr<Pseudo_branch_instruction> branch;
10352    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10353                                                               Format::PSEUDO_BRANCH, 0, 1));
10354    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10355    BB_else_logical->instructions.emplace_back(std::move(branch));
10356    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10357    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10358       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10359    BB_else_logical->kind |= block_kind_uniform;
10360    ctx->program->next_divergent_if_logical_depth--;
10361
10362    assert(!ctx->cf_info.has_branch);
10363    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10364
10365    /** emit linear else block */
10366    Block* BB_else_linear = ctx->program->create_and_insert_block();
10367    BB_else_linear->kind |= block_kind_uniform;
10368    add_linear_edge(ic->invert_idx, BB_else_linear);
10369
10370    /* branch from linear else block to endif block */
10371    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10372                                                               Format::PSEUDO_BRANCH, 0, 1));
10373    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10374    BB_else_linear->instructions.emplace_back(std::move(branch));
10375    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10376
10377    /** emit endif merge block */
10378    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10379    append_logical_start(ctx->block);
10380
10381    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10382    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10383    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10384    ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10385       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10386    if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10387        !ctx->cf_info.parent_if.is_divergent) {
10388       ctx->cf_info.exec_potentially_empty_break = false;
10389       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10390    }
10391    /* uniform control flow never has an empty exec-mask */
10392    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10393       ctx->cf_info.exec_potentially_empty_discard = false;
10394       ctx->cf_info.exec_potentially_empty_break = false;
10395       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10396    }
10397    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10398 }
10399
10400 static void
10401 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10402 {
10403    assert(cond.regClass() == s1);
10404
10405    append_logical_end(ctx->block);
10406    ctx->block->kind |= block_kind_uniform;
10407
10408    aco_ptr<Pseudo_branch_instruction> branch;
10409    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10410    branch.reset(
10411       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10412    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10413    branch->operands[0] = Operand(cond);
10414    branch->operands[0].setFixed(scc);
10415    ctx->block->instructions.emplace_back(std::move(branch));
10416
10417    ic->BB_if_idx = ctx->block->index;
10418    ic->BB_endif = Block();
10419    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10420
10421    ctx->cf_info.has_branch = false;
10422    ctx->cf_info.parent_loop.has_divergent_branch = false;
10423
10424    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10425
10426    /** emit then block */
10427    ctx->program->next_uniform_if_depth++;
10428    Block* BB_then = ctx->program->create_and_insert_block();
10429    add_edge(ic->BB_if_idx, BB_then);
10430    append_logical_start(BB_then);
10431    ctx->block = BB_then;
10432 }
10433
10434 static void
10435 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10436 {
10437    Block* BB_then = ctx->block;
10438
10439    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10440    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10441
10442    if (!ic->uniform_has_then_branch) {
10443       append_logical_end(BB_then);
10444       /* branch from then block to endif block */
10445       aco_ptr<Pseudo_branch_instruction> branch;
10446       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10447                                                                  Format::PSEUDO_BRANCH, 0, 1));
10448       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10449       BB_then->instructions.emplace_back(std::move(branch));
10450       add_linear_edge(BB_then->index, &ic->BB_endif);
10451       if (!ic->then_branch_divergent)
10452          add_logical_edge(BB_then->index, &ic->BB_endif);
10453       BB_then->kind |= block_kind_uniform;
10454    }
10455
10456    ctx->cf_info.has_branch = false;
10457    ctx->cf_info.parent_loop.has_divergent_branch = false;
10458
10459    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10460    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10461
10462    /** emit else block */
10463    Block* BB_else = ctx->program->create_and_insert_block();
10464    add_edge(ic->BB_if_idx, BB_else);
10465    append_logical_start(BB_else);
10466    ctx->block = BB_else;
10467 }
10468
10469 static void
10470 end_uniform_if(isel_context* ctx, if_context* ic)
10471 {
10472    Block* BB_else = ctx->block;
10473
10474    if (!ctx->cf_info.has_branch) {
10475       append_logical_end(BB_else);
10476       /* branch from then block to endif block */
10477       aco_ptr<Pseudo_branch_instruction> branch;
10478       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10479                                                                  Format::PSEUDO_BRANCH, 0, 1));
10480       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10481       BB_else->instructions.emplace_back(std::move(branch));
10482       add_linear_edge(BB_else->index, &ic->BB_endif);
10483       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10484          add_logical_edge(BB_else->index, &ic->BB_endif);
10485       BB_else->kind |= block_kind_uniform;
10486    }
10487
10488    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10489    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10490    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10491
10492    /** emit endif merge block */
10493    ctx->program->next_uniform_if_depth--;
10494    if (!ctx->cf_info.has_branch) {
10495       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10496       append_logical_start(ctx->block);
10497    }
10498 }
10499
10500 static bool
10501 visit_if(isel_context* ctx, nir_if* if_stmt)
10502 {
10503    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10504    Builder bld(ctx->program, ctx->block);
10505    aco_ptr<Pseudo_branch_instruction> branch;
10506    if_context ic;
10507
10508    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10509       /**
10510        * Uniform conditionals are represented in the following way*) :
10511        *
10512        * The linear and logical CFG:
10513        *                        BB_IF
10514        *                        /    \
10515        *       BB_THEN (logical)      BB_ELSE (logical)
10516        *                        \    /
10517        *                        BB_ENDIF
10518        *
10519        * *) Exceptions may be due to break and continue statements within loops
10520        *    If a break/continue happens within uniform control flow, it branches
10521        *    to the loop exit/entry block. Otherwise, it branches to the next
10522        *    merge block.
10523        **/
10524
10525       assert(cond.regClass() == ctx->program->lane_mask);
10526       cond = bool_to_scalar_condition(ctx, cond);
10527
10528       begin_uniform_if_then(ctx, &ic, cond);
10529       visit_cf_list(ctx, &if_stmt->then_list);
10530
10531       begin_uniform_if_else(ctx, &ic);
10532       visit_cf_list(ctx, &if_stmt->else_list);
10533
10534       end_uniform_if(ctx, &ic);
10535    } else { /* non-uniform condition */
10536       /**
10537        * To maintain a logical and linear CFG without critical edges,
10538        * non-uniform conditionals are represented in the following way*) :
10539        *
10540        * The linear CFG:
10541        *                        BB_IF
10542        *                        /    \
10543        *       BB_THEN (logical)      BB_THEN (linear)
10544        *                        \    /
10545        *                        BB_INVERT (linear)
10546        *                        /    \
10547        *       BB_ELSE (logical)      BB_ELSE (linear)
10548        *                        \    /
10549        *                        BB_ENDIF
10550        *
10551        * The logical CFG:
10552        *                        BB_IF
10553        *                        /    \
10554        *       BB_THEN (logical)      BB_ELSE (logical)
10555        *                        \    /
10556        *                        BB_ENDIF
10557        *
10558        * *) Exceptions may be due to break and continue statements within loops
10559        **/
10560
10561       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10562       visit_cf_list(ctx, &if_stmt->then_list);
10563
10564       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10565       visit_cf_list(ctx, &if_stmt->else_list);
10566
10567       end_divergent_if(ctx, &ic);
10568    }
10569
10570    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10571 }
10572
10573 static bool
10574 visit_cf_list(isel_context* ctx, struct exec_list* list)
10575 {
10576    foreach_list_typed (nir_cf_node, node, node, list) {
10577       switch (node->type) {
10578       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10579       case nir_cf_node_if:
10580          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10581             return true;
10582          break;
10583       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10584       default: unreachable("unimplemented cf list type");
10585       }
10586    }
10587    return false;
10588 }
10589
10590 struct mrt_color_export {
10591    int slot;
10592    unsigned write_mask;
10593    Operand values[4];
10594    uint8_t col_format;
10595
10596    /* Fields below are only used for PS epilogs. */
10597    bool is_int8;
10598    bool is_int10;
10599    bool enable_mrt_output_nan_fixup;
10600 };
10601
10602 static void
10603 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10604 {
10605    Builder bld(ctx->program, ctx->block);
10606
10607    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10608            mrt->enabled_channels, mrt->target, mrt->compr);
10609
10610    ctx->program->has_color_exports = true;
10611 }
10612
10613 static bool
10614 export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out,
10615                     struct aco_export_mrt* mrt)
10616 {
10617    Builder bld(ctx->program, ctx->block);
10618    Operand values[4];
10619
10620    for (unsigned i = 0; i < 4; ++i) {
10621       values[i] = out->values[i];
10622    }
10623
10624    unsigned target;
10625    unsigned enabled_channels = 0;
10626    aco_opcode compr_op = aco_opcode::num_opcodes;
10627    bool compr = false;
10628    bool is_16bit = values[0].regClass() == v2b;
10629
10630    target = V_008DFC_SQ_EXP_MRT + out->slot;
10631
10632    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10633    if (out->enable_mrt_output_nan_fixup && !is_16bit &&
10634        (out->col_format == V_028714_SPI_SHADER_32_R ||
10635         out->col_format == V_028714_SPI_SHADER_32_GR ||
10636         out->col_format == V_028714_SPI_SHADER_32_AR ||
10637         out->col_format == V_028714_SPI_SHADER_32_ABGR ||
10638         out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10639       u_foreach_bit (i, out->write_mask) {
10640          Temp is_not_nan =
10641             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10642          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10643                               is_not_nan);
10644       }
10645    }
10646
10647    switch (out->col_format) {
10648    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10649
10650    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10651
10652    case V_028714_SPI_SHADER_32_AR:
10653       if (ctx->options->gfx_level >= GFX10) {
10654          /* Special case: on GFX10, the outputs are different for 32_AR */
10655          enabled_channels = 0x3;
10656          values[1] = values[3];
10657          values[3] = Operand(v1);
10658       } else {
10659          enabled_channels = 0x9;
10660       }
10661       break;
10662
10663    case V_028714_SPI_SHADER_FP16_ABGR:
10664       for (int i = 0; i < 2; i++) {
10665          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10666          if (enabled) {
10667             enabled_channels |= 0x3 << (i * 2);
10668             if (is_16bit) {
10669                values[i] =
10670                   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10671                              values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10672                              values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10673             } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10674                values[i] =
10675                   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10676                            values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10677                            values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10678             } else {
10679                values[i] =
10680                   bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10681                            values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10682                            values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10683             }
10684          } else {
10685             values[i] = Operand(v1);
10686          }
10687       }
10688       values[2] = Operand(v1);
10689       values[3] = Operand(v1);
10690       compr = true;
10691       break;
10692
10693    case V_028714_SPI_SHADER_UNORM16_ABGR:
10694       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10695          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10696       } else {
10697          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10698       }
10699       break;
10700
10701    case V_028714_SPI_SHADER_SNORM16_ABGR:
10702       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10703          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10704       } else {
10705          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10706       }
10707       break;
10708
10709    case V_028714_SPI_SHADER_UINT16_ABGR:
10710       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10711       if (out->is_int8 || out->is_int10) {
10712          /* clamp */
10713          uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0;
10714
10715          u_foreach_bit (i, out->write_mask) {
10716             uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb;
10717
10718             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10719          }
10720       } else if (is_16bit) {
10721          u_foreach_bit (i, out->write_mask) {
10722             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10723             values[i] = Operand(tmp);
10724          }
10725       }
10726       break;
10727
10728    case V_028714_SPI_SHADER_SINT16_ABGR:
10729       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10730       if (out->is_int8 || out->is_int10) {
10731          /* clamp */
10732          uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0;
10733          uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0;
10734
10735          u_foreach_bit (i, out->write_mask) {
10736             uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb;
10737             uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb;
10738
10739             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10740             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10741          }
10742       } else if (is_16bit) {
10743          u_foreach_bit (i, out->write_mask) {
10744             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10745             values[i] = Operand(tmp);
10746          }
10747       }
10748       break;
10749
10750    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10751
10752    case V_028714_SPI_SHADER_ZERO:
10753    default: return false;
10754    }
10755
10756    if (compr_op != aco_opcode::num_opcodes) {
10757       for (int i = 0; i < 2; i++) {
10758          /* check if at least one of the values to be compressed is enabled */
10759          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10760          if (enabled) {
10761             enabled_channels |= 0x3 << (i * 2);
10762             values[i] = bld.vop3(
10763                compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10764                values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10765          } else {
10766             values[i] = Operand(v1);
10767          }
10768       }
10769       values[2] = Operand(v1);
10770       values[3] = Operand(v1);
10771       compr = true;
10772    } else if (!compr) {
10773       for (int i = 0; i < 4; i++)
10774          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10775    }
10776
10777    if (ctx->program->gfx_level >= GFX11) {
10778       /* GFX11 doesn't use COMPR for exports, but the channel mask should be
10779        * 0x3 instead.
10780        */
10781       enabled_channels = compr ? 0x3 : enabled_channels;
10782       compr = false;
10783    }
10784
10785    for (unsigned i = 0; i < 4; i++)
10786       mrt->out[i] = values[i];
10787    mrt->target = target;
10788    mrt->enabled_channels = enabled_channels;
10789    mrt->compr = compr;
10790
10791    return true;
10792 }
10793
10794 static void
10795 create_fs_null_export(isel_context* ctx)
10796 {
10797    /* FS must always have exports.
10798     * So when there are none, we need to add a null export.
10799     */
10800
10801    Builder bld(ctx->program, ctx->block);
10802    /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
10803    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
10804    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
10805            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
10806
10807    ctx->program->has_color_exports = true;
10808 }
10809
10810 static void
10811 create_fs_jump_to_epilog(isel_context* ctx)
10812 {
10813    Builder bld(ctx->program, ctx->block);
10814    std::vector<Operand> color_exports;
10815    PhysReg exports_start(256); /* VGPR 0 */
10816
10817    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
10818       unsigned color_index = slot - FRAG_RESULT_DATA0;
10819       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
10820       unsigned write_mask = ctx->outputs.mask[slot];
10821
10822       if (!write_mask)
10823          continue;
10824
10825       PhysReg color_start(exports_start.reg() + color_index * 4);
10826
10827       for (unsigned i = 0; i < 4; i++) {
10828          if (!(write_mask & BITFIELD_BIT(i))) {
10829             color_exports.emplace_back(Operand(v1));
10830             continue;
10831          }
10832
10833          PhysReg chan_reg = color_start.advance(i * 4u);
10834          Operand chan(ctx->outputs.temps[slot * 4u + i]);
10835
10836          if (color_type == ACO_TYPE_FLOAT16) {
10837             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
10838          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
10839             bool sign_ext = color_type == ACO_TYPE_INT16;
10840             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
10841             chan = Operand(tmp);
10842          }
10843
10844          chan.setFixed(chan_reg);
10845          color_exports.emplace_back(chan);
10846       }
10847    }
10848
10849    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc));
10850
10851    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
10852       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + color_exports.size(), 0)};
10853    jump->operands[0] = Operand(continue_pc);
10854    for (unsigned i = 0; i < color_exports.size(); i++) {
10855       jump->operands[i + 1] = color_exports[i];
10856    }
10857    ctx->block->instructions.emplace_back(std::move(jump));
10858 }
10859
10860 PhysReg
10861 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
10862 {
10863    assert(arg.used);
10864    enum ac_arg_regfile file = args->args[arg.arg_index].file;
10865    unsigned reg = args->args[arg.arg_index].offset;
10866    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
10867 }
10868
10869 static Operand
10870 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
10871 {
10872    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
10873 }
10874
10875 static Temp
10876 get_tcs_out_current_patch_data_offset(isel_context* ctx)
10877 {
10878    Builder bld(ctx->program, ctx->block);
10879
10880    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 4u;
10881    const unsigned pervertex_output_patch_size =
10882       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10883    const unsigned output_patch_stride =
10884       pervertex_output_patch_size + ctx->program->info.tcs.num_linked_patch_outputs * 4u;
10885
10886    Temp tcs_rel_ids = get_arg(ctx, ctx->args->tcs_rel_ids);
10887    Temp rel_patch_id =
10888       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand::c32(0u), Operand::c32(8u));
10889    Temp patch_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, output_patch_stride, false);
10890
10891    Temp tcs_offchip_layout = get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout);
10892
10893    Temp patch_control_points = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
10894                                         tcs_offchip_layout, Operand::c32(0x3f));
10895
10896    Temp num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10897                                tcs_offchip_layout, Operand::c32(0x60006));
10898
10899    Temp lshs_vertex_stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10900                                       tcs_offchip_layout, Operand::c32(0x8000c));
10901
10902    Temp input_patch_size =
10903       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), patch_control_points, lshs_vertex_stride);
10904
10905    Temp output_patch0_offset =
10906       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, input_patch_size);
10907
10908    Temp output_patch_offset =
10909       bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10910                      Operand::c32(pervertex_output_patch_size), output_patch0_offset);
10911
10912    return bld.nuw().vadd32(bld.def(v1), patch_offset, output_patch_offset);
10913 }
10914
10915 static Temp
10916 get_patch_base(isel_context* ctx)
10917 {
10918    Builder bld(ctx->program, ctx->block);
10919
10920    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 16u;
10921    const unsigned pervertex_output_patch_size =
10922       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10923
10924    Temp num_patches =
10925       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10926                get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout), Operand::c32(0x60006));
10927
10928    return bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches,
10929                    Operand::c32(pervertex_output_patch_size));
10930 }
10931
10932 static void
10933 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
10934 {
10935    struct ac_arg arg;
10936    arg.used = true;
10937
10938    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
10939       regs.emplace_back(get_arg_for_end(ctx, arg));
10940 }
10941
10942 static void
10943 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
10944 {
10945    aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
10946       aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
10947
10948    for (unsigned i = 0; i < regs.size(); i++)
10949       end->operands[i] = regs[i];
10950
10951    ctx->block->instructions.emplace_back(std::move(end));
10952 }
10953
10954 static void
10955 create_tcs_jump_to_epilog(isel_context* ctx)
10956 {
10957    Builder bld(ctx->program, ctx->block);
10958
10959    PhysReg vgpr_start(256); /* VGPR 0 */
10960    PhysReg sgpr_start(0);   /* SGPR 0 */
10961
10962    /* SGPRs */
10963    Operand ring_offsets = Operand(get_arg(ctx, ctx->args->ring_offsets));
10964    ring_offsets.setFixed(sgpr_start);
10965
10966    Operand tess_offchip_offset = Operand(get_arg(ctx, ctx->args->tess_offchip_offset));
10967    tess_offchip_offset.setFixed(sgpr_start.advance(8u));
10968
10969    Operand tcs_factor_offset = Operand(get_arg(ctx, ctx->args->tcs_factor_offset));
10970    tcs_factor_offset.setFixed(sgpr_start.advance(12u));
10971
10972    Operand tcs_offchip_layout = Operand(get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout));
10973    tcs_offchip_layout.setFixed(sgpr_start.advance(16u));
10974
10975    Operand patch_base = Operand(get_patch_base(ctx));
10976    patch_base.setFixed(sgpr_start.advance(20u));
10977
10978    /* VGPRs */
10979    Operand tcs_out_current_patch_data_offset = Operand(get_tcs_out_current_patch_data_offset(ctx));
10980    tcs_out_current_patch_data_offset.setFixed(vgpr_start);
10981
10982    Operand invocation_id =
10983       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
10984                Operand::c32(8u), Operand::c32(5u));
10985    invocation_id.setFixed(vgpr_start.advance(4u));
10986
10987    Operand rel_patch_id =
10988       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
10989                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
10990    rel_patch_id.setFixed(vgpr_start.advance(8u));
10991
10992    Temp continue_pc =
10993       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.tcs.epilog_pc));
10994
10995    aco_ptr<Pseudo_instruction> jump{
10996       create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 9, 0)};
10997    jump->operands[0] = Operand(continue_pc);
10998    jump->operands[1] = ring_offsets;
10999    jump->operands[2] = tess_offchip_offset;
11000    jump->operands[3] = tcs_factor_offset;
11001    jump->operands[4] = tcs_offchip_layout;
11002    jump->operands[5] = patch_base;
11003    jump->operands[6] = tcs_out_current_patch_data_offset;
11004    jump->operands[7] = invocation_id;
11005    jump->operands[8] = rel_patch_id;
11006    ctx->block->instructions.emplace_back(std::move(jump));
11007 }
11008
11009 static void
11010 create_tcs_end_for_epilog(isel_context* ctx)
11011 {
11012    std::vector<Operand> regs;
11013
11014    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11015    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tes_offchip_addr));
11016    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tess_offchip_offset));
11017    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tcs_factor_offset));
11018
11019    Builder bld(ctx->program, ctx->block);
11020
11021    /* Leave a hole corresponding to the two input VGPRs. This ensures that
11022     * the invocation_id output does not alias the tcs_rel_ids input,
11023     * which saves a V_MOV on gfx9.
11024     */
11025    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11026
11027    Temp rel_patch_id =
11028       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11029                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11030    regs.emplace_back(Operand(rel_patch_id, PhysReg{vgpr++}));
11031
11032    Temp invocation_id =
11033       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11034                Operand::c32(8u), Operand::c32(5u));
11035    regs.emplace_back(Operand(invocation_id, PhysReg{vgpr++}));
11036
11037    if (ctx->program->info.tcs.pass_tessfactors_by_reg) {
11038       vgpr++; /* skip the tess factor LDS offset */
11039
11040       unsigned slot = VARYING_SLOT_TESS_LEVEL_OUTER;
11041       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11042          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11043       }
11044       vgpr += 4;
11045
11046       slot = VARYING_SLOT_TESS_LEVEL_INNER;
11047       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11048          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11049       }
11050    } else {
11051       Temp patch0_patch_data_offset =
11052          bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11053                   get_arg(ctx, ctx->program->info.tcs.vs_state_bits), Operand::c32(0xe000a));
11054
11055       Temp tf_lds_offset =
11056          bld.v_mul24_imm(bld.def(v1), rel_patch_id, ctx->program->info.tcs.patch_stride);
11057       tf_lds_offset = bld.nuw().vadd32(bld.def(v1), tf_lds_offset, patch0_patch_data_offset);
11058
11059       regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr}));
11060    }
11061
11062    build_end_with_regs(ctx, regs);
11063 }
11064
11065 Pseudo_instruction*
11066 add_startpgm(struct isel_context* ctx)
11067 {
11068    unsigned def_count = 0;
11069    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11070       if (ctx->args->args[i].skip)
11071          continue;
11072       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11073       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11074          def_count += ctx->args->args[i].size;
11075       else
11076          def_count++;
11077    }
11078
11079    Pseudo_instruction* startpgm =
11080       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11081    ctx->block->instructions.emplace_back(startpgm);
11082    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11083       if (ctx->args->args[i].skip)
11084          continue;
11085
11086       enum ac_arg_regfile file = ctx->args->args[i].file;
11087       unsigned size = ctx->args->args[i].size;
11088       unsigned reg = ctx->args->args[i].offset;
11089       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11090
11091       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11092          Temp elems[16];
11093          for (unsigned j = 0; j < size; j++) {
11094             elems[j] = ctx->program->allocateTmp(s1);
11095             startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11096          }
11097          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11098       } else {
11099          Temp dst = ctx->program->allocateTmp(type);
11100          Definition def(dst);
11101          def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11102          ctx->arg_temps[i] = dst;
11103          startpgm->definitions[arg++] = def;
11104
11105          if (ctx->args->args[i].pending_vmem) {
11106             assert(file == AC_ARG_VGPR);
11107             ctx->program->args_pending_vmem.push_back(def);
11108          }
11109       }
11110    }
11111
11112    /* epilog has no scratch */
11113    if (ctx->args->scratch_offset.used) {
11114       if (ctx->program->gfx_level < GFX9) {
11115          /* Stash these in the program so that they can be accessed later when
11116           * handling spilling.
11117           */
11118          if (ctx->args->ring_offsets.used)
11119             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11120
11121          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11122       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11123          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11124           */
11125          Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
11126          scratch_offset.setLateKill(true);
11127
11128          Operand scratch_addr = ctx->args->ring_offsets.used
11129                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11130                                    : Operand(s2);
11131
11132          Builder bld(ctx->program, ctx->block);
11133          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11134                     scratch_offset);
11135       }
11136    }
11137
11138    return startpgm;
11139 }
11140
11141 void
11142 fix_ls_vgpr_init_bug(isel_context* ctx)
11143 {
11144    Builder bld(ctx->program, ctx->block);
11145    constexpr unsigned hs_idx = 1u;
11146    Builder::Result hs_thread_count =
11147       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11148                get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11149    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11150
11151    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11152
11153    Temp instance_id =
11154       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11155                get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11156    Temp vs_rel_patch_id =
11157       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11158                get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11159    Temp vertex_id =
11160       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11161                get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11162
11163    ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11164    ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11165    ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11166 }
11167
11168 void
11169 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11170 {
11171    /* Split all arguments except for the first (ring_offsets) and the last
11172     * (exec) so that the dead channels don't stay live throughout the program.
11173     */
11174    for (int i = 1; i < startpgm->definitions.size(); i++) {
11175       if (startpgm->definitions[i].regClass().size() > 1) {
11176          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11177                            startpgm->definitions[i].regClass().size());
11178       }
11179    }
11180 }
11181
11182 void
11183 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11184 {
11185    Program* program = ctx->program;
11186
11187    unsigned float_controls = shader->info.float_controls_execution_mode;
11188
11189    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11190       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11191    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11192       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11193                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11194
11195    program->next_fp_mode.must_flush_denorms32 =
11196       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11197    program->next_fp_mode.must_flush_denorms16_64 =
11198       float_controls &
11199       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11200
11201    program->next_fp_mode.care_about_round32 =
11202       float_controls &
11203       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11204
11205    program->next_fp_mode.care_about_round16_64 =
11206       float_controls &
11207       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11208        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11209
11210    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11211     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11212    if (program->next_fp_mode.must_flush_denorms16_64)
11213       program->next_fp_mode.denorm16_64 = 0;
11214    else
11215       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11216
11217    /* preserving fp32 denorms is expensive, so only do it if asked */
11218    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11219       program->next_fp_mode.denorm32 = fp_denorm_keep;
11220    else
11221       program->next_fp_mode.denorm32 = 0;
11222
11223    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11224       program->next_fp_mode.round32 = fp_round_tz;
11225    else
11226       program->next_fp_mode.round32 = fp_round_ne;
11227
11228    if (float_controls &
11229        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11230       program->next_fp_mode.round16_64 = fp_round_tz;
11231    else
11232       program->next_fp_mode.round16_64 = fp_round_ne;
11233
11234    ctx->block->fp_mode = program->next_fp_mode;
11235 }
11236
11237 void
11238 cleanup_cfg(Program* program)
11239 {
11240    /* create linear_succs/logical_succs */
11241    for (Block& BB : program->blocks) {
11242       for (unsigned idx : BB.linear_preds)
11243          program->blocks[idx].linear_succs.emplace_back(BB.index);
11244       for (unsigned idx : BB.logical_preds)
11245          program->blocks[idx].logical_succs.emplace_back(BB.index);
11246    }
11247 }
11248
11249 Temp
11250 lanecount_to_mask(isel_context* ctx, Temp count)
11251 {
11252    assert(count.regClass() == s1);
11253
11254    Builder bld(ctx->program, ctx->block);
11255    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11256    Temp cond;
11257
11258    if (ctx->program->wave_size == 64) {
11259       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11260       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11261                                 Operand::c32(6u /* log2(64) */));
11262       cond =
11263          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11264    } else {
11265       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11266        * the register */
11267       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11268    }
11269
11270    return cond;
11271 }
11272
11273 Temp
11274 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11275 {
11276    Builder bld(ctx->program, ctx->block);
11277
11278    /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
11279    Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11280                        : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11281                                   get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11282
11283    return lanecount_to_mask(ctx, count);
11284 }
11285
11286 static void
11287 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11288 {
11289    append_logical_end(ctx.block);
11290    ctx.block->kind |= block_kind_uniform;
11291
11292    unsigned src_count = ctx.args->arg_count;
11293    Pseudo_instruction* ret =
11294       create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11295    ctx.block->instructions.emplace_back(ret);
11296
11297    for (unsigned i = 0; i < src_count; i++) {
11298       enum ac_arg_regfile file = ctx.args->args[i].file;
11299       unsigned size = ctx.args->args[i].size;
11300       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11301       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11302       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11303                                          : Operand(PhysReg{reg}, type);
11304       ret->operands[i] = op;
11305    }
11306
11307    Builder bld(ctx.program, ctx.block);
11308    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11309 }
11310
11311 void
11312 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11313                   const struct ac_shader_args* args)
11314 {
11315    for (unsigned i = 0; i < shader_count; i++) {
11316       if (i) {
11317          ctx.block = ctx.program->create_and_insert_block();
11318          ctx.block->kind = block_kind_top_level | block_kind_resume;
11319       }
11320
11321       nir_shader* nir = shaders[i];
11322       init_context(&ctx, nir);
11323       setup_fp_mode(&ctx, nir);
11324
11325       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11326       append_logical_start(ctx.block);
11327       split_arguments(&ctx, startpgm);
11328       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11329
11330       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11331        * shader without shader calls.
11332        */
11333       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11334          insert_rt_jump_next(ctx, args);
11335
11336       cleanup_context(&ctx);
11337    }
11338
11339    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11340    cleanup_cfg(ctx.program);
11341 }
11342
11343 void
11344 pops_await_overlapped_waves(isel_context* ctx)
11345 {
11346    ctx->program->has_pops_overlapped_waves_wait = true;
11347
11348    Builder bld(ctx->program, ctx->block);
11349
11350    if (ctx->program->gfx_level >= GFX11) {
11351       /* GFX11+ - waiting for the export from the overlapped waves.
11352        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11353        */
11354       bld.sopp(aco_opcode::s_wait_event, -1, 0);
11355       return;
11356    }
11357
11358    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11359
11360    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11361
11362    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11363    const Temp did_overlap =
11364       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11365    if_context did_overlap_if_context;
11366    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11367    bld.reset(ctx->block);
11368
11369    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11370    if (ctx->program->gfx_level >= GFX10) {
11371       /* 2 packer ID bits on GFX10-10.3. */
11372       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11373                                       collision, Operand::c32(0x2001c));
11374       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11375       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11376                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
11377       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11378    } else {
11379       /* 1 packer ID bit on GFX9. */
11380       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11381                                       collision, Operand::c32(0x1001c));
11382       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11383        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11384        */
11385       const Temp packer_id_hwreg_bits =
11386          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11387       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11388    }
11389
11390    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11391                                              collision, Operand::c32(0xa0010));
11392    if (ctx->program->gfx_level < GFX10) {
11393       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11394        * actual wave ID by 1 in case of wraparound.
11395        */
11396       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11397                                             collision, Operand::c32(0x3ff));
11398       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11399          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11400       newest_overlapped_wave_id =
11401          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11402                   newest_overlapped_wave_id_wrapped);
11403    }
11404
11405    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11406     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11407     * no more than 1023 values behind the current wave ID.
11408     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11409     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11410     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11411     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11412     * `a - (b + 1)` is `a + ~b`.
11413     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11414     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11415     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11416     * expected.
11417     */
11418    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11419                                         collision, Operand::c32(0x3ff));
11420    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11421                                         newest_overlapped_wave_id, wave_id_offset);
11422
11423    /* Await the overlapped waves. */
11424
11425    loop_context wait_loop_context;
11426    begin_loop(ctx, &wait_loop_context);
11427    bld.reset(ctx->block);
11428
11429    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11430                                            bld.def(s1, scc), wave_id_offset);
11431    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11432     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11433     * exited the ordered section.
11434     */
11435    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11436                                                        newest_overlapped_wave_id, exiting_wave_id);
11437    if_context newest_overlapped_wave_exited_if_context;
11438    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11439                          newest_overlapped_wave_exited);
11440    emit_loop_break(ctx);
11441    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11442    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11443    bld.reset(ctx->block);
11444
11445    /* Sleep before rechecking to let overlapped waves run for some time. */
11446    bld.sopp(aco_opcode::s_sleep, -1, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11447
11448    end_loop(ctx, &wait_loop_context);
11449    bld.reset(ctx->block);
11450
11451    /* Indicate the wait has been done to subsequent compilation stages. */
11452    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11453
11454    begin_uniform_if_else(ctx, &did_overlap_if_context);
11455    end_uniform_if(ctx, &did_overlap_if_context);
11456    bld.reset(ctx->block);
11457 }
11458
11459 static void
11460 create_merged_jump_to_epilog(isel_context* ctx)
11461 {
11462    Builder bld(ctx->program, ctx->block);
11463    std::vector<Operand> regs;
11464
11465    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11466       if (!ctx->args->args[i].preserved)
11467          continue;
11468
11469       const enum ac_arg_regfile file = ctx->args->args[i].file;
11470       const unsigned reg = ctx->args->args[i].offset;
11471
11472       Operand op(ctx->arg_temps[i]);
11473       op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11474       regs.emplace_back(op);
11475    }
11476
11477    Temp continue_pc =
11478       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11479
11480    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11481       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11482    jump->operands[0] = Operand(continue_pc);
11483    for (unsigned i = 0; i < regs.size(); i++) {
11484       jump->operands[i + 1] = regs[i];
11485    }
11486    ctx->block->instructions.emplace_back(std::move(jump));
11487 }
11488
11489 void
11490 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_barrier,
11491               if_context* ic_merged_wave_info, const bool check_merged_wave_info,
11492               const bool endif_merged_wave_info)
11493 {
11494    init_context(&ctx, nir);
11495    setup_fp_mode(&ctx, nir);
11496
11497    Program* program = ctx.program;
11498
11499    if (need_startpgm) {
11500       /* Needs to be after init_context() for FS. */
11501       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11502       append_logical_start(ctx.block);
11503
11504       if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11505          fix_ls_vgpr_init_bug(&ctx);
11506
11507       split_arguments(&ctx, startpgm);
11508
11509       if (!program->info.vs.has_prolog &&
11510           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11511          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11512       }
11513    }
11514
11515    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11516        !program->stage.has(SWStage::GS)) {
11517       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11518        * s_sendmsg(GS_ALLOC_REQ).
11519        */
11520       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11521    }
11522
11523    if (check_merged_wave_info) {
11524       const unsigned i =
11525          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11526       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11527       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11528    }
11529
11530    if (need_barrier) {
11531       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11532                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11533                                   ? scope_subgroup
11534                                   : scope_workgroup;
11535
11536       Builder(ctx.program, ctx.block)
11537          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11538                   scope);
11539    }
11540
11541    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11542    visit_cf_list(&ctx, &func->body);
11543
11544    if (ctx.program->info.has_epilog) {
11545       if (ctx.stage == fragment_fs) {
11546          create_fs_jump_to_epilog(&ctx);
11547
11548          /* FS epilogs always have at least one color/null export. */
11549          ctx.program->has_color_exports = true;
11550          ctx.block->kind |= block_kind_export_end;
11551       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
11552          assert(ctx.stage == tess_control_hs || ctx.stage == vertex_tess_control_hs);
11553          if (ctx.options->is_opengl)
11554             create_tcs_end_for_epilog(&ctx);
11555          else
11556             create_tcs_jump_to_epilog(&ctx);
11557       }
11558    }
11559
11560    if (endif_merged_wave_info) {
11561       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11562       end_divergent_if(&ctx, ic_merged_wave_info);
11563    }
11564
11565    if (!ctx.program->info.is_monolithic &&
11566        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11567       assert(program->gfx_level >= GFX9);
11568       create_merged_jump_to_epilog(&ctx);
11569       ctx.block->kind |= block_kind_export_end;
11570    }
11571
11572    cleanup_context(&ctx);
11573 }
11574
11575 void
11576 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
11577 {
11578    if_context ic_merged_wave_info;
11579    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
11580
11581    for (unsigned i = 0; i < shader_count; i++) {
11582       nir_shader* nir = shaders[i];
11583
11584       /* We always need to insert p_startpgm at the beginning of the first shader.  */
11585       const bool need_startpgm = i == 0;
11586
11587       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11588       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11589       const bool empty_shader =
11590          nir_cf_list_is_empty_block(&func->body) &&
11591          ((nir->info.stage == MESA_SHADER_VERTEX &&
11592            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11593           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11594
11595       /* See if we need to emit a check of the merged wave info SGPR. */
11596       const bool check_merged_wave_info =
11597          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11598       const bool endif_merged_wave_info =
11599          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11600
11601       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11602       const bool tcs_skip_barrier =
11603          ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11604
11605       /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
11606       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
11607
11608       select_shader(ctx, nir, need_startpgm, need_barrier, &ic_merged_wave_info,
11609                     check_merged_wave_info, endif_merged_wave_info);
11610
11611       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11612          /* Special handling when TCS input and output patch size is the same.
11613           * Outputs of the previous stage are inputs to the next stage.
11614           */
11615          ctx.inputs = ctx.outputs;
11616          ctx.outputs = shader_io_state();
11617       }
11618    }
11619 }
11620
11621 Temp
11622 get_tess_ring_descriptor(isel_context* ctx, const struct aco_tcs_epilog_info* einfo,
11623                          bool is_tcs_factor_ring)
11624 {
11625    Builder bld(ctx->program, ctx->block);
11626
11627    if (!ctx->options->is_opengl) {
11628       Temp ring_offsets = get_arg(ctx, ctx->args->ring_offsets);
11629       uint32_t tess_ring_offset =
11630          is_tcs_factor_ring ? 5 /* RING_HS_TESS_FACTOR */ : 6 /* RING_HS_TESS_OFFCHIP */;
11631       return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ring_offsets,
11632                       Operand::c32(tess_ring_offset * 16u));
11633    }
11634
11635    Temp addr = get_arg(ctx, einfo->tcs_out_lds_layout);
11636    /* TCS only receives high 13 bits of the address. */
11637    addr = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), addr,
11638                    Operand::c32(0xfff80000));
11639
11640    if (is_tcs_factor_ring) {
11641       addr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr,
11642                       Operand::c32(einfo->tess_offchip_ring_size));
11643    }
11644
11645    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
11646                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
11647
11648    if (ctx->options->gfx_level >= GFX11) {
11649       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
11650                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
11651    } else if (ctx->options->gfx_level >= GFX10) {
11652       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
11653                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
11654    } else {
11655       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
11656                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
11657    }
11658
11659    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr,
11660                      Operand::c32(ctx->options->address32_hi), Operand::c32(0xffffffff),
11661                      Operand::c32(rsrc3));
11662 }
11663
11664 void
11665 store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp factors[],
11666                                unsigned factor_comps, Temp sbase, Temp voffset, Temp num_patches,
11667                                unsigned patch_offset)
11668 {
11669    Builder bld(ctx->program, ctx->block);
11670
11671    Temp soffset = sbase;
11672    if (patch_offset) {
11673       Temp offset =
11674          bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, Operand::c32(patch_offset));
11675       soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), soffset, offset);
11676    }
11677
11678    Temp data = factor_comps == 1
11679                   ? factors[0]
11680                   : create_vec_from_array(ctx, factors, factor_comps, RegType::vgpr, 4);
11681
11682    emit_single_mubuf_store(ctx, tess_ring_desc, voffset, soffset, Temp(), data, 0,
11683                            memory_sync_info(storage_vmem_output), true, false, false);
11684 }
11685
11686 Temp
11687 build_fast_udiv_nuw(isel_context* ctx, Temp num, Temp multiplier, Temp pre_shift, Temp post_shift,
11688                     Temp increment)
11689 {
11690    Builder bld(ctx->program, ctx->block);
11691
11692    num = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), pre_shift, num);
11693    num = bld.nuw().vadd32(bld.def(v1), num, increment);
11694    num = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), num, multiplier);
11695    return bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), post_shift, num);
11696 }
11697
11698 Temp
11699 get_gl_vs_prolog_vertex_index(isel_context* ctx, const struct aco_gl_vs_prolog_info* vinfo,
11700                               unsigned input_index, Temp instance_divisor_constbuf)
11701 {
11702    bool divisor_is_one = vinfo->instance_divisor_is_one & (1u << input_index);
11703    bool divisor_is_fetched = vinfo->instance_divisor_is_fetched & (1u << input_index);
11704
11705    Builder bld(ctx->program, ctx->block);
11706
11707    Temp index;
11708    if (divisor_is_one) {
11709       index = get_arg(ctx, ctx->args->instance_id);
11710    } else if (divisor_is_fetched) {
11711       Temp instance_id = get_arg(ctx, ctx->args->instance_id);
11712
11713       Temp udiv_factors = bld.smem(aco_opcode::s_buffer_load_dwordx4, bld.def(s4),
11714                                    instance_divisor_constbuf, Operand::c32(input_index * 16));
11715       emit_split_vector(ctx, udiv_factors, 4);
11716
11717       index = build_fast_udiv_nuw(ctx, instance_id, emit_extract_vector(ctx, udiv_factors, 0, s1),
11718                                   emit_extract_vector(ctx, udiv_factors, 1, s1),
11719                                   emit_extract_vector(ctx, udiv_factors, 2, s1),
11720                                   emit_extract_vector(ctx, udiv_factors, 3, s1));
11721    }
11722
11723    if (divisor_is_one || divisor_is_fetched) {
11724       Temp start_instance = get_arg(ctx, ctx->args->start_instance);
11725       index = bld.vadd32(bld.def(v1), index, start_instance);
11726    } else {
11727       Temp base_vertex = get_arg(ctx, ctx->args->base_vertex);
11728       Temp vertex_id = get_arg(ctx, ctx->args->vertex_id);
11729       index = bld.vadd32(bld.def(v1), base_vertex, vertex_id);
11730    }
11731
11732    return index;
11733 }
11734
11735 } /* end namespace */
11736
11737 void
11738 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11739                ac_shader_config* config, const struct aco_compiler_options* options,
11740                const struct aco_shader_info* info, const struct ac_shader_args* args)
11741 {
11742    isel_context ctx =
11743       setup_isel_context(program, shader_count, shaders, config, options, info, args);
11744
11745    if (ctx.stage == raytracing_cs)
11746       return select_program_rt(ctx, shader_count, shaders, args);
11747
11748    if (shader_count >= 2) {
11749       select_program_merged(ctx, shader_count, shaders);
11750    } else {
11751       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
11752       if_context ic_merged_wave_info;
11753
11754       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
11755       if (!ctx.program->info.is_monolithic) {
11756          assert(ctx.program->gfx_level >= GFX9);
11757          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
11758             check_merged_wave_info = endif_merged_wave_info = true;
11759          } else {
11760             const bool ngg_gs =
11761                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
11762             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
11763             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
11764             need_barrier = !ngg_gs;
11765          }
11766       }
11767
11768       select_shader(ctx, shaders[0], true, need_barrier, &ic_merged_wave_info,
11769                     check_merged_wave_info, endif_merged_wave_info);
11770    }
11771
11772    program->config->float_mode = program->blocks[0].fp_mode.val;
11773
11774    append_logical_end(ctx.block);
11775    ctx.block->kind |= block_kind_uniform;
11776
11777    if (!ctx.program->info.has_epilog ||
11778        (shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL &&
11779         options->gfx_level >= GFX9)) {
11780       Builder bld(ctx.program, ctx.block);
11781       bld.sopp(aco_opcode::s_endpgm);
11782    }
11783
11784    cleanup_cfg(program);
11785 }
11786
11787 void
11788 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11789                            const struct aco_compiler_options* options,
11790                            const struct aco_shader_info* info, const struct ac_shader_args* args)
11791 {
11792    assert(options->gfx_level == GFX8);
11793
11794    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
11795                 config);
11796
11797    isel_context ctx = {};
11798    ctx.program = program;
11799    ctx.args = args;
11800    ctx.options = options;
11801    ctx.stage = program->stage;
11802
11803    ctx.block = ctx.program->create_and_insert_block();
11804    ctx.block->kind = block_kind_top_level;
11805
11806    program->workgroup_size = 1; /* XXX */
11807
11808    add_startpgm(&ctx);
11809    append_logical_start(ctx.block);
11810
11811    Builder bld(ctx.program, ctx.block);
11812
11813    /* Load the buffer descriptor from TMA. */
11814    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11815             Operand::zero());
11816
11817    /* Store TTMP0-TTMP1. */
11818    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11819             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
11820
11821    uint32_t hw_regs_idx[] = {
11822       2, /* HW_REG_STATUS */
11823       3, /* HW_REG_TRAP_STS */
11824       4, /* HW_REG_HW_ID */
11825       7, /* HW_REG_IB_STS */
11826    };
11827
11828    /* Store some hardware registers. */
11829    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11830       /* "((size - 1) << 11) | register" */
11831       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11832                ((20 - 1) << 11) | hw_regs_idx[i]);
11833
11834       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11835                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11836    }
11837
11838    program->config->float_mode = program->blocks[0].fp_mode.val;
11839
11840    append_logical_end(ctx.block);
11841    ctx.block->kind |= block_kind_uniform;
11842    bld.sopp(aco_opcode::s_endpgm);
11843
11844    cleanup_cfg(program);
11845 }
11846
11847 Operand
11848 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
11849 {
11850    enum ac_arg_regfile file = args->args[arg.arg_index].file;
11851    unsigned size = args->args[arg.arg_index].size;
11852    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11853    return Operand(get_arg_reg(args, arg), rc);
11854 }
11855
11856 unsigned
11857 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11858 {
11859    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11860
11861    unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
11862    if (bld.program->gfx_level >= GFX10 && num_loads > 1)
11863       bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11864
11865    for (unsigned i = 0; i < count;) {
11866       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11867
11868       if (size == 4)
11869          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11870                   Operand::c32((start + i) * 16u));
11871       else if (size == 2)
11872          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11873                   Operand::c32((start + i) * 16u));
11874       else
11875          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11876                   Operand::c32((start + i) * 16u));
11877
11878       dest = dest.advance(size * 16u);
11879       i += size;
11880    }
11881
11882    return count;
11883 }
11884
11885 Operand
11886 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
11887                             const struct aco_vs_prolog_info* pinfo, unsigned index,
11888                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11889                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11890 {
11891    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11892             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
11893
11894    wait_imm lgkm_imm;
11895    lgkm_imm.lgkm = 0;
11896    bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level));
11897
11898    Definition fetch_index_def(tmp_vgpr0, v1);
11899    Operand fetch_index(tmp_vgpr0, v1);
11900
11901    Operand div_info(tmp_sgpr, s1);
11902    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
11903       /* use SDWA */
11904       if (bld.program->gfx_level < GFX9) {
11905          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
11906          div_info = Operand(tmp_vgpr1, v1);
11907       }
11908
11909       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11910
11911       Instruction* instr;
11912       if (bld.program->gfx_level >= GFX9)
11913          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
11914       else
11915          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
11916                                div_info, fetch_index)
11917                     .instr;
11918       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
11919
11920       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
11921                fetch_index);
11922
11923       instr =
11924          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
11925       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
11926    } else {
11927       Operand tmp_op(tmp_vgpr1, v1);
11928       Definition tmp_def(tmp_vgpr1, v1);
11929
11930       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11931
11932       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
11933       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
11934
11935       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
11936                Operand(tmp_sgpr.advance(4), s1));
11937
11938       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
11939       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
11940    }
11941
11942    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
11943
11944    return fetch_index;
11945 }
11946
11947 void
11948 select_rt_prolog(Program* program, ac_shader_config* config,
11949                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
11950                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
11951 {
11952    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
11953                 config);
11954    Block* block = program->create_and_insert_block();
11955    block->kind = block_kind_top_level;
11956    program->workgroup_size = info->workgroup_size;
11957    program->wave_size = info->workgroup_size;
11958    calc_min_waves(program);
11959    Builder bld(program, block);
11960    block->instructions.reserve(32);
11961    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
11962    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
11963
11964    /* Inputs:
11965     * Ring offsets:                s[0-1]
11966     * Indirect descriptor sets:    s[2]
11967     * Push constants pointer:      s[3]
11968     * SBT descriptors:             s[4-5]
11969     * Traversal shader address:    s[6-7]
11970     * Ray launch size address:     s[8-9]
11971     * Dynamic callable stack base: s[10]
11972     * Workgroup IDs (xyz):         s[11], s[12], s[13]
11973     * Scratch offset:              s[14]
11974     * Local invocation IDs:        v[0-2]
11975     */
11976    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
11977    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
11978    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
11979    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
11980    PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
11981    PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
11982    PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
11983    PhysReg in_scratch_offset;
11984    if (options->gfx_level < GFX11)
11985       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
11986    PhysReg in_local_ids[2] = {
11987       get_arg_reg(in_args, in_args->local_invocation_ids),
11988       get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
11989    };
11990
11991    /* Outputs:
11992     * Callee shader PC:            s[0-1]
11993     * Indirect descriptor sets:    s[2]
11994     * Push constants pointer:      s[3]
11995     * SBT descriptors:             s[4-5]
11996     * Traversal shader address:    s[6-7]
11997     * Ray launch sizes (xyz):      s[8], s[9], s[10]
11998     * Scratch offset (<GFX9 only): s[11]
11999     * Ring offsets (<GFX9 only):   s[12-13]
12000     * Ray launch IDs:              v[0-2]
12001     * Stack pointer:               v[3]
12002     * Shader VA:                   v[4-5]
12003     * Shader Record Ptr:           v[6-7]
12004     */
12005    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12006    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
12007    PhysReg out_launch_size_z = out_launch_size_x.advance(8);
12008    PhysReg out_launch_ids[3];
12009    for (unsigned i = 0; i < 3; i++)
12010       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
12011    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12012    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12013
12014    /* Temporaries: */
12015    num_sgprs = align(num_sgprs, 2) + 4;
12016    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs - 4};
12017    PhysReg tmp_ring_offsets = PhysReg{num_sgprs - 2};
12018
12019    /* Confirm some assumptions about register aliasing */
12020    assert(in_ring_offsets == out_uniform_shader_addr);
12021    assert(get_arg_reg(in_args, in_args->push_constants) ==
12022           get_arg_reg(out_args, out_args->push_constants));
12023    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12024           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12025    assert(in_launch_size_addr == out_launch_size_x);
12026    assert(in_stack_base == out_launch_size_z);
12027    assert(in_local_ids[0] == out_launch_ids[0]);
12028
12029    /* load raygen sbt */
12030    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12031             Operand::c32(0u));
12032
12033    /* init scratch */
12034    if (options->gfx_level < GFX9) {
12035       /* copy ring offsets to temporary location*/
12036       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12037                Operand(in_ring_offsets, s2));
12038    } else if (options->gfx_level < GFX11) {
12039       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12040                       Operand(in_scratch_offset, s1));
12041    }
12042
12043    /* set stack ptr */
12044    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12045
12046    /* load raygen address */
12047    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12048             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12049
12050    /* load ray launch sizes */
12051    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12052             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12053    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12054             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12055
12056    /* calculate ray launch ids */
12057    if (options->gfx_level >= GFX11) {
12058       /* Thread IDs are packed in VGPR0, 10 bits per component. */
12059       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12060                Operand::c32(10u), Operand::c32(3u));
12061       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12062                Operand(in_local_ids[0], v1));
12063    }
12064    /* Do this backwards to reduce some RAW hazards on GFX11+ */
12065    bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12066    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1),
12067             Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1));
12068    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12069             Operand::c32(8), Operand(in_local_ids[0], v1));
12070
12071    if (options->gfx_level < GFX9) {
12072       /* write scratch/ring offsets to outputs, if needed */
12073       bld.sop1(aco_opcode::s_mov_b32,
12074                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12075                Operand(in_scratch_offset, s1));
12076       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12077                Operand(tmp_ring_offsets, s2));
12078    }
12079
12080    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12081    if (options->gfx_level < GFX9) {
12082       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12083                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12084    } else {
12085       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12086                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12087    }
12088    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12089             Operand(tmp_raygen_sbt.advance(4), s1));
12090
12091    /* jump to raygen */
12092    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12093
12094    program->config->float_mode = program->blocks[0].fp_mode.val;
12095    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12096    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12097 }
12098
12099 void
12100 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12101                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12102                  const struct ac_shader_args* args)
12103 {
12104    assert(pinfo->num_attributes > 0);
12105
12106    /* This should be enough for any shader/stage. */
12107    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12108
12109    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12110                 config);
12111    program->dev.vgpr_limit = 256;
12112
12113    Block* block = program->create_and_insert_block();
12114    block->kind = block_kind_top_level;
12115
12116    program->workgroup_size = 64;
12117    calc_min_waves(program);
12118
12119    Builder bld(program, block);
12120
12121    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12122
12123    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12124
12125    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12126    bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask;
12127
12128    wait_imm lgkm_imm;
12129    lgkm_imm.lgkm = 0;
12130
12131    /* choose sgprs */
12132    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12133    PhysReg prolog_input = vertex_buffers.advance(8);
12134    PhysReg desc(
12135       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12136
12137    Operand start_instance = get_arg_fixed(args, args->start_instance);
12138    Operand instance_id = get_arg_fixed(args, args->instance_id);
12139
12140    PhysReg attributes_start(256 + args->num_vgprs_used);
12141    /* choose vgprs that won't be used for anything else until the last attribute load */
12142    PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
12143    PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
12144    PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
12145    PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
12146    PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
12147
12148    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12149             get_arg_fixed(args, args->vertex_buffers));
12150    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
12151       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12152                options->address32_hi & 0xFFFF);
12153    } else {
12154       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12155                Operand::c32((unsigned)options->address32_hi));
12156    }
12157
12158    /* calculate vgpr requirements */
12159    unsigned num_vgprs = attributes_start.reg() - 256;
12160    num_vgprs += pinfo->num_attributes * 4;
12161    if (has_nontrivial_divisors && program->gfx_level <= GFX8)
12162       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
12163    unsigned num_sgprs = 0;
12164
12165    const struct ac_vtx_format_info* vtx_info_table =
12166       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12167
12168    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12169       unsigned num_descs =
12170          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12171       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12172
12173       if (loc == 0) {
12174          /* perform setup while we load the descriptors */
12175          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12176             Operand count = get_arg_fixed(args, args->merged_wave_info);
12177             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
12178             if (program->wave_size == 64) {
12179                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12180                         Operand::c32(6u /* log2(64) */));
12181                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12182                         Operand(exec, s2), Operand(scc, s1));
12183             }
12184          }
12185
12186          bool needs_instance_index = false;
12187          bool needs_start_instance = false;
12188          u_foreach_bit (i, pinfo->state.instance_rate_inputs & attrib_mask) {
12189             needs_instance_index |= pinfo->state.divisors[i] == 1;
12190             needs_start_instance |= pinfo->state.divisors[i] == 0;
12191          }
12192          bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask;
12193          if (needs_vertex_index)
12194             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12195                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12196          if (needs_instance_index)
12197             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12198                        Operand(s2), true);
12199          if (needs_start_instance)
12200             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12201       }
12202
12203       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12204
12205       for (unsigned i = 0; i < num_descs;) {
12206          PhysReg dest(attributes_start.reg() + loc * 4u);
12207
12208          /* calculate index */
12209          Operand fetch_index = Operand(vertex_index, v1);
12210          if (pinfo->state.instance_rate_inputs & (1u << loc)) {
12211             uint32_t divisor = pinfo->state.divisors[loc];
12212             if (divisor) {
12213                fetch_index = instance_id;
12214                if (pinfo->state.nontrivial_divisors & (1u << loc)) {
12215                   unsigned index =
12216                      util_bitcount(pinfo->state.nontrivial_divisors & BITFIELD_MASK(loc));
12217                   fetch_index = calc_nontrivial_instance_id(
12218                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12219                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12220                } else {
12221                   fetch_index = Operand(instance_index, v1);
12222                }
12223             } else {
12224                fetch_index = Operand(start_instance_vgpr, v1);
12225             }
12226          }
12227
12228          /* perform load */
12229          PhysReg cur_desc = desc.advance(i * 16);
12230          if ((pinfo->misaligned_mask & (1u << loc))) {
12231             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->state.formats[loc]];
12232
12233             assert(vtx_info->has_hw_format & 0x1);
12234             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12235             unsigned nfmt = vtx_info->hw_format[0] >> 4;
12236
12237             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12238                bool post_shuffle = pinfo->state.post_shuffle & (1u << loc);
12239                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12240
12241                /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec
12242                 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12243                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12244                 * care).
12245                 */
12246                if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
12247                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12248                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12249                             false, true);
12250                else if (vtx_info->chan_byte_size == 8)
12251                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12252                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12253                             fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
12254                else
12255                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12256                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
12257                             offset, false, true);
12258             }
12259             uint32_t one =
12260                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12261                   ? 1u
12262                   : 0x3f800000u;
12263             /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
12264              * For 64-bit data types, no default attribute values are provided. Input variables must
12265              * not use more components than provided by the attribute.
12266              */
12267             for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
12268                bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12269                         Operand::c32(j == 3 ? one : 0u));
12270             }
12271
12272             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12273             loc += slots;
12274             i += slots;
12275          } else {
12276             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12277                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12278             loc++;
12279             i++;
12280          }
12281       }
12282    }
12283
12284    if (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi) {
12285       wait_imm vm_imm;
12286       vm_imm.vm = 0;
12287       bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level));
12288    }
12289
12290    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
12291     * so we may need to fix it up. */
12292    u_foreach_bit (loc, (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi)) {
12293       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12294
12295       unsigned alpha_adjust = (pinfo->state.alpha_adjust_lo >> loc) & 0x1;
12296       alpha_adjust |= ((pinfo->state.alpha_adjust_hi >> loc) & 0x1) << 1;
12297
12298       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
12299          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12300
12301       /* For the integer-like cases, do a natural sign extension.
12302        *
12303        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12304        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12305        * exponent.
12306        */
12307       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
12308       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12309                Operand::c32(offset), Operand::c32(2u));
12310
12311       /* Convert back to the right type. */
12312       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
12313          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12314          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12315                   Operand(alpha, v1));
12316       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
12317          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12318       }
12319    }
12320
12321    block->kind |= block_kind_uniform;
12322
12323    /* continue on to the main shader */
12324    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
12325    if (has_nontrivial_divisors) {
12326       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12327                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
12328       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12329       continue_pc = Operand(prolog_input, s2);
12330    }
12331
12332    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12333
12334    program->config->float_mode = program->blocks[0].fp_mode.val;
12335    /* addition on GFX6-8 requires a carry-out (we use VCC) */
12336    program->needs_vcc = program->gfx_level <= GFX8;
12337    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
12338    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12339 }
12340
12341 void
12342 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
12343                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12344                  const struct ac_shader_args* args)
12345 {
12346    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
12347    isel_context ctx =
12348       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
12349
12350    ctx.block->fp_mode = program->next_fp_mode;
12351
12352    add_startpgm(&ctx);
12353    append_logical_start(ctx.block);
12354
12355    Builder bld(ctx.program, ctx.block);
12356
12357    /* Export all color render targets */
12358    struct aco_export_mrt mrts[8];
12359    uint8_t exported_mrts = 0;
12360
12361    for (unsigned i = 0; i < 8; i++) {
12362       unsigned col_format = (einfo->spi_shader_col_format >> (i * 4)) & 0xf;
12363
12364       if (col_format == V_028714_SPI_SHADER_ZERO)
12365          continue;
12366
12367       struct mrt_color_export out;
12368
12369       out.slot = i;
12370       out.write_mask = 0xf;
12371       out.col_format = col_format;
12372       out.is_int8 = (einfo->color_is_int8 >> i) & 1;
12373       out.is_int10 = (einfo->color_is_int10 >> i) & 1;
12374       out.enable_mrt_output_nan_fixup = (options->enable_mrt_output_nan_fixup >> i) & 1;
12375
12376       Temp inputs = get_arg(&ctx, einfo->inputs[i]);
12377       emit_split_vector(&ctx, inputs, 4);
12378       for (unsigned c = 0; c < 4; ++c) {
12379          out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
12380       }
12381
12382       if (export_fs_mrt_color(&ctx, &out, &mrts[i])) {
12383          exported_mrts |= 1 << i;
12384       }
12385    }
12386
12387    if (exported_mrts) {
12388       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
12389          struct aco_export_mrt* mrt0 = (exported_mrts & BITFIELD_BIT(0)) ? &mrts[0] : NULL;
12390          struct aco_export_mrt* mrt1 = (exported_mrts & BITFIELD_BIT(1)) ? &mrts[1] : NULL;
12391          create_fs_dual_src_export_gfx11(&ctx, mrt0, mrt1);
12392       } else {
12393          u_foreach_bit (i, exported_mrts) {
12394             export_mrt(&ctx, &mrts[i]);
12395          }
12396       }
12397    } else {
12398       create_fs_null_export(&ctx);
12399    }
12400
12401    program->config->float_mode = program->blocks[0].fp_mode.val;
12402
12403    append_logical_end(ctx.block);
12404    ctx.block->kind |= block_kind_export_end;
12405    bld.reset(ctx.block);
12406    bld.sopp(aco_opcode::s_endpgm);
12407
12408    cleanup_cfg(program);
12409 }
12410
12411 void
12412 select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
12413                   const struct aco_compiler_options* options, const struct aco_shader_info* info,
12414                   const struct ac_shader_args* args)
12415 {
         /* Emit the tessellation control shader (TCS) epilog into `program`.
          *
          * `pinfo` is interpreted as a struct aco_tcs_epilog_info. The emitted code
          * places its whole body in a divergent branch taken when the invocation_id
          * argument equals zero. Inside that branch it obtains the outer/inner tess
          * factors (from shader arguments or from LDS), stores them through the
          * descriptor returned by get_tess_ring_descriptor(..., true) and, when
          * einfo->tes_reads_tessfactors is set, also through the descriptor returned
          * by get_tess_ring_descriptor(..., false). The program ends with s_endpgm.
          */
12416    const struct aco_tcs_epilog_info* einfo = (const struct aco_tcs_epilog_info*)pinfo;
12417    isel_context ctx =
12418       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::TCS);
12419
12420    ctx.block->fp_mode = program->next_fp_mode;
12421
12422    add_startpgm(&ctx);
12423    append_logical_start(ctx.block);
12424
12425    Builder bld(ctx.program, ctx.block);
12426
12427    /* Add a barrier before loading tess factors from LDS. */
12428    if (!einfo->pass_tessfactors_by_reg) {
12429       /* So that s_waitcnt lgkmcnt(0) is generated during waitcnt insertion. */
12430       program->pending_lds_access = true;
12431
            /* Barrier scope is the subgroup when einfo->tcs_out_patch_fits_subgroup is
             * set, otherwise the workgroup. */
12432       sync_scope scope = einfo->tcs_out_patch_fits_subgroup ? scope_subgroup : scope_workgroup;
12433       bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
12434                   scope);
12435    }
12436
12437    Temp invocation_id = get_arg(&ctx, einfo->invocation_id);
12438
         /* Everything up to the matching end_divergent_if() near the end of this
          * function is emitted in the branch taken when invocation_id == 0. */
12439    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), invocation_id);
12440
12441    if_context ic_invoc_0;
12442    begin_divergent_if_then(&ctx, &ic_invoc_0, cond);
12443
         /* Number of outer/inner tess factor components for each primitive mode. */
12444    int outer_comps, inner_comps;
12445    switch (einfo->primitive_mode) {
12446    case TESS_PRIMITIVE_ISOLINES:
12447       outer_comps = 2;
12448       inner_comps = 0;
12449       break;
12450    case TESS_PRIMITIVE_TRIANGLES:
12451       outer_comps = 3;
12452       inner_comps = 1;
12453       break;
12454    case TESS_PRIMITIVE_QUADS:
12455       outer_comps = 4;
12456       inner_comps = 2;
12457       break;
12458    default: unreachable("invalid primitive mode"); return;
12459    }
12460
         /* Re-target the builder at ctx.block; presumably begin_divergent_if_then()
          * switched the current block (TODO confirm against its definition). */
12461    bld.reset(ctx.block);
12462
         /* Unique patch-output indices of the tess levels, scaled by 16
          * (presumably the byte size of one vec4 slot - TODO confirm). */
12463    unsigned tess_lvl_out_loc =
12464       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER) * 16;
12465    unsigned tess_lvl_in_loc =
12466       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER) * 16;
12467
         /* Only the first outer_comps / inner_comps entries are filled in below. */
12468    Temp outer[4];
12469    Temp inner[2];
12470    if (einfo->pass_tessfactors_by_reg) {
            /* The tess factors arrive as shader arguments. */
12471       for (int i = 0; i < outer_comps; i++)
12472          outer[i] = get_arg(&ctx, einfo->tess_lvl_out[i]);
12473
12474       for (int i = 0; i < inner_comps; i++)
12475          inner[i] = get_arg(&ctx, einfo->tess_lvl_in[i]);
12476    } else {
            /* The tess factors are loaded from LDS. The patch data offset argument is
             * shifted left by 2 first (presumably dwords -> bytes - TODO confirm). */
12477       Temp addr = get_arg(&ctx, einfo->tcs_out_current_patch_data_offset);
12478       addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr);
12479
12480       Temp data = program->allocateTmp(RegClass(RegType::vgpr, outer_comps));
12481       load_lds(&ctx, 4, outer_comps, data, addr, tess_lvl_out_loc, 4);
12482       for (int i = 0; i < outer_comps; i++)
12483          outer[i] = emit_extract_vector(&ctx, data, i, v1);
12484
            /* Isolines have no inner factors, so the second load is skipped for them. */
12485       if (inner_comps) {
12486          data = program->allocateTmp(RegClass(RegType::vgpr, inner_comps));
12487          load_lds(&ctx, 4, inner_comps, data, addr, tess_lvl_in_loc, 4);
12488          for (int i = 0; i < inner_comps; i++)
12489             inner[i] = emit_extract_vector(&ctx, data, i, v1);
12490       }
12491    }
12492
12493    Temp tess_factor_ring_desc = get_tess_ring_descriptor(&ctx, einfo, true);
12494    Temp tess_factor_ring_base = get_arg(&ctx, args->tcs_factor_offset);
12495    Temp rel_patch_id = get_arg(&ctx, einfo->rel_patch_id);
12496    unsigned tess_factor_ring_const_offset = 0;
12497
12498    if (program->gfx_level <= GFX8) {
12499       /* Store the dynamic HS control word. */
            /* Emitted for GFX8 and older only: in a nested divergent branch taken when
             * rel_patch_id == 0, the value 0x80000000 is stored at constant offset 0.
             * The tess factor stores below then use a constant offset that is 4 larger. */
12500       cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), rel_patch_id);
12501
12502       if_context ic_patch_0;
12503       begin_divergent_if_then(&ctx, &ic_patch_0, cond);
12504
12505       bld.reset(ctx.block);
12506
12507       Temp data = bld.copy(bld.def(v1), Operand::c32(0x80000000u));
12508
12509       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, Temp(0, v1), tess_factor_ring_base,
12510                               Temp(), data, 0, memory_sync_info(), true, false, false);
12511
12512       tess_factor_ring_const_offset += 4;
12513
            /* This function emits nothing between the else and the end of the branch. */
12514       begin_divergent_if_else(&ctx, &ic_patch_0);
12515       end_divergent_if(&ctx, &ic_patch_0);
12516    }
12517
12518    bld.reset(ctx.block);
12519
         /* Per-patch offset: rel_patch_id * ((inner_comps + outer_comps) * 4). */
12520    Temp tess_factor_ring_offset =
12521       bld.v_mul_imm(bld.def(v1), rel_patch_id, (inner_comps + outer_comps) * 4, false);
12522
12523    switch (einfo->primitive_mode) {
12524    case TESS_PRIMITIVE_ISOLINES: {
12525       /* For isolines, the hardware expects tess factors in the reverse order. */
12526       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), outer[1], outer[0]);
12527       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12528                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12529                               memory_sync_info(), true, false, false);
12530       break;
12531    }
12532    case TESS_PRIMITIVE_TRIANGLES: {
            /* Three outer factors followed by the single inner factor, written with
             * one 4-component store. */
12533       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12534                              inner[0]);
12535       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12536                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12537                               memory_sync_info(), true, false, false);
12538       break;
12539    }
12540    case TESS_PRIMITIVE_QUADS: {
            /* The four outer factors are stored first; the two inner factors follow
             * with a separate store at constant offset +16. */
12541       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12542                              outer[3]);
12543       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12544                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12545                               memory_sync_info(), true, false, false);
12546
12547       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inner[0], inner[1]);
12548       emit_single_mubuf_store(
12549          &ctx, tess_factor_ring_desc, tess_factor_ring_offset, tess_factor_ring_base, Temp(), data,
12550          tess_factor_ring_const_offset + 16, memory_sync_info(), true, false, false);
12551       break;
12552    }
12553    default: unreachable("invalid primitive mode"); break;
12554    }
12555
         /* Optionally store the tess factors a second time, through the descriptor
          * returned by get_tess_ring_descriptor(..., false) (presumably so that the
          * TES can load them - TODO confirm). */
12556    if (einfo->tes_reads_tessfactors) {
12557       Temp layout = get_arg(&ctx, einfo->tcs_offchip_layout);
12558       Temp num_patches, patch_base;
12559
12560       if (ctx.options->is_opengl) {
               /* num_patches = (layout & 0x3f) + 1; patch_base = layout >> 16. */
12561          num_patches = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), layout,
12562                                 Operand::c32(0x3f));
12563          num_patches = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), num_patches,
12564                                 Operand::c32(1));
12565
12566          patch_base = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), layout,
12567                                Operand::c32(16));
12568       } else {
               /* s_bfe_u32 takes the field offset in bits [5:0] and the field width in
                * bits [22:16] of its second operand, so 0x60006 extracts the 6-bit
                * field starting at bit 6 of layout. patch_base is a shader argument. */
12569          num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), layout,
12570                                 Operand::c32(0x60006));
12571
12572          patch_base = get_arg(&ctx, einfo->patch_base);
12573       }
12574
12575       Temp tess_ring_desc = get_tess_ring_descriptor(&ctx, einfo, false);
12576       Temp tess_ring_base = get_arg(&ctx, args->tess_offchip_offset);
12577
            /* Scalar base = tess_ring_base + patch_base. */
12578       Temp sbase =
12579          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), tess_ring_base, patch_base);
12580
            /* Vector offset = rel_patch_id << 4, i.e. rel_patch_id * 16. */
12581       Temp voffset =
12582          bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4), rel_patch_id);
12583
12584       store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, outer, outer_comps, sbase, voffset,
12585                                      num_patches, tess_lvl_out_loc);
12586
12587       if (inner_comps) {
12588          store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, inner, inner_comps, sbase, voffset,
12589                                         num_patches, tess_lvl_in_loc);
12590       }
12591    }
12592
         /* Close the invocation_id == 0 branch; this function emits nothing between
          * the else and the end of the branch. */
12593    begin_divergent_if_else(&ctx, &ic_invoc_0);
12594    end_divergent_if(&ctx, &ic_invoc_0);
12595
         /* The fp_mode value of the first block becomes the configured float mode. */
12596    program->config->float_mode = program->blocks[0].fp_mode.val;
12597
12598    append_logical_end(ctx.block);
12599
12600    bld.reset(ctx.block);
12601    bld.sopp(aco_opcode::s_endpgm);
12602
12603    cleanup_cfg(program);
12604 }
12605
12606 void
12607 select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config,
12608                     const struct aco_compiler_options* options, const struct aco_shader_info* info,
12609                     const struct ac_shader_args* args)
12610 {
         /* Emit the OpenGL vertex shader prolog into `program`.
          *
          * `pinfo` is interpreted as a struct aco_gl_vs_prolog_info. The operands
          * gathered by passthrough_all_args() are collected in `regs`, one vertex
          * index per vertex input (vinfo->num_inputs) is computed and appended to
          * `regs` with a fixed physical register, and the program is finished by
          * build_end_with_regs() with that operand list.
          */
12611    const struct aco_gl_vs_prolog_info* vinfo = (const struct aco_gl_vs_prolog_info*)pinfo;
12612    isel_context ctx =
12613       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::VS);
12614
12615    ctx.block->fp_mode = program->next_fp_mode;
12616
12617    add_startpgm(&ctx);
12618    append_logical_start(ctx.block);
12619
12620    Builder bld(ctx.program, ctx.block);
12621
         /* Emit s_setprio with immediate 0x3. */
12622    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12623
         /* The workaround is only applied when the prolog is built for a shader
          * flagged as_ls and the options report the LS VGPR init bug. */
12624    if (vinfo->as_ls && options->has_ls_vgpr_init_bug)
12625       fix_ls_vgpr_init_bug(&ctx);
12626
12627    std::vector<Operand> regs;
12628    passthrough_all_args(&ctx, regs);
12629
         /* Stays default-constructed unless vinfo->instance_divisor_is_fetched is set. */
12630    Temp instance_divisor_constbuf;
12631
12632    if (vinfo->instance_divisor_is_fetched) {
            /* Load four dwords (s4) from the internal bindings list at
             * vinfo->instance_diviser_buf_offset; the result is handed to
             * get_gl_vs_prolog_vertex_index() below. */
12633       Temp list = get_arg(&ctx, vinfo->internal_bindings);
12634       list = convert_pointer_to_64_bit(&ctx, list);
12635
12636       instance_divisor_constbuf = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12637                                            Operand::c32(vinfo->instance_diviser_buf_offset));
12638    }
12639
         /* Physical register of the first computed index: 256 plus the number of
          * VGPRs used by the arguments (presumably PhysReg 256 corresponds to v0, so
          * the indices land right after the argument VGPRs - TODO confirm). */
12640    unsigned vgpr = 256 + ctx.args->num_vgprs_used;
12641
         /* Input i gets its vertex index in physical register vgpr + i. */
12642    for (unsigned i = 0; i < vinfo->num_inputs; i++) {
12643       Temp index = get_gl_vs_prolog_vertex_index(&ctx, vinfo, i, instance_divisor_constbuf);
12644       regs.emplace_back(Operand(index, PhysReg{vgpr + i}));
12645    }
12646
         /* The fp_mode value of the first block becomes the configured float mode. */
12647    program->config->float_mode = program->blocks[0].fp_mode.val;
12648
12649    append_logical_end(ctx.block);
12650
12651    build_end_with_regs(&ctx, regs);
12652
12653    cleanup_cfg(program);
12654 }
12655
12656 } // namespace aco