src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include "aco_instruction_selection.h"
  27
  28 #include "aco_builder.h"
  29 #include "aco_interface.h"
  30 #include "aco_ir.h"
  31
  32 #include "common/ac_nir.h"
  33 #include "common/sid.h"
  34
  35 #include "util/fast_idiv_by_const.h"
  36 #include "util/memstream.h"
  37
  38 #include <array>
  39 #include <functional>
  40 #include <map>
  41 #include <numeric>
  42 #include <stack>
  43 #include <utility>
  44 #include <vector>
  45
  46 namespace aco {
  47 namespace {
  48
  49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
  50
  51 static void
  52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
  53           const char* msg)
  54 {
  55    char* out;
  56    size_t outsize;
  57    struct u_memstream mem;
  58    u_memstream_open(&mem, &out, &outsize);
  59    FILE* const memf = u_memstream_get(&mem);
  60
  61    fprintf(memf, "%s: ", msg);
  62    nir_print_instr(instr, memf);
  63    u_memstream_close(&mem);
  64
  65    _aco_err(ctx->program, file, line, out);
  66    free(out);
  67 }
  68
/* Bookkeeping for one if construct during instruction selection.
 *
 * NOTE(review): the helpers that fill and consume this struct are not part of this
 * excerpt, so the per-member notes below are inferred from the member names and need
 * to be confirmed against those helpers.
 */
struct if_context {
   Temp cond; /* presumably the condition the if branches on */

   /* "_old" members: presumably isel_context state saved on entry and restored later */
   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   bool had_divergent_discard_old;
   bool had_divergent_discard_then; /* presumably the discard state after the then-branch */
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;  /* presumably the index of the block that ends in the branch */
   unsigned invert_idx; /* presumably the index of the invert block */
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   /* Blocks are stored by value here. */
   Block BB_invert;
   Block BB_endif;
};
  86
/* Bookkeeping for one loop during instruction selection.
 *
 * NOTE(review): the helpers that fill and consume this struct are not part of this
 * excerpt, so the per-member notes below are inferred from the member names and need
 * to be confirmed against those helpers.
 */
struct loop_context {
   Block loop_exit; /* stored by value; presumably the block following the loop */

   /* "_old" members: presumably state of the enclosing loop, saved while this loop is
    * being processed */
   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};
  96
/* Forward declaration, so that code placed before the definition can call it.
 * NOTE(review): the definition is not part of this excerpt. */
static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
  98
  99 static void
 100 add_logical_edge(unsigned pred_idx, Block* succ)
 101 {
 102    succ->logical_preds.emplace_back(pred_idx);
 103 }
 104
 105 static void
 106 add_linear_edge(unsigned pred_idx, Block* succ)
 107 {
 108    succ->linear_preds.emplace_back(pred_idx);
 109 }
 110
 111 static void
 112 add_edge(unsigned pred_idx, Block* succ)
 113 {
 114    add_logical_edge(pred_idx, succ);
 115    add_linear_edge(pred_idx, succ);
 116 }
 117
 118 static void
 119 append_logical_start(Block* b)
 120 {
 121    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 122 }
 123
 124 static void
 125 append_logical_end(Block* b)
 126 {
 127    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 128 }
 129
 130 Temp
 131 get_ssa_temp(struct isel_context* ctx, nir_def* def)
 132 {
 133    uint32_t id = ctx->first_temp_id + def->index;
 134    return Temp(id, ctx->program->temp_rc[id]);
 135 }
 136
 137 Temp
 138 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
 139 {
 140    Builder bld(ctx->program, ctx->block);
 141    assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
 142    assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
 143
 144    if (ctx->program->wave_size == 32) {
 145       Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
 146       return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
 147    }
 148
 149    Operand mask_lo = Operand::c32(-1u);
 150    Operand mask_hi = Operand::c32(-1u);
 151
 152    if (mask.isTemp()) {
 153       RegClass rc = RegClass(mask.regClass().type(), 1);
 154       Builder::Result mask_split =
 155          bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
 156       mask_lo = Operand(mask_split.def(0).getTemp());
 157       mask_hi = Operand(mask_split.def(1).getTemp());
 158    } else if (mask.physReg() == exec) {
 159       mask_lo = Operand(exec_lo, s1);
 160       mask_hi = Operand(exec_hi, s1);
 161    }
 162
 163    Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
 164
 165    if (ctx->program->gfx_level <= GFX7)
 166       return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
 167    else
 168       return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
 169 }
 170
 171 Temp
 172 emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
 173 {
 174    if (bld.program->stage != fragment_fs) {
 175       if (!dst.id())
 176          return src;
 177       else
 178          return bld.copy(Definition(dst), src);
 179    } else if (!dst.id()) {
 180       dst = bld.tmp(src.regClass());
 181    }
 182
 183    assert(src.bytes() == dst.bytes());
 184    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 185    bld.program->needs_wqm |= program_needs_wqm;
 186    return dst;
 187 }
 188
 189 static Temp
 190 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
 191 {
 192    if (index.regClass() == s1)
 193       return bld.readlane(bld.def(s1), data, index);
 194
 195    /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
 196     * of multiple binaries, because the VGPR use is not known when choosing
 197     * which registers to use for the shared VGPRs.
 198     */
 199    const bool avoid_shared_vgprs =
 200       ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
 201       ctx->program->wave_size == 64 &&
 202       (ctx->program->info.has_epilog || !ctx->program->info.is_monolithic ||
 203        ctx->stage == raytracing_cs);
 204
 205    if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
 206       /* GFX6-7: there is no bpermute instruction */
 207       Operand index_op(index);
 208       Operand input_data(data);
 209       index_op.setLateKill(true);
 210       input_data.setLateKill(true);
 211
 212       return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
 213                         bld.def(bld.lm, vcc), index_op, input_data);
 214    } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
 215
 216       /* GFX10 wave64 mode: emulate full-wave bpermute */
 217       Temp index_is_lo =
 218          bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
 219       Builder::Result index_is_lo_split =
 220          bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
 221       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
 222                                      index_is_lo_split.def(1).getTemp());
 223       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 224                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
 225       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 226       Operand input_data(data);
 227
 228       index_x4.setLateKill(true);
 229       input_data.setLateKill(true);
 230       same_half.setLateKill(true);
 231
 232       if (ctx->options->gfx_level <= GFX10_3) {
 233          /* We need one pair of shared VGPRs:
 234           * Note, that these have twice the allocation granularity of normal VGPRs
 235           */
 236          ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
 237
 238          return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
 239                            bld.def(s1, scc), index_x4, input_data, same_half);
 240       } else {
 241          return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
 242                            bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
 243                            same_half);
 244       }
 245    } else {
 246       /* GFX8-9 or GFX10 wave32: bpermute works normally */
 247       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
 248       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
 249    }
 250 }
 251
 252 static Temp
 253 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
 254 {
 255    if (ctx->options->gfx_level >= GFX8) {
 256       unsigned and_mask = mask & 0x1f;
 257       unsigned or_mask = (mask >> 5) & 0x1f;
 258       unsigned xor_mask = (mask >> 10) & 0x1f;
 259
 260       uint16_t dpp_ctrl = 0xffff;
 261
 262       /* DPP16 before DPP8 before v_permlane(x)16_b32
 263        * because DPP16 supports modifiers and v_permlane
 264        * can't be folded into valu instructions.
 265        */
 266       if ((and_mask & 0x1c) == 0x1c && or_mask < 4 && xor_mask < 4) {
 267          unsigned res[4] = {0, 1, 2, 3};
 268          for (unsigned i = 0; i < 4; i++)
 269             res[i] = (((res[i] & and_mask) | or_mask) ^ xor_mask) & 0x3;
 270          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
 271       } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
 272          dpp_ctrl = dpp_row_rr(8);
 273       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
 274          dpp_ctrl = dpp_row_mirror;
 275       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
 276          dpp_ctrl = dpp_row_half_mirror;
 277       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && or_mask < 0x10 &&
 278                  xor_mask < 0x10) {
 279          dpp_ctrl = dpp_row_share(or_mask ^ xor_mask);
 280       } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && !or_mask &&
 281                  xor_mask < 0x10) {
 282          dpp_ctrl = dpp_row_xmask(xor_mask);
 283       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 &&
 284                  xor_mask < 8) {
 285          Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src);
 286          for (unsigned i = 0; i < 8; i++) {
 287             ret->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) & 0x7;
 288          }
 289          return ret;
 290       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10 && or_mask < 0x10) {
 291          uint64_t lane_mask = 0;
 292          for (unsigned i = 0; i < 16; i++)
 293             lane_mask |= uint64_t(((i & and_mask) | or_mask) ^ (xor_mask & 0xf)) << i * 4;
 294          aco_opcode opcode =
 295             xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
 296          Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
 297          Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
 298          Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
 299          ret->valu().opsel = 0x3; /* set BOUND_CTRL/FETCH_INACTIVE */
 300          return ret;
 301       }
 302
 303       if (dpp_ctrl != 0xffff)
 304          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
 305    }
 306
 307    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
 308 }
 309
 310 Temp
 311 as_vgpr(Builder& bld, Temp val)
 312 {
 313    if (val.type() == RegType::sgpr)
 314       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 315    assert(val.type() == RegType::vgpr);
 316    return val;
 317 }
 318
 319 Temp
 320 as_vgpr(isel_context* ctx, Temp val)
 321 {
 322    Builder bld(ctx->program, ctx->block);
 323    return as_vgpr(bld, val);
 324 }
 325
 326 void
 327 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 328 {
 329    Builder bld(ctx->program, ctx->block);
 330    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
 331 }
 332
 333 Temp
 334 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 335 {
 336    /* no need to extract the whole vector */
 337    if (src.regClass() == dst_rc) {
 338       assert(idx == 0);
 339       return src;
 340    }
 341
 342    assert(src.bytes() > (idx * dst_rc.bytes()));
 343    Builder bld(ctx->program, ctx->block);
 344    auto it = ctx->allocated_vec.find(src.id());
 345    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
 346       if (it->second[idx].regClass() == dst_rc) {
 347          return it->second[idx];
 348       } else {
 349          assert(!dst_rc.is_subdword());
 350          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 351          return bld.copy(bld.def(dst_rc), it->second[idx]);
 352       }
 353    }
 354
 355    if (dst_rc.is_subdword())
 356       src = as_vgpr(ctx, src);
 357
 358    if (src.bytes() == dst_rc.bytes()) {
 359       assert(idx == 0);
 360       return bld.copy(bld.def(dst_rc), src);
 361    } else {
 362       Temp dst = bld.tmp(dst_rc);
 363       emit_extract_vector(ctx, src, idx, dst);
 364       return dst;
 365    }
 366 }
 367
 368 void
 369 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 370 {
 371    if (num_components == 1)
 372       return;
 373    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 374       return;
 375    RegClass rc;
 376    if (num_components > vec_src.size()) {
 377       if (vec_src.type() == RegType::sgpr) {
 378          /* should still help get_alu_src() */
 379          emit_split_vector(ctx, vec_src, vec_src.size());
 380          return;
 381       }
 382       /* sub-dword split */
 383       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
 384    } else {
 385       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
 386    }
 387    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
 388       aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 389    split->operands[0] = Operand(vec_src);
 390    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 391    for (unsigned i = 0; i < num_components; i++) {
 392       elems[i] = ctx->program->allocateTmp(rc);
 393       split->definitions[i] = Definition(elems[i]);
 394    }
 395    ctx->block->instructions.emplace_back(std::move(split));
 396    ctx->allocated_vec.emplace(vec_src.id(), elems);
 397 }
 398
 399 /* This vector expansion uses a mask to determine which elements in the new vector
 400  * come from the original vector. The other elements are undefined. */
 401 void
 402 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
 403               bool zero_padding = false)
 404 {
 405    assert(vec_src.type() == RegType::vgpr);
 406    Builder bld(ctx->program, ctx->block);
 407
 408    if (dst.type() == RegType::sgpr && num_components > dst.size()) {
 409       Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
 410       expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
 411       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
 412       ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
 413       return;
 414    }
 415
 416    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 417
 418    if (vec_src == dst)
 419       return;
 420
 421    if (num_components == 1) {
 422       if (dst.type() == RegType::sgpr)
 423          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 424       else
 425          bld.copy(Definition(dst), vec_src);
 426       return;
 427    }
 428
 429    unsigned component_bytes = dst.bytes() / num_components;
 430    RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
 431    RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
 432    assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
 433    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 434
 435    Temp padding = Temp(0, dst_rc);
 436    if (zero_padding)
 437       padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
 438
 439    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
 440       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 441    vec->definitions[0] = Definition(dst);
 442    unsigned k = 0;
 443    for (unsigned i = 0; i < num_components; i++) {
 444       if (mask & (1 << i)) {
 445          Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
 446          if (dst.type() == RegType::sgpr)
 447             src = bld.as_uniform(src);
 448          vec->operands[i] = Operand(src);
 449          elems[i] = src;
 450       } else {
 451          vec->operands[i] = Operand::zero(component_bytes);
 452          elems[i] = padding;
 453       }
 454    }
 455    ctx->block->instructions.emplace_back(std::move(vec));
 456    ctx->allocated_vec.emplace(dst.id(), elems);
 457 }
 458
/**
 * Adjusts a misaligned small bit size load with scalar instructions: `vec` is shifted
 * right by the byte offset and the result is written to `dst`.
 *
 * \param vec    loaded data, 1 to 4 dwords; nothing is emitted for other sizes
 * \param offset byte offset; a constant has to be in [1, 3], of a non-constant offset
 *               only the two lowest bits are used
 * \param dst    destination temporary
 */
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   /* Stays Temp() for constant offsets. For runtime offsets it receives the SCC result
    * of the instruction computing the shift amount (presumably "shift amount is
    * non-zero" — confirm against the ISA documentation). */
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      /* shift amount in bits */
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      /* one dword: a single 32-bit shift */
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      /* two dwords: a single 64-bit shift, into dst directly if dst is two dwords wide */
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         /* dst is narrower: take element 0 of the shifted value */
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      /* three or four dwords: lo is the first two dwords, hi is reduced to the third */
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      /* runtime offset only: replace hi by zero depending on the SCC value saved above */
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      /* NOTE(review): hi is shifted left by `shift` itself. Merging it above the bits of
       * mid would seem to need the complementary amount (32 - shift); both agree only
       * for shift == 16. Confirm that other byte offsets cannot reach this path. */
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}
 510
/**
 * Adjusts a misaligned small bit size load: writes the components of `vec` that start at
 * the byte offset to `dst` and records the components of `dst` in ctx->allocated_vec.
 *
 * \param vec            loaded data
 * \param offset         byte offset; either a temporary or a constant that is a multiple
 *                       of component_size
 * \param dst            destination temporary, VGPR or SGPR
 * \param component_size size of one component in bytes
 */
void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      /* Runtime offset: realign dword by dword with v_alignbyte_b32. Entries that are
       * not overwritten by one of the splits below keep referring to vec itself. */
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         /* tmp[2] aliases tmp[1], so the loop below has a source at tmp[i + 1] */
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      /* NOTE(review): tmp has 4 entries and tmp[i + 1] is read, so this relies on
       * dst.size() being smaller than 4 — confirm against the callers. */
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      /* the data is aligned now: continue as for a constant offset of zero */
      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      /* same register class: a plain copy is all that is left to do */
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   /* drop the first `skip` components, so that elems[0] is the component at the offset */
   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      /* sgpr dst without an offset: only the conversion to a uniform value is needed */
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}
 578
 579 Temp
 580 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
 581 {
 582    RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
 583    Temp tmp = get_ssa_temp(ctx, def);
 584    if (tmp.bytes() != rc.bytes())
 585       return emit_extract_vector(ctx, tmp, 0, rc);
 586    else
 587       return tmp;
 588 }
 589
 590 Temp
 591 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
 592 {
 593    Builder bld(ctx->program, ctx->block);
 594    if (!dst.id())
 595       dst = bld.tmp(bld.lm);
 596
 597    assert(val.regClass() == s1);
 598    assert(dst.regClass() == bld.lm);
 599
 600    return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
 601                    bld.scc(val));
 602 }
 603
 604 Temp
 605 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
 606 {
 607    Builder bld(ctx->program, ctx->block);
 608    if (!dst.id())
 609       dst = bld.tmp(s1);
 610
 611    assert(val.regClass() == bld.lm);
 612    assert(dst.regClass() == s1);
 613
 614    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 615    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
 616    return dst;
 617 }
 618
 619 /**
 620  * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 621  * src_bits and dst_bits are truncated.
 622  *
 623  * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 624  * bit is indicated by src_bits in this case.
 625  *
 626  * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 627  */
 628 Temp
 629 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
 630             bool sign_extend, Temp dst = Temp())
 631 {
 632    assert(!(sign_extend && dst_bits < src_bits) &&
 633           "Shrinking integers is not supported for signed inputs");
 634
 635    if (!dst.id()) {
 636       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
 637          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
 638       else
 639          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
 640    }
 641
 642    assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
 643    assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
 644
 645    if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
 646       /* Copy the raw value, leaving an undefined value in the upper bits for
 647        * the caller to handle appropriately */
 648       return bld.copy(Definition(dst), src);
 649    } else if (dst.bytes() < src.bytes()) {
 650       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
 651    }
 652
 653    Temp tmp = dst;
 654    if (dst_bits == 64)
 655       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
 656
 657    if (tmp == src) {
 658    } else if (src.regClass() == s1) {
 659       assert(src_bits < 32);
 660       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
 661                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 662    } else {
 663       assert(src_bits < 32);
 664       bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
 665                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
 666    }
 667
 668    if (dst_bits == 64) {
 669       if (sign_extend && dst.regClass() == s2) {
 670          Temp high =
 671             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
 672          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 673       } else if (sign_extend && dst.regClass() == v2) {
 674          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
 675          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
 676       } else {
 677          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
 678       }
 679    }
 680
 681    return dst;
 682 }
 683
 684 enum sgpr_extract_mode {
 685    sgpr_extract_sext,
 686    sgpr_extract_zext,
 687    sgpr_extract_undef,
 688 };
 689
 690 Temp
 691 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
 692 {
 693    Temp vec = get_ssa_temp(ctx, src->src.ssa);
 694    unsigned src_size = src->src.ssa->bit_size;
 695    unsigned swizzle = src->swizzle[0];
 696
 697    if (vec.size() > 1) {
 698       assert(src_size == 16);
 699       vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
 700       swizzle = swizzle & 1;
 701    }
 702
 703    Builder bld(ctx->program, ctx->block);
 704    Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
 705
 706    if (mode == sgpr_extract_undef && swizzle == 0)
 707       bld.copy(Definition(tmp), vec);
 708    else
 709       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
 710                  Operand::c32(swizzle), Operand::c32(src_size),
 711                  Operand::c32((mode == sgpr_extract_sext)));
 712
 713    if (dst.regClass() == s2)
 714       convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
 715
 716    return dst;
 717 }
 718
 719 Temp
 720 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
 721 {
 722    if (src.src.ssa->num_components == 1 && size == 1)
 723       return get_ssa_temp(ctx, src.src.ssa);
 724
 725    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 726    unsigned elem_size = src.src.ssa->bit_size / 8u;
 727    bool identity_swizzle = true;
 728
 729    for (unsigned i = 0; identity_swizzle && i < size; i++) {
 730       if (src.swizzle[i] != i)
 731          identity_swizzle = false;
 732    }
 733    if (identity_swizzle)
 734       return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
 735
 736    assert(elem_size > 0);
 737    assert(vec.bytes() % elem_size == 0);
 738
 739    if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
 740       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
 741       return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
 742                                            sgpr_extract_undef);
 743    }
 744
 745    bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
 746    if (as_uniform)
 747       vec = as_vgpr(ctx, vec);
 748
 749    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
 750                                     : RegClass(vec.type(), elem_size / 4);
 751    if (size == 1) {
 752       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 753    } else {
 754       assert(size <= 4);
 755       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 756       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
 757          aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 758       for (unsigned i = 0; i < size; ++i) {
 759          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 760          vec_instr->operands[i] = Operand{elems[i]};
 761       }
 762       Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
 763       vec_instr->definitions[0] = Definition(dst);
 764       ctx->block->instructions.emplace_back(std::move(vec_instr));
 765       ctx->allocated_vec.emplace(dst.id(), elems);
 766       return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
 767    }
 768 }
 769
 770 Temp
 771 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
 772 {
 773    /* returns v2b or v1 for vop3p usage.
 774     * The source expects exactly 2 16bit components
 775     * which are within the same dword
 776     */
 777    assert(src.src.ssa->bit_size == 16);
 778    assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
 779
 780    Temp tmp = get_ssa_temp(ctx, src.src.ssa);
 781    if (tmp.size() == 1)
 782       return tmp;
 783
 784    /* the size is larger than 1 dword: check the swizzle */
 785    unsigned dword = src.swizzle[0] >> 1;
 786
 787    /* extract a full dword if possible */
 788    if (tmp.bytes() >= (dword + 1) * 4) {
 789       /* if the source is split into components, use p_create_vector */
 790       auto it = ctx->allocated_vec.find(tmp.id());
 791       if (it != ctx->allocated_vec.end()) {
 792          unsigned index = dword << 1;
 793          Builder bld(ctx->program, ctx->block);
 794          if (it->second[index].regClass() == v2b)
 795             return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
 796                               it->second[index + 1]);
 797       }
 798       return emit_extract_vector(ctx, tmp, dword, v1);
 799    } else {
 800       /* This must be a swizzled access to %a.zz where %a is v6b */
 801       assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
 802       assert(tmp.regClass() == v6b && dword == 1);
 803       return emit_extract_vector(ctx, tmp, dword * 2, v2b);
 804    }
 805 }
 806
 807 uint32_t
 808 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
 809 {
 810    nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
 811    return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
 812 }
 813
 814 Temp
 815 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
 816 {
 817    if (ptr.size() == 2)
 818       return ptr;
 819    Builder bld(ctx->program, ctx->block);
 820    if (ptr.type() == RegType::vgpr && !non_uniform)
 821       ptr = bld.as_uniform(ptr);
 822    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
 823                      Operand::c32((unsigned)ctx->options->address32_hi));
 824 }
 825
 826 void
 827 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 828                       bool writes_scc, uint8_t uses_ub = 0)
 829 {
 830    aco_ptr<SOP2_instruction> sop2{
 831       create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 832    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 833    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 834    sop2->definitions[0] = Definition(dst);
 835    if (instr->no_unsigned_wrap)
 836       sop2->definitions[0].setNUW(true);
 837    if (writes_scc)
 838       sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
 839
 840    for (int i = 0; i < 2; i++) {
 841       if (uses_ub & (1 << i)) {
 842          uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
 843          if (src_ub <= 0xffff)
 844             sop2->operands[i].set16bit(true);
 845          else if (src_ub <= 0xffffff)
 846             sop2->operands[i].set24bit(true);
 847       }
 848    }
 849
 850    ctx->block->instructions.emplace_back(std::move(sop2));
 851 }
 852
 853 void
 854 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
 855                       bool commutative, bool swap_srcs = false, bool flush_denorms = false,
 856                       bool nuw = false, uint8_t uses_ub = 0)
 857 {
 858    Builder bld(ctx->program, ctx->block);
 859    bld.is_precise = instr->exact;
 860
 861    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 862    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 863    if (src1.type() == RegType::sgpr) {
 864       if (commutative && src0.type() == RegType::vgpr) {
 865          Temp t = src0;
 866          src0 = src1;
 867          src1 = t;
 868       } else {
 869          src1 = as_vgpr(ctx, src1);
 870       }
 871    }
 872
 873    Operand op[2] = {Operand(src0), Operand(src1)};
 874
 875    for (int i = 0; i < 2; i++) {
 876       if (uses_ub & (1 << i)) {
 877          uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
 878          if (src_ub <= 0xffff)
 879             op[i].set16bit(true);
 880          else if (src_ub <= 0xffffff)
 881             op[i].set24bit(true);
 882       }
 883    }
 884
 885    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 886       assert(dst.size() == 1);
 887       Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
 888       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 889    } else {
 890       if (nuw) {
 891          bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
 892       } else {
 893          bld.vop2(opc, Definition(dst), op[0], op[1]);
 894       }
 895    }
 896 }
 897
 898 void
 899 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
 900 {
 901    Builder bld(ctx->program, ctx->block);
 902    bld.is_precise = instr->exact;
 903
 904    Temp src0 = get_alu_src(ctx, instr->src[0]);
 905    Temp src1 = get_alu_src(ctx, instr->src[1]);
 906
 907    if (src1.type() == RegType::sgpr) {
 908       assert(src0.type() == RegType::vgpr);
 909       std::swap(src0, src1);
 910    }
 911
 912    Temp src00 = bld.tmp(src0.type(), 1);
 913    Temp src01 = bld.tmp(src0.type(), 1);
 914    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 915    Temp src10 = bld.tmp(v1);
 916    Temp src11 = bld.tmp(v1);
 917    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 918    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
 919    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
 920    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
 921 }
 922
 923 void
 924 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 925                        bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
 926 {
 927    assert(num_sources == 2 || num_sources == 3);
 928    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 929    bool has_sgpr = false;
 930    for (unsigned i = 0; i < num_sources; i++) {
 931       src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
 932       if (has_sgpr)
 933          src[i] = as_vgpr(ctx, src[i]);
 934       else
 935          has_sgpr = src[i].type() == RegType::sgpr;
 936    }
 937
 938    Builder bld(ctx->program, ctx->block);
 939    bld.is_precise = instr->exact;
 940    if (flush_denorms && ctx->program->gfx_level < GFX9) {
 941       Temp tmp;
 942       if (num_sources == 3)
 943          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
 944       else
 945          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
 946       if (dst.size() == 1)
 947          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
 948       else
 949          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
 950    } else if (num_sources == 3) {
 951       bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
 952    } else {
 953       bld.vop3(op, Definition(dst), src[0], src[1]);
 954    }
 955 }
 956
 957 Builder::Result
 958 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
 959                        bool swap_srcs = false)
 960 {
 961    Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
 962    Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
 963    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 964       src1 = as_vgpr(ctx, src1);
 965    assert(instr->def.num_components == 2);
 966
 967    /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
 968    unsigned opsel_lo =
 969       (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
 970    unsigned opsel_hi =
 971       (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
 972
 973    Builder bld(ctx->program, ctx->block);
 974    bld.is_precise = instr->exact;
 975    Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
 976    return res;
 977 }
 978
 979 void
 980 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
 981                       unsigned neg_lo = 0)
 982 {
 983    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
 984    bool has_sgpr = false;
 985    for (unsigned i = 0; i < 3; i++) {
 986       src[i] = get_alu_src(ctx, instr->src[i]);
 987       if (has_sgpr)
 988          src[i] = as_vgpr(ctx, src[i]);
 989       else
 990          has_sgpr = src[i].type() == RegType::sgpr;
 991    }
 992
 993    Builder bld(ctx->program, ctx->block);
 994    bld.is_precise = instr->exact;
 995    VALU_instruction& vop3p =
 996       bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
 997    vop3p.clamp = clamp;
 998    vop3p.neg_lo = neg_lo;
 999 }
1000
1001 void
1002 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1003 {
1004    Builder bld(ctx->program, ctx->block);
1005    bld.is_precise = instr->exact;
1006    if (dst.type() == RegType::sgpr)
1007       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1008                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
1009    else
1010       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
1011 }
1012
1013 void
1014 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1015 {
1016    Temp src0 = get_alu_src(ctx, instr->src[0]);
1017    Temp src1 = get_alu_src(ctx, instr->src[1]);
1018    assert(src0.size() == src1.size());
1019
1020    aco_ptr<Instruction> vopc;
1021    if (src1.type() == RegType::sgpr) {
1022       if (src0.type() == RegType::vgpr) {
1023          /* to swap the operands, we might also have to change the opcode */
1024          switch (op) {
1025          case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
1026          case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
1027          case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
1028          case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
1029          case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
1030          case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
1031          case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
1032          case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
1033          case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
1034          case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1035          case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1036          case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1037          case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1038          case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1039          case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1040          case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1041          case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1042          case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1043          default: /* eq and ne are commutative */ break;
1044          }
1045          Temp t = src0;
1046          src0 = src1;
1047          src1 = t;
1048       } else {
1049          src1 = as_vgpr(ctx, src1);
1050       }
1051    }
1052
1053    Builder bld(ctx->program, ctx->block);
1054    bld.vopc(op, Definition(dst), src0, src1);
1055 }
1056
1057 void
1058 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1059 {
1060    Temp src0 = get_alu_src(ctx, instr->src[0]);
1061    Temp src1 = get_alu_src(ctx, instr->src[1]);
1062    Builder bld(ctx->program, ctx->block);
1063
1064    assert(dst.regClass() == bld.lm);
1065    assert(src0.type() == RegType::sgpr);
1066    assert(src1.type() == RegType::sgpr);
1067
1068    /* Emit the SALU comparison instruction */
1069    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1070    /* Turn the result into a per-lane bool */
1071    bool_to_vector_condition(ctx, cmp, dst);
1072 }
1073
1074 void
1075 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1076                 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1077                 aco_opcode s64_op = aco_opcode::num_opcodes)
1078 {
1079    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
1080                      : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1081                                                              : aco_opcode::num_opcodes;
1082    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
1083                      : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1084                                                              : v16_op;
1085    bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
1086                    get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1087                    get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1088    aco_opcode op = use_valu ? v_op : s_op;
1089    assert(op != aco_opcode::num_opcodes);
1090    assert(dst.regClass() == ctx->program->lane_mask);
1091
1092    if (use_valu)
1093       emit_vopc_instruction(ctx, instr, op, dst);
1094    else
1095       emit_sopc_instruction(ctx, instr, op, dst);
1096 }
1097
1098 void
1099 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1100                    Temp dst)
1101 {
1102    Builder bld(ctx->program, ctx->block);
1103    Temp src0 = get_alu_src(ctx, instr->src[0]);
1104    Temp src1 = get_alu_src(ctx, instr->src[1]);
1105
1106    assert(dst.regClass() == bld.lm);
1107    assert(src0.regClass() == bld.lm);
1108    assert(src1.regClass() == bld.lm);
1109
1110    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1111 }
1112
1113 void
1114 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1115 {
1116    Builder bld(ctx->program, ctx->block);
1117    Temp cond = get_alu_src(ctx, instr->src[0]);
1118    Temp then = get_alu_src(ctx, instr->src[1]);
1119    Temp els = get_alu_src(ctx, instr->src[2]);
1120
1121    assert(cond.regClass() == bld.lm);
1122
1123    if (dst.type() == RegType::vgpr) {
1124       aco_ptr<Instruction> bcsel;
1125       if (dst.size() == 1) {
1126          then = as_vgpr(ctx, then);
1127          els = as_vgpr(ctx, els);
1128
1129          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1130       } else if (dst.size() == 2) {
1131          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1132          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1133          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1134          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1135
1136          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1137          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1138
1139          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1140       } else {
1141          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1142       }
1143       return;
1144    }
1145
1146    if (instr->def.bit_size == 1) {
1147       assert(dst.regClass() == bld.lm);
1148       assert(then.regClass() == bld.lm);
1149       assert(els.regClass() == bld.lm);
1150    }
1151
1152    if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1153       if (dst.regClass() == s1 || dst.regClass() == s2) {
1154          assert((then.regClass() == s1 || then.regClass() == s2) &&
1155                 els.regClass() == then.regClass());
1156          assert(dst.size() == then.size());
1157          aco_opcode op =
1158             dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1159          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1160       } else {
1161          isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1162       }
1163       return;
1164    }
1165
1166    /* divergent boolean bcsel
1167     * this implements bcsel on bools: dst = s0 ? s1 : s2
1168     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
1169    assert(instr->def.bit_size == 1);
1170
1171    if (cond.id() != then.id())
1172       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1173
1174    if (cond.id() == els.id())
1175       bld.copy(Definition(dst), then);
1176    else
1177       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1178                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1179 }
1180
1181 void
1182 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1183                uint32_t undo)
1184 {
1185    /* multiply by 16777216 to handle denormals */
1186    Temp is_denormal = bld.tmp(bld.lm);
1187    VALU_instruction& valu =
1188       bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), val, Operand::c32(1u << 4))
1189          ->valu();
1190    valu.neg[0] = true;
1191    valu.abs[0] = true;
1192    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1193    scaled = bld.vop1(op, bld.def(v1), scaled);
1194    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1195
1196    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1197
1198    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1199 }
1200
1201 void
1202 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1203 {
1204    if (ctx->block->fp_mode.denorm32 == 0) {
1205       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1206       return;
1207    }
1208
1209    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1210 }
1211
1212 void
1213 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1214 {
1215    if (ctx->block->fp_mode.denorm32 == 0) {
1216       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1217       return;
1218    }
1219
1220    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1221 }
1222
1223 void
1224 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1225 {
1226    if (ctx->block->fp_mode.denorm32 == 0) {
1227       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1228       return;
1229    }
1230
1231    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1232 }
1233
1234 void
1235 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1236 {
1237    if (ctx->block->fp_mode.denorm32 == 0) {
1238       bld.vop1(aco_opcode::v_log_f32, dst, val);
1239       return;
1240    }
1241
1242    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1243 }
1244
1245 Temp
1246 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1247 {
1248    if (ctx->options->gfx_level >= GFX7)
1249       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1250
1251    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1252    /* TODO: create more efficient code! */
1253    if (val.type() == RegType::sgpr)
1254       val = as_vgpr(ctx, val);
1255
1256    /* Split the input value. */
1257    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1258    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1259
1260    /* Extract the exponent and compute the unbiased value. */
1261    Temp exponent =
1262       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1263    exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1264
1265    /* Extract the fractional part. */
1266    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1267                                 Operand::c32(0x000fffffu));
1268    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1269
1270    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1271    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1272               fract_mask);
1273
1274    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1275    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1276    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1277    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1278    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1279
1280    /* Get the sign bit. */
1281    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1282
1283    /* Decide the operation to apply depending on the unbiased exponent. */
1284    Temp exp_lt0 =
1285       bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1286    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1287                           bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1288    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1289    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1290    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1291    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1292
1293    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1294 }
1295
1296 Temp
1297 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1298 {
1299    if (ctx->options->gfx_level >= GFX7)
1300       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1301
1302    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1303     * lowered at NIR level for precision reasons). */
1304    Temp src0 = as_vgpr(ctx, val);
1305
1306    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1307                              Operand::c32(0x3fefffffu));
1308
1309    Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1310    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1311    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1312
1313    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1314    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1315    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1316    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1317
1318    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1319    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1320
1321    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1322
1323    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1324    add->valu().neg[1] = true;
1325
1326    return add->definitions[0].getTemp();
1327 }
1328
1329 Temp
1330 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1331 {
1332    if (bld.program->gfx_level < GFX8) {
1333       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1334       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1335                           add.def(1).getTemp());
1336    }
1337
1338    Builder::Result add(NULL);
1339    if (bld.program->gfx_level >= GFX9) {
1340       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1341    } else {
1342       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1343    }
1344    add->valu().clamp = 1;
1345    return dst.getTemp();
1346 }
1347
1348 Temp
1349 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1350 {
1351    if (bld.program->gfx_level < GFX8) {
1352       Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1353       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1354                           sub.def(1).getTemp());
1355    }
1356
1357    Builder::Result sub(NULL);
1358    if (bld.program->gfx_level >= GFX9) {
1359       sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1360    } else {
1361       sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1362    }
1363    sub->valu().clamp = 1;
1364    return dst.getTemp();
1365 }
1366
1367 void
1368 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1369 {
1370    Builder bld(ctx->program, ctx->block);
1371    bld.is_precise = instr->exact;
1372    Temp dst = get_ssa_temp(ctx, &instr->def);
1373    switch (instr->op) {
1374    case nir_op_vec2:
1375    case nir_op_vec3:
1376    case nir_op_vec4:
1377    case nir_op_vec5:
1378    case nir_op_vec8:
1379    case nir_op_vec16: {
1380       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1381       unsigned num = instr->def.num_components;
1382       for (unsigned i = 0; i < num; ++i)
1383          elems[i] = get_alu_src(ctx, instr->src[i]);
1384
1385       if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1386          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1387             aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
1388          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
1389          for (unsigned i = 0; i < num; ++i) {
1390             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1391                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1392             vec->operands[i] = Operand{elems[i]};
1393          }
1394          vec->definitions[0] = Definition(dst);
1395          ctx->block->instructions.emplace_back(std::move(vec));
1396          ctx->allocated_vec.emplace(dst.id(), elems);
1397       } else {
1398          bool use_s_pack = ctx->program->gfx_level >= GFX9;
1399          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1400
1401          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1402          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1403          for (unsigned i = 0; i < num; i++) {
1404             unsigned packed_size = use_s_pack ? 16 : 32;
1405             unsigned idx = i * instr->def.bit_size / packed_size;
1406             unsigned offset = i * instr->def.bit_size % packed_size;
1407             if (nir_src_is_const(instr->src[i].src)) {
1408                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1409                continue;
1410             }
1411             if (nir_src_is_undef(instr->src[i].src))
1412                continue;
1413
1414             if (offset != packed_size - instr->def.bit_size)
1415                elems[i] =
1416                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1417
1418             if (offset)
1419                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1420                                    Operand::c32(offset));
1421
1422             if (packed[idx].id())
1423                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1424                                       packed[idx]);
1425             else
1426                packed[idx] = elems[i];
1427          }
1428
1429          if (use_s_pack) {
1430             for (unsigned i = 0; i < dst.size(); i++) {
1431                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1432
1433                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1434                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1435                                        packed[i * 2 + 1]);
1436                else if (packed[i * 2 + 1].id())
1437                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1438                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1439                else if (packed[i * 2].id())
1440                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1441                                        Operand::c32(const_vals[i * 2 + 1]));
1442                else
1443                   packed[i] = Temp(); /* Both constants, so reset the entry */
1444
1445                if (same)
1446                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1447                else
1448                   const_vals[i] = 0;
1449             }
1450          }
1451
1452          for (unsigned i = 0; i < dst.size(); i++) {
1453             if (const_vals[i] && packed[i].id())
1454                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1455                                     Operand::c32(const_vals[i]), packed[i]);
1456             else if (!packed[i].id())
1457                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1458          }
1459
1460          if (dst.size() == 1)
1461             bld.copy(Definition(dst), packed[0]);
1462          else {
1463             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1464                aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1465             vec->definitions[0] = Definition(dst);
1466             for (unsigned i = 0; i < dst.size(); ++i)
1467                vec->operands[i] = Operand(packed[i]);
1468             bld.insert(std::move(vec));
1469          }
1470       }
1471       break;
1472    }
1473    case nir_op_mov: {
1474       Temp src = get_alu_src(ctx, instr->src[0]);
1475       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1476          /* use size() instead of bytes() for 8/16-bit */
1477          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1478          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1479       } else {
1480          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1481          bld.copy(Definition(dst), src);
1482       }
1483       break;
1484    }
1485    case nir_op_inot: {
           /* Bitwise NOT of src[0]; the instruction is chosen from the destination register
            * class. Unsupported classes are reported through isel_err(). */
1486       Temp src = get_alu_src(ctx, instr->src[0]);
1487       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
              /* The v1, v2b and v1b destinations all share v_not_b32. */
1488          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1489       } else if (dst.regClass() == v2) {
              /* v2: split into two dwords, invert each, then recombine into dst. */
1490          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1491          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1492          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1493          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1494          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1495       } else if (dst.type() == RegType::sgpr) {
              /* SGPR: s_not_b32 when dst.size() == 1, s_not_b64 otherwise. The second
               * definition is bound to scc. */
1496          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1497          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1498       } else {
1499          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1500       }
1501       break;
1502    }
1503    case nir_op_iabs: {
           /* Integer absolute value. The vector paths emit max(src, 0 - src); the s1 path
            * uses s_abs_i32 directly. */
1504       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with a 16-bit NIR def: packed path using the v_pk_* opcodes.
               * opsel_lo / opsel_hi are built from bit 0 of swizzle[0] / swizzle[1], shifted
               * into bit 1; opsel_hi additionally has bit 0 set. */
1505          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1506
1507          unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1508          unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1509
1510          Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1511                               src, opsel_lo, opsel_hi);
1512          bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1513          break;
1514       }
1515       Temp src = get_alu_src(ctx, instr->src[0]);
1516       if (dst.regClass() == s1) {
1517          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1518       } else if (dst.regClass() == v1) {
1519          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1520                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1521       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
              /* v2b on GFX10+: the _e64 opcode variants are emitted through bld.vop3(). */
1522          bld.vop3(
1523             aco_opcode::v_max_i16_e64, Definition(dst), src,
1524             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1525       } else if (dst.regClass() == v2b) {
              /* v2b before GFX10: src is passed through as_vgpr() before being used as an
               * operand of both VOP2 instructions. */
1526          src = as_vgpr(ctx, src);
1527          bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1528                   bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1529       } else {
1530          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1531       }
1532       break;
1533    }
1534    case nir_op_isign: {
           /* Integer sign of src[0]. The 32-bit paths clamp src to the range [-1, 1]; the
            * 64-bit paths build the result from an arithmetic right shift and a
            * comparison against zero. */
1535       Temp src = get_alu_src(ctx, instr->src[0]);
1536       if (dst.regClass() == s1) {
              /* min(max(src, -1), 1) */
1537          Temp tmp =
1538             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1539          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1540       } else if (dst.regClass() == s2) {
              /* neg = src >> 63 (arithmetic shift). neqz is the scc result of comparing src
               * with zero: s_cmp_lg_u64 on GFX8+, otherwise the scc definition of
               * s_or_b64(src, 0). The result is neg | neqz. */
1541          Temp neg =
1542             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1543          Temp neqz;
1544          if (ctx->program->gfx_level >= GFX8)
1545             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1546          else
1547             neqz =
1548                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1549                   .def(1)
1550                   .getTemp();
1551          /* SCC gets zero-extended to 64 bit */
1552          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1553       } else if (dst.regClass() == v1) {
              /* med3(-1, src, 1) */
1554          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1555       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1556          bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1557       } else if (dst.regClass() == v2b) {
              /* v2b before GFX9: max(-1, min(1, src)) with src forced into a VGPR.
               * NOTE(review): the intermediate v_min_i16 result is defined as a v1
               * temporary rather than v2b -- confirm this is intentional. */
1558          src = as_vgpr(ctx, src);
1559          bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1560                   bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1561       } else if (dst.regClass() == v2) {
              /* v2: neg is the upper dword shifted right arithmetically by 31. Both result
               * dwords are then chosen with v_cndmask_b32 on the comparison mask.
               * NOTE(review): the compare operands are (zero, src), so despite its name
               * "gtz" appears to hold 0 >= src -- confirm against the ISA operand order. */
1562          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1563          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1564          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1565          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1566          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1567          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1568       } else {
1569          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1570       }
1571       break;
1572    }
1573    case nir_op_imax: {
           /* Signed integer maximum. Order of the checks matters: v2b is tested first
            * (GFX10+ uses the _e64 opcode), then v1 with a 16-bit NIR def (packed
            * v_pk_max_i16), then plain v1, then s1. */
1574       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1575          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1576       } else if (dst.regClass() == v2b) {
1577          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1578       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1579          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1580       } else if (dst.regClass() == v1) {
1581          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1582       } else if (dst.regClass() == s1) {
1583          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1584       } else {
1585          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1586       }
1587       break;
1588    }
1589    case nir_op_umax: {
           /* Unsigned integer maximum. Order of the checks matters: v2b is tested first
            * (GFX10+ uses the _e64 opcode), then v1 with a 16-bit NIR def (packed
            * v_pk_max_u16), then plain v1, then s1. */
1590       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1591          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1592       } else if (dst.regClass() == v2b) {
1593          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1594       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1595          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1596       } else if (dst.regClass() == v1) {
1597          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1598       } else if (dst.regClass() == s1) {
1599          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1600       } else {
1601          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1602       }
1603       break;
1604    }
1605    case nir_op_imin: {
           /* Signed integer minimum. Order of the checks matters: v2b is tested first
            * (GFX10+ uses the _e64 opcode), then v1 with a 16-bit NIR def (packed
            * v_pk_min_i16), then plain v1, then s1. */
1606       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1607          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1608       } else if (dst.regClass() == v2b) {
1609          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1610       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1611          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1612       } else if (dst.regClass() == v1) {
1613          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1614       } else if (dst.regClass() == s1) {
1615          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1616       } else {
1617          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1618       }
1619       break;
1620    }
1621    case nir_op_umin: {
           /* Unsigned integer minimum. Order of the checks matters: v2b is tested first
            * (GFX10+ uses the _e64 opcode), then v1 with a 16-bit NIR def (packed
            * v_pk_min_u16), then plain v1, then s1. */
1622       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1623          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1624       } else if (dst.regClass() == v2b) {
1625          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1626       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1627          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1628       } else if (dst.regClass() == v1) {
1629          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1630       } else if (dst.regClass() == s1) {
1631          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1632       } else {
1633          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1634       }
1635       break;
1636    }
1637    case nir_op_ior: {
           /* Bitwise OR. 1-bit NIR defs are handled by emit_boolean_logic(); v1, v2b and v1b
            * share v_or_b32; v2 goes through emit_vop2_instruction_logic64(); s1 and s2 use
            * the 32- and 64-bit scalar opcodes. */
1638       if (instr->def.bit_size == 1) {
1639          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1640       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1641          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1642       } else if (dst.regClass() == v2) {
1643          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1644       } else if (dst.regClass() == s1) {
1645          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1646       } else if (dst.regClass() == s2) {
1647          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1648       } else {
1649          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1650       }
1651       break;
1652    }
1653    case nir_op_iand: {
           /* Bitwise AND. 1-bit NIR defs are handled by emit_boolean_logic(); v1, v2b and
            * v1b share v_and_b32; v2 goes through emit_vop2_instruction_logic64(); s1 and s2
            * use the 32- and 64-bit scalar opcodes. */
1654       if (instr->def.bit_size == 1) {
1655          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1656       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1657          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1658       } else if (dst.regClass() == v2) {
1659          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1660       } else if (dst.regClass() == s1) {
1661          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1662       } else if (dst.regClass() == s2) {
1663          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1664       } else {
1665          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1666       }
1667       break;
1668    }
1669    case nir_op_ixor: {
           /* Bitwise XOR. 1-bit NIR defs are handled by emit_boolean_logic(); v1, v2b and
            * v1b share v_xor_b32; v2 goes through emit_vop2_instruction_logic64(); s1 and s2
            * use the 32- and 64-bit scalar opcodes. */
1670       if (instr->def.bit_size == 1) {
1671          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1672       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1673          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1674       } else if (dst.regClass() == v2) {
1675          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1676       } else if (dst.regClass() == s1) {
1677          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1678       } else if (dst.regClass() == s2) {
1679          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1680       } else {
1681          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682       }
1683       break;
1684    }
1685    case nir_op_ushr: {
           /* Logical shift right, selected by destination register class and GFX level.
            * NOTE(review): the trailing bool/int arguments passed to the emit_* helpers are
            * defined outside this chunk -- presumably they control operand swapping for the
            * "rev" opcodes; confirm against the helper signatures. */
1686       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1687          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1688       } else if (dst.regClass() == v2b) {
1689          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1690       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1691          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1692       } else if (dst.regClass() == v1) {
1693          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1694       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* v2 on GFX8+: the "rev" opcode is emitted by hand with src[1] as the first
               * operand and src[0] as the second. */
1695          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1696                   get_alu_src(ctx, instr->src[0]));
1697       } else if (dst.regClass() == v2) {
              /* v2 before GFX8: the non-"rev" opcode with operands in NIR order. */
1698          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1699       } else if (dst.regClass() == s2) {
1700          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1701       } else if (dst.regClass() == s1) {
1702          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1703       } else {
1704          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1705       }
1706       break;
1707    }
1708    case nir_op_ishl: {
           /* Shift left, selected by destination register class and GFX level.
            * NOTE(review): the trailing bool/int arguments passed to the emit_* helpers
            * (including the extra ones on the v1 and s1 paths) are defined outside this
            * chunk; confirm their meaning against the helper signatures. */
1709       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1710          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1711       } else if (dst.regClass() == v2b) {
1712          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1713       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1714          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1715       } else if (dst.regClass() == v1) {
1716          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1717                                false, 2);
1718       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* v2 on GFX8+: the "rev" opcode is emitted by hand with src[1] as the first
               * operand and src[0] as the second. */
1719          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1720                   get_alu_src(ctx, instr->src[0]));
1721       } else if (dst.regClass() == v2) {
              /* v2 before GFX8: the non-"rev" opcode with operands in NIR order. */
1722          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1723       } else if (dst.regClass() == s1) {
1724          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1725       } else if (dst.regClass() == s2) {
1726          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1727       } else {
1728          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1729       }
1730       break;
1731    }
1732    case nir_op_ishr: {
           /* Arithmetic shift right, selected by destination register class and GFX level.
            * NOTE(review): the trailing bool/int arguments passed to the emit_* helpers are
            * defined outside this chunk; confirm their meaning against the helper
            * signatures. */
1733       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1734          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1735       } else if (dst.regClass() == v2b) {
1736          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1737       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1738          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1739       } else if (dst.regClass() == v1) {
1740          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1741       } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
              /* v2 on GFX8+: the "rev" opcode is emitted by hand with src[1] as the first
               * operand and src[0] as the second. */
1742          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1743                   get_alu_src(ctx, instr->src[0]));
1744       } else if (dst.regClass() == v2) {
              /* v2 before GFX8: the non-"rev" opcode with operands in NIR order. */
1745          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1746       } else if (dst.regClass() == s1) {
1747          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1748       } else if (dst.regClass() == s2) {
1749          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1750       } else {
1751          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1752       }
1753       break;
1754    }
1755    case nir_op_find_lsb: {
           /* Find the least significant set bit. Unlike most cases here, the dispatch is on
            * the SOURCE register class (s1, v1 or s2), not on dst. */
1756       Temp src = get_alu_src(ctx, instr->src[0]);
1757       if (src.regClass() == s1) {
1758          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1759       } else if (src.regClass() == v1) {
1760          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1761       } else if (src.regClass() == s2) {
1762          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1763       } else {
1764          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1765       }
1766       break;
1767    }
1768    case nir_op_ufind_msb:
1769    case nir_op_ifind_msb: {
           /* Find the most significant bit (unsigned or signed variant, chosen from
            * instr->op). Every path first computes msb_rev with an s_flbit / v_ffbh opcode and
            * then converts it with (bit_width - 1) - msb_rev. When that subtraction
            * produces a carry/borrow, the scalar path selects -1 and the vector paths
            * select msb_rev itself (presumably -1, the "no bit found" value -- the v2 path
            * compares against -1 for exactly that purpose). Dispatch is on the SOURCE
            * register class. */
1770       Temp src = get_alu_src(ctx, instr->src[0]);
1771       if (src.regClass() == s1 || src.regClass() == s2) {
1772          aco_opcode op = src.regClass() == s2
1773                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1774                                                              : aco_opcode::s_flbit_i32_i64)
1775                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1776                                                              : aco_opcode::s_flbit_i32);
1777          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1778
              /* src.size() * 32 - 1 is 31 for s1 and 63 for s2. */
1779          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1780                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1781          Temp msb = sub.def(0).getTemp();
1782          Temp carry = sub.def(1).getTemp();
1783
1784          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1785                   bld.scc(carry));
1786       } else if (src.regClass() == v1) {
1787          aco_opcode op =
1788             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
              /* emit_vop1_instruction() writes its result into the fresh msb_rev temporary. */
1789          Temp msb_rev = bld.tmp(v1);
1790          emit_vop1_instruction(ctx, instr, op, msb_rev);
1791          Temp msb = bld.tmp(v1);
1792          Temp carry =
1793             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1794          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1795       } else if (src.regClass() == v2) {
1796          aco_opcode op =
1797             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1798
1799          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1800          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1801
              /* Low dword: op result plus 32 via uadd32_sat(). High dword: op result as is.
               * found_hi is set where the high result differs from -1; in that case the high
               * result is used, otherwise the adjusted low result. */
1802          lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1803                          bld.vop1(op, bld.def(v1), lo));
1804          hi = bld.vop1(op, bld.def(v1), hi);
1805          Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1806
1807          Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1808
1809          Temp msb = bld.tmp(v1);
1810          Temp carry =
1811             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1812          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1813       } else {
1814          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815       }
1816       break;
1817    }
1818    case nir_op_ufind_msb_rev:
1819    case nir_op_ifind_msb_rev: {
           /* "Reversed" find-MSB: the s_flbit / v_ffbh result is written straight to dst
            * without any further conversion. Only s1 and v1 sources are handled; the
            * unsigned or signed opcode is chosen from instr->op. */
1820       Temp src = get_alu_src(ctx, instr->src[0]);
1821       if (src.regClass() == s1) {
1822          aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1823                                                            : aco_opcode::s_flbit_i32;
1824          bld.sop1(op, Definition(dst), src);
1825       } else if (src.regClass() == v1) {
1826          aco_opcode op =
1827             instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1828          emit_vop1_instruction(ctx, instr, op, dst);
1829       } else {
1830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831       }
1832       break;
1833    }
1834    case nir_op_bitfield_reverse: {
           /* Reverse the bit order of src[0]. Only s1 (s_brev_b32) and v1 (v_bfrev_b32)
            * destinations are handled. */
1835       if (dst.regClass() == s1) {
1836          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1837       } else if (dst.regClass() == v1) {
1838          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1839       } else {
1840          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1841       }
1842       break;
1843    }
1844    case nir_op_iadd: {
           /* Integer addition. The first chain handles the cases that map onto one helper
            * call (s1, destinations of at most 2 bytes on GFX8+/GFX10+, packed 16-bit);
            * each of those breaks out immediately. */
1845       if (dst.regClass() == s1) {
1846          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1847          break;
1848       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1849          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1850          break;
1851       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1852          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1853          break;
1854       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1855          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1856          break;
1857       }
1858
1859       Temp src0 = get_alu_src(ctx, instr->src[0]);
1860       Temp src1 = get_alu_src(ctx, instr->src[1]);
           /* Remaining VGPR destinations of up to 4 bytes use vadd32(); the nuw() builder
            * variant is used when the NIR instruction is flagged no_unsigned_wrap. */
1861       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1862          if (instr->no_unsigned_wrap)
1863             bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1864          else
1865             bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1866          break;
1867       }
1868
           /* 64-bit path: split both sources into dwords, add the low halves, then add the
            * high halves together with the carry.
            * NOTE(review): the high-dword temporaries (src01, src11) are created with
            * dst.type() while the low ones use the source type -- confirm intent. */
1869       assert(src0.size() == 2 && src1.size() == 2);
1870       Temp src00 = bld.tmp(src0.type(), 1);
1871       Temp src01 = bld.tmp(dst.type(), 1);
1872       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1873       Temp src10 = bld.tmp(src1.type(), 1);
1874       Temp src11 = bld.tmp(dst.type(), 1);
1875       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1876
1877       if (dst.regClass() == s2) {
              /* The carry travels through scc from s_add_u32 into s_addc_u32. */
1878          Temp carry = bld.tmp(s1);
1879          Temp dst0 =
1880             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1881          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1882                               bld.scc(carry));
1883          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1884       } else if (dst.regClass() == v2) {
              /* The first vadd32() requests a carry-out (second definition); the second
               * vadd32() takes it as carry-in. */
1885          Temp dst0 = bld.tmp(v1);
1886          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1887          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1888          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1889       } else {
1890          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1891       }
1892       break;
1893    }
1894    case nir_op_uadd_sat: {
           /* Unsigned saturating addition. The vector paths either set the clamp bit on the
            * add instruction or select all-ones when the add carries out; the scalar paths
            * select all-ones with s_cselect on the carry held in scc. */
1895       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed 16-bit: v_pk_add_u16 with clamp set. */
1896          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1897          add_instr->valu().clamp = 1;
1898          break;
1899       }
1900       Temp src0 = get_alu_src(ctx, instr->src[0]);
1901       Temp src1 = get_alu_src(ctx, instr->src[1]);
1902       if (dst.regClass() == s1) {
1903          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1904          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1905          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1906                   bld.scc(carry));
1907          break;
1908       } else if (dst.regClass() == v2b) {
1909          Instruction* add_instr;
1910          if (ctx->program->gfx_level >= GFX10) {
1911             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1912          } else {
                 /* Before GFX10: if src1 is an SGPR the operands are swapped first (the add
                  * is commutative), and the second operand is then forced into a VGPR. */
1913             if (src1.type() == RegType::sgpr)
1914                std::swap(src0, src1);
1915             add_instr =
1916                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1917          }
1918          add_instr->valu().clamp = 1;
1919          break;
1920       } else if (dst.regClass() == v1) {
1921          uadd32_sat(bld, Definition(dst), src0, src1);
1922          break;
1923       }
1924
           /* 64-bit paths: split both sources into low/high dwords. */
1925       assert(src0.size() == 2 && src1.size() == 2);
1926
1927       Temp src00 = bld.tmp(src0.type(), 1);
1928       Temp src01 = bld.tmp(src0.type(), 1);
1929       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1930       Temp src10 = bld.tmp(src1.type(), 1);
1931       Temp src11 = bld.tmp(src1.type(), 1);
1932       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1933
1934       if (dst.regClass() == s2) {
              /* Add with carry across both dwords, then select all-ones (c64(-1)) instead
               * of the unsaturated sum when the high add carried out. */
1935          Temp carry0 = bld.tmp(s1);
1936          Temp carry1 = bld.tmp(s1);
1937
1938          Temp no_sat0 =
1939             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1940          Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1941                                  src01, src11, bld.scc(carry0));
1942
1943          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1944
1945          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1946                   bld.scc(carry1));
1947       } else if (dst.regClass() == v2) {
1948          Temp no_sat0 = bld.tmp(v1);
1949          Temp dst0 = bld.tmp(v1);
1950          Temp dst1 = bld.tmp(v1);
1951
1952          Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1953          Temp carry1;
1954
1955          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: the high-dword add-with-carry writes dst1 directly with clamp set
                  * and also produces carry1. */
1956             carry1 = bld.tmp(bld.lm);
1957             bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1958                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1959                ->valu()
1960                .clamp = 1;
1961          } else {
                 /* Before GFX8: unsaturated high add, then select -1 for dst1 on carry1. */
1962             Temp no_sat1 = bld.tmp(v1);
1963             carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1964             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1965                          carry1);
1966          }
1967
              /* The low dword is replaced by -1 whenever the high add carried out. */
1968          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1969                       carry1);
1970          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1971       } else {
1972          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1973       }
1974       break;
1975    }
1976    case nir_op_iadd_sat: {
           /* Signed saturating addition. Only packed 16-bit, s1, v2b and v1 destinations
            * are handled; anything else is reported through isel_err(). */
1977       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed 16-bit: v_pk_add_i16 with clamp set. */
1978          Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1979          add_instr->valu().clamp = 1;
1980          break;
1981       }
1982       Temp src0 = get_alu_src(ctx, instr->src[0]);
1983       Temp src1 = get_alu_src(ctx, instr->src[1]);
1984       if (dst.regClass() == s1) {
              /* cond is the scc result of (src1 < 0). bound = INT32_MAX + cond, i.e.
               * INT32_MAX for a non-negative src1 and INT32_MAX + 1 for a negative one.
               * The scc definition of s_add_i32 is captured as "overflow" and selects
               * bound over the plain sum. */
1985          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1986          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1987                                Operand::c32(INT32_MAX), cond);
1988          Temp overflow = bld.tmp(s1);
1989          Temp add =
1990             bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1991          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1992          break;
1993       }
1994
           /* Vector paths: src1 is forced into a VGPR, then a VOP3 add with clamp set. */
1995       src1 = as_vgpr(ctx, src1);
1996
1997       if (dst.regClass() == v2b) {
1998          Instruction* add_instr =
1999             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
2000          add_instr->valu().clamp = 1;
2001       } else if (dst.regClass() == v1) {
2002          Instruction* add_instr =
2003             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
2004          add_instr->valu().clamp = 1;
2005       } else {
2006          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2007       }
2008       break;
2009    }
2010    case nir_op_uadd_carry: {
           /* Carry-out of the unsigned addition src[0] + src[1]. The sum itself is written
            * to a throw-away definition; only the carry is kept. */
2011       Temp src0 = get_alu_src(ctx, instr->src[0]);
2012       Temp src1 = get_alu_src(ctx, instr->src[1]);
2013       if (dst.regClass() == s1) {
              /* The scc definition of s_add_u32 is bound directly to dst. */
2014          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2015          break;
2016       }
2017       if (dst.regClass() == v1) {
              /* Turn the carry mask into 0 or 1 with v_cndmask_b32. */
2018          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
2019          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2020                       carry);
2021          break;
2022       }
2023
           /* 64-bit paths: split the sources, chain the carry through both dword adds and
            * build the result as the vector (carry, 0). */
2024       Temp src00 = bld.tmp(src0.type(), 1);
2025       Temp src01 = bld.tmp(dst.type(), 1);
2026       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2027       Temp src10 = bld.tmp(src1.type(), 1);
2028       Temp src11 = bld.tmp(dst.type(), 1);
2029       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2030       if (dst.regClass() == s2) {
2031          Temp carry = bld.tmp(s1);
2032          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
2033          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2034                           bld.scc(carry))
2035                     .def(1)
2036                     .getTemp();
2037          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2038       } else if (dst.regClass() == v2) {
2039          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
2040          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
2041          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2042                               Operand::c32(1u), carry);
2043          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2044       } else {
2045          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2046       }
2047       break;
2048    }
2049    case nir_op_isub: {
           /* Integer subtraction src[0] - src[1]. s1 and packed 16-bit destinations map
            * onto a single helper call and break out immediately. */
2050       if (dst.regClass() == s1) {
2051          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2052          break;
2053       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2054          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2055          break;
2056       }
2057
2058       Temp src0 = get_alu_src(ctx, instr->src[0]);
2059       Temp src1 = get_alu_src(ctx, instr->src[1]);
2060       if (dst.regClass() == v1) {
2061          bld.vsub32(Definition(dst), src0, src1);
2062          break;
2063       } else if (dst.bytes() <= 2) {
              /* Destinations of at most 2 bytes: GFX10+ uses the _e64 opcode. Otherwise,
               * when src1 is an SGPR, v_subrev_u16 is emitted with the operands swapped and
               * src0 forced into a VGPR; else GFX8+ uses v_sub_u16 with src1 forced into a
               * VGPR; older levels fall back to vsub32(). */
2064          if (ctx->program->gfx_level >= GFX10)
2065             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
2066          else if (src1.type() == RegType::sgpr)
2067             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2068          else if (ctx->program->gfx_level >= GFX8)
2069             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2070          else
2071             bld.vsub32(Definition(dst), src0, src1);
2072          break;
2073       }
2074
           /* 64-bit paths: split both sources into dwords, subtract the low halves, then
            * subtract the high halves together with the borrow. */
2075       Temp src00 = bld.tmp(src0.type(), 1);
2076       Temp src01 = bld.tmp(dst.type(), 1);
2077       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2078       Temp src10 = bld.tmp(src1.type(), 1);
2079       Temp src11 = bld.tmp(dst.type(), 1);
2080       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2081       if (dst.regClass() == s2) {
              /* The borrow travels through scc from s_sub_u32 into s_subb_u32. */
2082          Temp borrow = bld.tmp(s1);
2083          Temp dst0 =
2084             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2085          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
2086                               bld.scc(borrow));
2087          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2088       } else if (dst.regClass() == v2) {
2089          Temp lower = bld.tmp(v1);
2090          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
2091          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
2092          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2093       } else {
2094          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2095       }
2096       break;
2097    }
2098    case nir_op_usub_borrow: {
           /* Borrow-out of the unsigned subtraction src[0] - src[1]. The difference itself
            * is written to a throw-away definition; only the borrow is kept. */
2099       Temp src0 = get_alu_src(ctx, instr->src[0]);
2100       Temp src1 = get_alu_src(ctx, instr->src[1]);
2101       if (dst.regClass() == s1) {
              /* The scc definition of s_sub_u32 is bound directly to dst. */
2102          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2103          break;
2104       } else if (dst.regClass() == v1) {
              /* Turn the borrow mask into 0 or 1 with v_cndmask_b32. */
2105          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
2106          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2107                       borrow);
2108          break;
2109       }
2110
           /* 64-bit paths: split the sources, chain the borrow through both dword
            * subtractions and build the result as the vector (borrow, 0). */
2111       Temp src00 = bld.tmp(src0.type(), 1);
2112       Temp src01 = bld.tmp(dst.type(), 1);
2113       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2114       Temp src10 = bld.tmp(src1.type(), 1);
2115       Temp src11 = bld.tmp(dst.type(), 1);
2116       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2117       if (dst.regClass() == s2) {
2118          Temp borrow = bld.tmp(s1);
2119          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2120          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2121                            bld.scc(borrow))
2122                      .def(1)
2123                      .getTemp();
2124          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2125       } else if (dst.regClass() == v2) {
2126          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2127          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2128          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2129                                Operand::c32(1u), borrow);
2130          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2131       } else {
2132          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2133       }
2134       break;
2135    }
2136    case nir_op_usub_sat: {
           /* Saturating unsigned subtraction. Packed 16-bit, s1, v2b and v1 destinations
            * each have a dedicated path that leaves the switch early; what remains is
            * the two-dword (s2 / v2) case. */
2137       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with 16-bit components: packed subtract with the clamp bit set. */
2138          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2139          sub_instr->valu().clamp = 1;
2140          break;
2141       }
2142       Temp src0 = get_alu_src(ctx, instr->src[0]);
2143       Temp src1 = get_alu_src(ctx, instr->src[1]);
2144       if (dst.regClass() == s1) {
              /* Plain subtract, then s_cselect_b32 chooses between the constant 0 and the
               * raw difference based on the SCC (carry) result. */
2145          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2146          bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2147          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2148          break;
2149       } else if (dst.regClass() == v2b) {
2150          Instruction* sub_instr;
2151          if (ctx->program->gfx_level >= GFX10) {
2152             sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2153          } else {
                 /* Before GFX10: if src1 is an SGPR, swap the operands and use the
                  * reversed-subtract opcode so that the second operand can be a VGPR. */
2154             aco_opcode op = aco_opcode::v_sub_u16;
2155             if (src1.type() == RegType::sgpr) {
2156                std::swap(src0, src1);
2157                op = aco_opcode::v_subrev_u16;
2158             }
2159             sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2160          }
2161          sub_instr->valu().clamp = 1;
2162          break;
2163       } else if (dst.regClass() == v1) {
2164          usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2165          break;
2166       }
2167
           /* Two-dword case: split both sources into low (x0) and high (x1) halves. */
2168       assert(src0.size() == 2 && src1.size() == 2);
2169       Temp src00 = bld.tmp(src0.type(), 1);
2170       Temp src01 = bld.tmp(src0.type(), 1);
2171       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2172       Temp src10 = bld.tmp(src1.type(), 1);
2173       Temp src11 = bld.tmp(src1.type(), 1);
2174       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2175
2176       if (dst.regClass() == s2) {
              /* Subtract with borrow chaining, rebuild the unsaturated 64-bit value, then
               * select between constant 0 and that value using the high-half carry. */
2177          Temp carry0 = bld.tmp(s1);
2178          Temp carry1 = bld.tmp(s1);
2179
2180          Temp no_sat0 =
2181             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2182          Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2183                                  src01, src11, bld.scc(carry0));
2184
2185          Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2186
2187          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2188                   bld.scc(carry1));
2189       } else if (dst.regClass() == v2) {
2190          Temp no_sat0 = bld.tmp(v1);
2191          Temp dst0 = bld.tmp(v1);
2192          Temp dst1 = bld.tmp(v1);
2193
2194          Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2195          Temp carry1;
2196
2197          if (ctx->program->gfx_level >= GFX8) {
                 /* GFX8+: the high half is produced directly by v_subb_co_u32 with the
                  * clamp bit set, so no separate select is emitted for dst1. */
2198             carry1 = bld.tmp(bld.lm);
2199             bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2200                          as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2201                ->valu()
2202                .clamp = 1;
2203          } else {
                 /* Older levels: unsaturated high half, then v_cndmask_b32 between it and 0
                  * using the final carry. */
2204             Temp no_sat1 = bld.tmp(v1);
2205             carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2206             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2207                          carry1);
2208          }
2209
              /* The low half is always selected against 0 with the final (high-half) carry. */
2210          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2211                       carry1);
2212          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2213       } else {
2214          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2215       }
2216       break;
2217    }
2218    case nir_op_isub_sat: {
           /* Saturating signed subtraction. Handles packed 16-bit (v1 with 16-bit
            * components), s1, v2b and v1 destinations; anything else is reported as
            * unimplemented. */
2219       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2220          Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2221          sub_instr->valu().clamp = 1;
2222          break;
2223       }
2224       Temp src0 = get_alu_src(ctx, instr->src[0]);
2225       Temp src1 = get_alu_src(ctx, instr->src[1]);
2226       if (dst.regClass() == s1) {
              /* bound = INT32_MAX + (src1 > 0): the comparison result is added to INT32_MAX,
               * so a positive src1 wraps the bound to 0x80000000. The bound replaces the
               * raw difference when s_sub_i32 reports overflow through SCC. */
2227          Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2228          Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2229                                Operand::c32(INT32_MAX), cond);
2230          Temp overflow = bld.tmp(s1);
2231          Temp sub =
2232             bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2233          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2234          break;
2235       }
2236
           /* Vector paths: the second operand is forced into a VGPR, and saturation is
            * requested through the clamp bit of the subtract instruction. */
2237       src1 = as_vgpr(ctx, src1);
2238
2239       if (dst.regClass() == v2b) {
2240          Instruction* sub_instr =
2241             bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2242          sub_instr->valu().clamp = 1;
2243       } else if (dst.regClass() == v1) {
2244          Instruction* sub_instr =
2245             bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2246          sub_instr->valu().clamp = 1;
2247       } else {
2248          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2249       }
2250       break;
2251    }
2252    case nir_op_imul: {
           /* Integer multiply (low bits). The opcode is chosen from the destination size,
            * the hardware generation, and, for VGPR destinations, the known upper bounds
            * of the two sources. */
2253       if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2254          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2255       } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2256          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2257       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination holding 16-bit components: packed multiply. */
2258          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2259       } else if (dst.type() == RegType::vgpr) {
2260          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2261          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2262
2263          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both upper bounds fit in 24 bits, so the 24-bit multiply is used.
                  * nuw_16bit is set when the product of the bounds also fits in 16 bits;
                  * the two 0xffff checks keep src0_ub * src1_ub within uint32_t range. */
2264             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2265             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2266                                   true /* commutative */, false, false, nuw_16bit);
2267          } else if (nir_src_is_const(instr->src[0].src)) {
                 /* One operand is a NIR constant: multiply the other one by the immediate. */
2268             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2269                           nir_src_as_uint(instr->src[0].src), false);
2270          } else if (nir_src_is_const(instr->src[1].src)) {
2271             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2272                           nir_src_as_uint(instr->src[1].src), false);
2273          } else {
2274             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2275          }
2276       } else if (dst.regClass() == s1) {
2277          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2278       } else {
2279          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2280       }
2281       break;
2282    }
2283    case nir_op_umul_high: {
           /* High half of an unsigned multiply. An s1 destination uses the scalar opcode
            * on GFX9+; every other 4-byte destination (including s1 on older levels)
            * goes through a vector multiply. */
2284       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2285          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2286       } else if (dst.bytes() == 4) {
2287          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2288          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2289
              /* For an s1 destination the vector result lands in a fresh v1 temporary and
               * is converted back with p_as_uniform below. */
2290          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2291          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
                 /* Both upper bounds fit in 24 bits: use the 24-bit variant. */
2292             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2293          } else {
2294             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2295          }
2296
2297          if (dst.regClass() == s1)
2298             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2299       } else {
2300          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2301       }
2302       break;
2303    }
2304    case nir_op_imul_high: {
           /* High half of a signed multiply for v1 and s1 destinations. */
2305       if (dst.regClass() == v1) {
2306          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2307       } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2308          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2309       } else if (dst.regClass() == s1) {
              /* s1 below GFX9: compute with the vector opcode into a v1 temporary, then
               * convert the result back with p_as_uniform. */
2310          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2311                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2312          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2313       } else {
2314          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2315       }
2316       break;
2317    }
2318    case nir_op_fmul: {
           /* Floating-point multiply: picks the f16, packed f16, f32 or f64 opcode from
            * the destination register class and the NIR bit size. */
2319       if (dst.regClass() == v2b) {
2320          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2321       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with 16-bit components: packed form. */
2322          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2323       } else if (dst.regClass() == v1) {
2324          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2325       } else if (dst.regClass() == v2) {
2326          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2327       } else {
2328          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2329       }
2330       break;
2331    }
2332    case nir_op_fmulz: {
           /* Only a v1 destination is supported; it maps to v_mul_legacy_f32. */
2333       if (dst.regClass() == v1) {
2334          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2335       } else {
2336          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2337       }
2338       break;
2339    }
2340    case nir_op_fadd: {
           /* Floating-point add: picks the f16, packed f16, f32 or f64 opcode from the
            * destination register class and the NIR bit size. */
2341       if (dst.regClass() == v2b) {
2342          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2343       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* v1 destination with 16-bit components: packed form. */
2344          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2345       } else if (dst.regClass() == v1) {
2346          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2347       } else if (dst.regClass() == v2) {
2348          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2349       } else {
2350          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2351       }
2352       break;
2353    }
2354    case nir_op_fsub: {
           /* Floating-point subtract. The packed-f16 and f64 paths are emitted as an add
            * with the second operand negated; the f16/f32 paths use sub or subrev
            * depending on which operands live in VGPRs. */
2355       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed f16: v_pk_add_f16 with both halves of operand 1 negated. */
2356          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2357          VALU_instruction& sub = add->valu();
2358          sub.neg_lo[1] = true;
2359          sub.neg_hi[1] = true;
2360          break;
2361       }
2362
2363       Temp src0 = get_alu_src(ctx, instr->src[0]);
2364       Temp src1 = get_alu_src(ctx, instr->src[1]);
2365       if (dst.regClass() == v2b) {
              /* The reversed opcode is used only when src0 is a VGPR and src1 is not. */
2366          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2367             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2368          else
2369             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2370       } else if (dst.regClass() == v1) {
              /* Same operand-placement rule as the f16 path above. */
2371          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2372             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2373          else
2374             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2375       } else if (dst.regClass() == v2) {
              /* f64: v_add_f64 with the neg modifier on operand 1. */
2376          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2377                                      as_vgpr(ctx, src1));
2378          add->valu().neg[1] = true;
2379       } else {
2380          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2381       }
2382       break;
2383    }
2384    case nir_op_ffma: {
           /* Fused multiply-add for f16, packed f16, f32 and f64 destinations. */
2385       if (dst.regClass() == v2b) {
2386          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2387       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed f16: built by hand because three sources each need their NIR swizzle
               * translated into the opsel_lo / opsel_hi bit masks. */
2388          assert(instr->def.num_components == 2);
2389
2390          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2391          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2392          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2393
2394          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2395          unsigned opsel_lo = 0, opsel_hi = 0;
2396          for (unsigned i = 0; i < 3; i++) {
                 /* Bit i of each mask comes from source i: swizzle[0] feeds opsel_lo and
                  * swizzle[1] feeds opsel_hi. */
2397             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2398             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2399          }
2400
2401          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2402       } else if (dst.regClass() == v1) {
              /* f32: the block's must_flush_denorms32 setting is forwarded to the helper. */
2403          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2404                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2405       } else if (dst.regClass() == v2) {
2406          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2407       } else {
2408          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2409       }
2410       break;
2411    }
2412    case nir_op_ffmaz: {
           /* Only a v1 destination is supported; it maps to v_fma_legacy_f32 with three
            * operands and the block's must_flush_denorms32 setting forwarded. */
2413       if (dst.regClass() == v1) {
2414          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2415                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2416       } else {
2417          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2418       }
2419       break;
2420    }
2421    case nir_op_fmax: {
           /* Floating-point maximum for f16, packed f16, f32 and f64 destinations. The f32
            * and f64 paths forward the block's denorm-flush setting to the emit helper;
            * the f16 paths currently do not (see the TODO below). */
2422       if (dst.regClass() == v2b) {
2423          // TODO: check fp_mode.must_flush_denorms16_64
2424          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2425       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2426          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2427       } else if (dst.regClass() == v1) {
2428          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2429                                ctx->block->fp_mode.must_flush_denorms32);
2430       } else if (dst.regClass() == v2) {
2431          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2432                                 ctx->block->fp_mode.must_flush_denorms16_64);
2433       } else {
2434          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2435       }
2436       break;
2437    }
2438    case nir_op_fmin: {
           /* Floating-point minimum for f16, packed f16, f32 and f64 destinations. The f32
            * and f64 paths forward the block's denorm-flush setting to the emit helper;
            * the f16 paths currently do not (see the TODO below). */
2439       if (dst.regClass() == v2b) {
2440          // TODO: check fp_mode.must_flush_denorms16_64
2441          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2442       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* NOTE(review): this call passes an extra trailing `true` that the matching
               * v_pk_max_f16 call in the fmax case does not - confirm this is intended. */
2443          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2444       } else if (dst.regClass() == v1) {
2445          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2446                                ctx->block->fp_mode.must_flush_denorms32);
2447       } else if (dst.regClass() == v2) {
2448          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2449                                 ctx->block->fp_mode.must_flush_denorms16_64);
2450       } else {
2451          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2452       }
2453       break;
2454    }
2455    case nir_op_sdot_4x8_iadd: {
           /* GFX11+ uses v_dot4_i32_iu8 with an extra 0x3 argument; older levels use
            * v_dot4_i32_i8. NOTE(review): 0x3 is presumably a per-source signedness mask
            * - confirm against emit_idot_instruction. */
2456       if (ctx->options->gfx_level >= GFX11)
2457          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2458       else
2459          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2460       break;
2461    }
2462    case nir_op_sdot_4x8_iadd_sat: {
           /* Same opcode selection as nir_op_sdot_4x8_iadd, but the boolean argument is
            * true. NOTE(review): presumably that flag requests saturation - confirm
            * against emit_idot_instruction. */
2463       if (ctx->options->gfx_level >= GFX11)
2464          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2465       else
2466          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2467       break;
2468    }
2469    case nir_op_sudot_4x8_iadd: {
           /* Always v_dot4_i32_iu8, with 0x1 as the extra argument and the boolean false.
            * No hardware-level check is made here. */
2470       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2471       break;
2472    }
2473    case nir_op_sudot_4x8_iadd_sat: {
           /* Same as nir_op_sudot_4x8_iadd but with the boolean argument set to true. */
2474       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2475       break;
2476    }
2477    case nir_op_udot_4x8_uadd: {
           /* Maps to v_dot4_u32_u8 with the boolean argument false. */
2478       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2479       break;
2480    }
2481    case nir_op_udot_4x8_uadd_sat: {
           /* Maps to v_dot4_u32_u8 with the boolean argument true. */
2482       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2483       break;
2484    }
2485    case nir_op_sdot_2x16_iadd: {
           /* Maps to v_dot2_i32_i16 with the boolean argument false. */
2486       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2487       break;
2488    }
2489    case nir_op_sdot_2x16_iadd_sat: {
           /* Maps to v_dot2_i32_i16 with the boolean argument true. */
2490       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2491       break;
2492    }
2493    case nir_op_udot_2x16_uadd: {
           /* Maps to v_dot2_u32_u16 with the boolean argument false. */
2494       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2495       break;
2496    }
2497    case nir_op_udot_2x16_uadd_sat: {
           /* Maps to v_dot2_u32_u16 with the boolean argument true. */
2498       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2499       break;
2500    }
2501    case nir_op_cube_amd: {
           /* Reads a 3-component source, extracts each component as a v1, and runs the
            * four v_cube*_f32 instructions on the same three operands. The results are
            * packed into dst in the order tc, sc, ma, id. */
2502       Temp in = get_alu_src(ctx, instr->src[0], 3);
2503       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2504                      emit_extract_vector(ctx, in, 2, v1)};
2505       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2506       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2507       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2508       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2509       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2510       break;
2511    }
2512    case nir_op_bcsel: {
           /* Selection is delegated entirely to emit_bcsel. */
2513       emit_bcsel(ctx, instr, dst);
2514       break;
2515    }
2516    case nir_op_frsq: {
           /* Reciprocal square root: f16 and f64 map directly to a VOP1 opcode, while
            * f32 goes through the emit_rsq helper. */
2517       if (dst.regClass() == v2b) {
2518          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2519       } else if (dst.regClass() == v1) {
2520          Temp src = get_alu_src(ctx, instr->src[0]);
2521          emit_rsq(ctx, bld, Definition(dst), src);
2522       } else if (dst.regClass() == v2) {
2523          /* Lowered at NIR level for precision reasons. */
2524          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2525       } else {
2526          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2527       }
2528       break;
2529    }
2530    case nir_op_fneg: {
           /* Floating-point negate. The 16- and 32-bit paths are emitted as a multiply;
            * the 64-bit path flips the top bit of the upper dword. */
2531       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed f16: multiply by 0x3C00 (1.0 in half precision) with both halves of
               * operand 0 negated; the swizzle low bits are passed as opsel_lo / opsel_hi. */
2532          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2533          Instruction* vop3p =
2534             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2535                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2536          vop3p->valu().neg_lo[0] = true;
2537          vop3p->valu().neg_hi[0] = true;
2538          break;
2539       }
2540       Temp src = get_alu_src(ctx, instr->src[0]);
2541       if (dst.regClass() == v2b) {
              /* 0xbc00 is -1.0 in half precision. */
2542          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2543       } else if (dst.regClass() == v1) {
              /* 0xbf800000 is -1.0 in single precision. */
2544          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2545                   as_vgpr(ctx, src));
2546       } else if (dst.regClass() == v2) {
              /* When the block requires 16/64-bit denorm flushing, the source is first
               * multiplied by 1.0 (0x3FF0000000000000); presumably this makes the hardware
               * flush denormals before the raw bit manipulation below. */
2547          if (ctx->block->fp_mode.must_flush_denorms16_64)
2548             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2549                            as_vgpr(ctx, src));
              /* XOR the upper dword with 0x80000000 and reassemble the value. */
2550          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2551          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2552          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2553          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2554       } else {
2555          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556       }
2557       break;
2558    }
2559    case nir_op_fabs: {
           /* Floating-point absolute value. Packed f16 uses a max of the source against its
            * own negation; f16/f32 multiply by 1.0 with the abs modifier; f64 clears the
            * top bit of the upper dword. */
2560       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* The same source is passed as both operands, so each opsel mask is either 3
               * or 0 depending on the low bit of the matching swizzle. Operand 1 is
               * negated in both halves. */
2561          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2562          Instruction* vop3p =
2563             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2564                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2565                .instr;
2566          vop3p->valu().neg_lo[1] = true;
2567          vop3p->valu().neg_hi[1] = true;
2568          break;
2569       }
2570       Temp src = get_alu_src(ctx, instr->src[0]);
2571       if (dst.regClass() == v2b) {
              /* 0x3c00 is 1.0 in half precision; abs is applied to operand 1 (the source). */
2572          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2573                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2574                                .instr;
2575          mul->valu().abs[1] = true;
2576       } else if (dst.regClass() == v1) {
              /* 0x3f800000 is 1.0 in single precision; abs is applied to operand 1. */
2577          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2578                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2579                                .instr;
2580          mul->valu().abs[1] = true;
2581       } else if (dst.regClass() == v2) {
              /* Optional multiply by 1.0 when 16/64-bit denorm flushing is required, then
               * AND the upper dword with 0x7FFFFFFF and reassemble the value. */
2582          if (ctx->block->fp_mode.must_flush_denorms16_64)
2583             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2584                            as_vgpr(ctx, src));
2585          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2586          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2587          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2588          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2589       } else {
2590          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2591       }
2592       break;
2593    }
2594    case nir_op_fsat: {
           /* Floating-point saturate. Packed f16 and f64 use an arithmetic instruction
            * with the clamp bit set; scalar f16 and f32 use a three-operand median with
            * the constants 0 and 1.0. */
2595       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
              /* Packed f16: multiply by 0x3C00 (1.0 in half precision) with clamp enabled. */
2596          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2597          Instruction* vop3p =
2598             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2599                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2600          vop3p->valu().clamp = true;
2601          break;
2602       }
2603       Temp src = get_alu_src(ctx, instr->src[0]);
2604       if (dst.regClass() == v2b) {
              /* v_med3_f16 over (0, 1.0, src). */
2605          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2606                   src);
2607       } else if (dst.regClass() == v1) {
              /* v_med3_f32 over (0, 1.0, src). */
2608          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2609                   Operand::c32(0x3f800000u), src);
2610          /* apparently, it is not necessary to flush denorms if this instruction is used with these
2611           * operands */
2612          // TODO: confirm that this holds under any circumstances
2613       } else if (dst.regClass() == v2) {
              /* f64: add zero with the clamp bit set. */
2614          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2615          add->valu().clamp = true;
2616       } else {
2617          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2618       }
2619       break;
2620    }
2621    case nir_op_flog2: {
           /* Base-2 logarithm: f16 maps directly to v_log_f16, f32 goes through the
            * emit_log2 helper. No other sizes are handled. */
2622       if (dst.regClass() == v2b) {
2623          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2624       } else if (dst.regClass() == v1) {
2625          Temp src = get_alu_src(ctx, instr->src[0]);
2626          emit_log2(ctx, bld, Definition(dst), src);
2627       } else {
2628          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2629       }
2630       break;
2631    }
2632    case nir_op_frcp: {
           /* Reciprocal: f16 and f64 map directly to a VOP1 opcode, while f32 goes through
            * the emit_rcp helper. */
2633       if (dst.regClass() == v2b) {
2634          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2635       } else if (dst.regClass() == v1) {
2636          Temp src = get_alu_src(ctx, instr->src[0]);
2637          emit_rcp(ctx, bld, Definition(dst), src);
2638       } else if (dst.regClass() == v2) {
2639          /* Lowered at NIR level for precision reasons. */
2640          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2641       } else {
2642          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2643       }
2644       break;
2645    }
2646    case nir_op_fexp2: {
           /* Base-2 exponential: direct VOP1 mapping for f16 and f32 only. */
2647       if (dst.regClass() == v2b) {
2648          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2649       } else if (dst.regClass() == v1) {
2650          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2651       } else {
2652          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2653       }
2654       break;
2655    }
2656    case nir_op_fsqrt: {
           /* Square root: f16 and f64 map directly to a VOP1 opcode, while f32 goes through
            * the emit_sqrt helper. */
2657       if (dst.regClass() == v2b) {
2658          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2659       } else if (dst.regClass() == v1) {
2660          Temp src = get_alu_src(ctx, instr->src[0]);
2661          emit_sqrt(ctx, bld, Definition(dst), src);
2662       } else if (dst.regClass() == v2) {
2663          /* Lowered at NIR level for precision reasons. */
2664          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2665       } else {
2666          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2667       }
2668       break;
2669    }
2670    case nir_op_ffract: {
           /* Fractional part: direct VOP1 mapping for f16, f32 and f64. */
2671       if (dst.regClass() == v2b) {
2672          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2673       } else if (dst.regClass() == v1) {
2674          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2675       } else if (dst.regClass() == v2) {
2676          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2677       } else {
2678          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2679       }
2680       break;
2681    }
2682    case nir_op_ffloor: {
           /* Floor: f16 and f32 map directly to a VOP1 opcode; f64 is delegated to the
            * emit_floor_f64 helper. */
2683       if (dst.regClass() == v2b) {
2684          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2685       } else if (dst.regClass() == v1) {
2686          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2687       } else if (dst.regClass() == v2) {
2688          Temp src = get_alu_src(ctx, instr->src[0]);
2689          emit_floor_f64(ctx, bld, Definition(dst), src);
2690       } else {
2691          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2692       }
2693       break;
2694    }
2695    case nir_op_fceil: {
           /* Ceiling: f16 and f32 map directly to a VOP1 opcode; f64 uses v_ceil_f64 on
            * GFX7+ and a manual lowering otherwise. */
2696       if (dst.regClass() == v2b) {
2697          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2698       } else if (dst.regClass() == v1) {
2699          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2700       } else if (dst.regClass() == v2) {
2701          if (ctx->options->gfx_level >= GFX7) {
2702             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2703          } else {
2704             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2705             /* trunc = trunc(src0)
2706              * if (src0 > 0.0 && src0 != trunc)
2707              *    trunc += 1.0
2708              */
2709             Temp src0 = get_alu_src(ctx, instr->src[0]);
2710             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
                 /* tmp0: src0 > 0.0, tmp1: src0 != trunc; both lane masks are ANDed. */
2711             Temp tmp0 =
2712                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2713             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2714             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
                 /* Build the addend as a double: the upper dword is selected between 0 and
                  * 0x3ff00000 by cond and the lower dword is 0, i.e. either 0.0 or 1.0. */
2715             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2716                                 bld.copy(bld.def(v1), Operand::zero()),
2717                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2718             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2719                              bld.copy(bld.def(v1), Operand::zero()), add);
2720             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2721          }
2722       } else {
2723          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2724       }
2725       break;
2726    }
2727    case nir_op_ftrunc: {
           /* Truncate toward zero: f16 and f32 map directly to a VOP1 opcode; f64 is
            * delegated to the emit_trunc_f64 helper. */
2728       if (dst.regClass() == v2b) {
2729          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2730       } else if (dst.regClass() == v1) {
2731          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2732       } else if (dst.regClass() == v2) {
2733          Temp src = get_alu_src(ctx, instr->src[0]);
2734          emit_trunc_f64(ctx, bld, Definition(dst), src);
2735       } else {
2736          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2737       }
2738       break;
2739    }
2740    case nir_op_fround_even: {
           /* Round to nearest even. v2b and v1 destinations use v_rndne_f16 /
            * v_rndne_f32. A v2 destination uses v_rndne_f64 on GFX7 and newer and a
            * hand-written sequence otherwise.
            */
2741       if (dst.regClass() == v2b) {
2742          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2743       } else if (dst.regClass() == v1) {
2744          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2745       } else if (dst.regClass() == v2) {
2746          if (ctx->options->gfx_level >= GFX7) {
2747             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2748          } else {
2749             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2750             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2751             Temp src0 = get_alu_src(ctx, instr->src[0]);
2752             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2753
                 /* s_brev_b32 applied to -2u (0xfffffffe). Presumably a bit reversal,
                  * which would give 0x7fffffff, i.e. every bit except the sign bit
                  * - TODO confirm against the ISA reference.
                  */
2754             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2755                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
                 /* 0x43300000 is the high dword of the double 2^52. Assuming v_bfi_b32
                  * takes the masked bits from its second operand and the remaining
                  * bits from its third, bfi is that constant carrying the sign bit of
                  * src0_hi - TODO confirm.
                  */
2756             Temp bfi =
2757                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2758                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
                 /* tmp = src0 + {0, bfi}. The second v_add_f64 adds the same
                  * {0, bfi} vector with the neg modifier set on that operand.
                  */
2759             Temp tmp =
2760                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2761                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2762             Instruction* sub =
2763                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2764                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2765             sub->valu().neg[1] = true;
2766             tmp = sub->definitions[0].getTemp();
2767
                 /* {0xffffffff, 0x432fffff} is the bit pattern of the largest double
                  * below 2^52. The compare has abs set on src0, so cond is set for
                  * lanes where |src0| is greater than that value.
                  */
2768             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2769                                 Operand::c32(0x432fffffu));
2770             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2771             vop3->valu().abs[0] = true;
2772             Temp cond = vop3->definitions[0].getTemp();
2773
                 /* Per dword, choose between the computed tmp and the original src0
                  * based on cond. Presumably v_cndmask_b32 yields its second source
                  * where the condition bit is set, so large inputs pass through
                  * unchanged - TODO confirm operand order.
                  */
2774             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2775             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2776             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2777                                      as_vgpr(ctx, src0_lo), cond);
2778             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2779                                      as_vgpr(ctx, src0_hi), cond);
2780
2781             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2782          }
2783       } else {
2784          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2785       }
2786       break;
2787    }
2788    case nir_op_fsin_amd:
2789    case nir_op_fcos_amd: {
2790       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2791       aco_ptr<Instruction> norm;
2792       if (dst.regClass() == v2b) {
2793          aco_opcode opcode =
2794             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2795          bld.vop1(opcode, Definition(dst), src);
2796       } else if (dst.regClass() == v1) {
2797          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2798          if (ctx->options->gfx_level < GFX9)
2799             src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
2800
2801          aco_opcode opcode =
2802             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2803          bld.vop1(opcode, Definition(dst), src);
2804       } else {
2805          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2806       }
2807       break;
2808    }
2809    case nir_op_ldexp: {
           /* ldexp: v_ldexp_f16 is emitted through the VOP2 helper, the f32 and
            * f64 variants through the VOP3A helper. Selection is by destination
            * register class; anything else is reported via isel_err().
            */
2810       if (dst.regClass() == v2b) {
2811          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2812       } else if (dst.regClass() == v1) {
2813          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2814       } else if (dst.regClass() == v2) {
2815          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2816       } else {
2817          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2818       }
2819       break;
2820    }
2821    case nir_op_frexp_sig: {
           /* Significand part of frexp: one v_frexp_mant_* instruction, chosen by
            * the destination register class (v2b / v1 / v2).
            */
2822       if (dst.regClass() == v2b) {
2823          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2824       } else if (dst.regClass() == v1) {
2825          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2826       } else if (dst.regClass() == v2) {
2827          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2828       } else {
2829          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2830       }
2831       break;
2832    }
2833    case nir_op_frexp_exp: {
           /* Exponent part of frexp. Unlike the neighbouring cases this one
            * dispatches on the *source* bit size rather than the destination class.
            */
2834       if (instr->src[0].src.ssa->bit_size == 16) {
              /* The f16 instruction produces an i16 result. Only its lowest byte is
               * extracted (v1b) and that byte is sign-extended from 8 to 32 bits
               * into dst.
               */
2835          Temp src = get_alu_src(ctx, instr->src[0]);
2836          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2837          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2838          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2839       } else if (instr->src[0].src.ssa->bit_size == 32) {
2840          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2841       } else if (instr->src[0].src.ssa->bit_size == 64) {
2842          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2843       } else {
2844          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2845       }
2846       break;
2847    }
2848    case nir_op_fsign: {
           /* fsign. The source is forced into a VGPR. For 16- and 32-bit
            * destinations the float bits are clamped as a signed integer to
            * [-1, 1] with v_med3 and the result is converted from integer back to
            * float. The 64-bit path builds only the high dword of the result; the
            * low dword is always zero.
            */
2849       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2850       if (dst.regClass() == v2b) {
2851          assert(ctx->program->gfx_level >= GFX9);
2852          /* replace negative zero with positive zero */
2853          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2854          src =
2855             bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2856          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2857       } else if (dst.regClass() == v1) {
              /* Same sequence as the 16-bit path, using the 32-bit opcodes. */
2858          src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2859          src =
2860             bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2861          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2862       } else if (dst.regClass() == v2) {
              /* First select: between 0x3FF00000 (high dword of the double 1.0) and
               * the source's own high dword, on the condition !(0 < src).
               */
2863          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2864          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2865          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2866                                    emit_extract_vector(ctx, src, 1, v1), cond);
2867
              /* Second select: between 0xBFF00000 (high dword of the double -1.0)
               * and the previous result, on the condition 0 <= src.
               */
2868          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2869          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2870          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2871
2872          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2873       } else {
2874          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2875       }
2876       break;
2877    }
2878    case nir_op_f2f16:
2879    case nir_op_f2f16_rtne: {
           /* Float to f16. A 64-bit source is first narrowed to f32 with
            * v_cvt_f32_f64. The _rtne variant uses the pseudo opcode
            * p_cvt_f16_f32_rtne only when the block's round16_64 mode is not
            * already fp_round_ne; otherwise the plain v_cvt_f16_f32 is emitted.
            */
2880       Temp src = get_alu_src(ctx, instr->src[0]);
2881       if (instr->src[0].src.ssa->bit_size == 64)
2882          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2883       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2884          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2885           * keep value numbering and the scheduler simpler.
2886           */
2887          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2888       else
2889          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2890       break;
2891    }
2892    case nir_op_f2f16_rtz: {
           /* Float to f16 with round-toward-zero. A 64-bit source is narrowed to
            * f32 first. If the block's round16_64 mode is already fp_round_tz the
            * plain conversion is used; otherwise v_cvt_pkrtz_f16_f32 is emitted,
            * in its _e64 form on GFX8 and GFX9 and as a VOP2 elsewhere.
            */
2893       Temp src = get_alu_src(ctx, instr->src[0]);
2894       if (instr->src[0].src.ssa->bit_size == 64)
2895          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2896       if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2897          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2898       else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2899          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2900       else
              /* The VOP2 form is given src again (forced into a VGPR) as its second
               * operand instead of a zero constant.
               */
2901          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2902       break;
2903    }
2904    case nir_op_f2f32: {
           /* Float to f32 from a 16- or 64-bit source; any other source size is
            * reported via isel_err().
            */
2905       if (instr->src[0].src.ssa->bit_size == 16) {
2906          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2907       } else if (instr->src[0].src.ssa->bit_size == 64) {
2908          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2909       } else {
2910          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2911       }
2912       break;
2913    }
2914    case nir_op_f2f64: {
           /* Float to f64. A 16-bit source is widened to f32 first, then the
            * f32 -> f64 conversion writes dst.
            */
2915       Temp src = get_alu_src(ctx, instr->src[0]);
2916       if (instr->src[0].src.ssa->bit_size == 16)
2917          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2918       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2919       break;
2920    }
2921    case nir_op_i2f16: {
           /* Signed integer to f16. The destination must be v2b. */
2922       assert(dst.regClass() == v2b);
2923       Temp src = get_alu_src(ctx, instr->src[0]);
2924       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2925       if (input_size <= 16) {
2926          /* Sign-extend the integer to the size expected by the int→float converter used below */
2927          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2928          if (input_size != target_size) {
2929             src = convert_int(ctx, bld, src, input_size, target_size, true);
2930          }
2931       }
2932
2933       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2934          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2935       } else {
2936          /* Large 32bit inputs need to return +-inf/FLOAT_MAX.
2937           *
2938           * This is also the fallback-path taken on GFX7 and earlier, which
2939           * do not support direct f16⟷i16 conversions.
2940           */
2941          src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2942          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2943       }
2944       break;
2945    }
2946    case nir_op_i2f32: {
           /* Signed integer to f32. The destination must be a single dword.
            * Sources wider than 32 bits are reported via isel_err().
            */
2947       assert(dst.size() == 1);
2948       Temp src = get_alu_src(ctx, instr->src[0]);
2949       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2950       if (input_size <= 32) {
2951          if (input_size <= 16) {
2952             /* Sign-extend to 32-bits */
2953             src = convert_int(ctx, bld, src, input_size, 32, true);
2954          }
2955          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2956       } else {
2957          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2958       }
2959       break;
2960    }
2961    case nir_op_i2f64: {
           /* Signed integer to f64. Sources of 16 bits or less are sign-extended to
            * 32 bits before v_cvt_f64_i32; sources wider than 32 bits are reported
            * via isel_err().
            */
2962       if (instr->src[0].src.ssa->bit_size <= 32) {
2963          Temp src = get_alu_src(ctx, instr->src[0]);
2964          if (instr->src[0].src.ssa->bit_size <= 16)
2965             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2966          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2967       } else {
2968          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2969       }
2970       break;
2971    }
2972    case nir_op_u2f16: {
           /* Unsigned integer to f16. The destination must be v2b. Mirrors the
            * signed i2f16 case, with zero-extension (convert_int(..., false)) and
            * the unsigned conversion opcodes.
            */
2973       assert(dst.regClass() == v2b);
2974       Temp src = get_alu_src(ctx, instr->src[0]);
2975       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2976       if (input_size <= 16) {
2977          /* Expand integer to the size expected by the uint→float converter used below */
2978          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2979          if (input_size != target_size) {
2980             src = convert_int(ctx, bld, src, input_size, target_size, false);
2981          }
2982       }
2983
2984       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2985          bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2986       } else {
2987          /* Large 32bit inputs need to return inf/FLOAT_MAX.
2988           *
2989           * This is also the fallback-path taken on GFX7 and earlier, which
2990           * do not support direct f16⟷u16 conversions.
2991           */
2992          src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2993          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2994       }
2995       break;
2996    }
2997    case nir_op_u2f32: {
           /* Unsigned integer to f32. The destination must be a single dword.
            * 8-bit sources use v_cvt_f32_ubyte0 directly; 16-bit sources are
            * zero-extended to 32 bits before v_cvt_f32_u32; sources wider than
            * 32 bits are reported via isel_err().
            */
2998       assert(dst.size() == 1);
2999       Temp src = get_alu_src(ctx, instr->src[0]);
3000       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3001       if (input_size == 8) {
3002          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3003       } else if (input_size <= 32) {
3004          if (input_size == 16)
3005             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3006          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3007       } else {
3008          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3009       }
3010       break;
3011    }
3012    case nir_op_u2f64: {
           /* Unsigned integer to f64. Sources of 16 bits or less are zero-extended
            * to 32 bits before v_cvt_f64_u32; sources wider than 32 bits are
            * reported via isel_err().
            */
3013       if (instr->src[0].src.ssa->bit_size <= 32) {
3014          Temp src = get_alu_src(ctx, instr->src[0]);
3015          if (instr->src[0].src.ssa->bit_size <= 16)
3016             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3017          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3018       } else {
3019          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3020       }
3021       break;
3022    }
3023    case nir_op_f2i8:
3024    case nir_op_f2i16: {
           /* Float to 8/16-bit signed integer, dispatched on the source bit size.
            * Note the final else handles every size other than 16 and 32 with the
            * f64 opcode; there is no isel_err() fallback in this case.
            */
3025       if (instr->src[0].src.ssa->bit_size == 16) {
3026          if (ctx->program->gfx_level >= GFX8) {
3027             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3028          } else {
3029             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
                 /* Go f16 -> f32 -> i32, then narrow from 32 bits to the destination
                  * bit size. For an SGPR destination convert_int() is given an empty
                  * Temp and the result is moved into dst with p_as_uniform.
                  */
3030             Temp tmp = bld.tmp(v1);
3031             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3032             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3033             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3034                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3035             if (dst.type() == RegType::sgpr) {
3036                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3037             }
3038          }
3039       } else if (instr->src[0].src.ssa->bit_size == 32) {
3040          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3041       } else {
3042          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3043       }
3044       break;
3045    }
3046    case nir_op_f2u8:
3047    case nir_op_f2u16: {
           /* Float to 8/16-bit unsigned integer, dispatched on the source bit
            * size. Same structure as the signed f2i8/f2i16 case with the unsigned
            * opcodes; the final else uses the f64 opcode for every size other than
            * 16 and 32.
            */
3048       if (instr->src[0].src.ssa->bit_size == 16) {
3049          if (ctx->program->gfx_level >= GFX8) {
3050             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3051          } else {
3052             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
                 /* Go f16 -> f32 -> u32, then narrow from 32 bits to the destination
                  * bit size. For an SGPR destination convert_int() is given an empty
                  * Temp and the result is moved into dst with p_as_uniform.
                  */
3053             Temp tmp = bld.tmp(v1);
3054             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3055             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3056             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3057                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3058             if (dst.type() == RegType::sgpr) {
3059                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3060             }
3061          }
3062       } else if (instr->src[0].src.ssa->bit_size == 32) {
3063          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3064       } else {
3065          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3066       }
3067       break;
3068    }
3069    case nir_op_f2i32: {
           /* Float to signed 32-bit integer, dispatched on the source bit size.
            * A 16-bit source is widened to f32 first; if dst is not a VGPR the
            * conversion goes to a fresh v1 and is moved into dst with
            * p_as_uniform.
            */
3070       Temp src = get_alu_src(ctx, instr->src[0]);
3071       if (instr->src[0].src.ssa->bit_size == 16) {
3072          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3073          if (dst.type() == RegType::vgpr) {
3074             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3075          } else {
3076             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3077                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3078          }
3079       } else if (instr->src[0].src.ssa->bit_size == 32) {
3080          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3081       } else if (instr->src[0].src.ssa->bit_size == 64) {
3082          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3083       } else {
3084          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3085       }
3086       break;
3087    }
3088    case nir_op_f2u32: {
           /* Float to unsigned 32-bit integer, dispatched on the source bit size.
            * A 16-bit source is widened to f32 first; if dst is not a VGPR the
            * conversion goes to a fresh v1 and is moved into dst with
            * p_as_uniform.
            */
3089       Temp src = get_alu_src(ctx, instr->src[0]);
3090       if (instr->src[0].src.ssa->bit_size == 16) {
3091          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3092          if (dst.type() == RegType::vgpr) {
3093             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3094          } else {
3095             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3096                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3097          }
3098       } else if (instr->src[0].src.ssa->bit_size == 32) {
3099          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3100       } else if (instr->src[0].src.ssa->bit_size == 64) {
3101          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3102       } else {
3103          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3104       }
3105       break;
3106    }
3107    case nir_op_b2f16: {
           /* Boolean to f16. The source must be a lane mask (bld.lm). 0x3c00 is the
            * f16 bit pattern of 1.0.
            */
3108       Temp src = get_alu_src(ctx, instr->src[0]);
3109       assert(src.regClass() == bld.lm);
3110
3111       if (dst.regClass() == s1) {
              /* Scalar result: multiply the scalar condition by the 1.0 pattern. */
3112          src = bool_to_scalar_condition(ctx, src);
3113          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3114       } else if (dst.regClass() == v2b) {
              /* Vector result: per-lane select between zero and the 1.0 pattern. */
3115          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3116          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3117       } else {
3118          unreachable("Wrong destination register class for nir_op_b2f16.");
3119       }
3120       break;
3121    }
3122    case nir_op_b2f32: {
           /* Boolean to f32. The source must be a lane mask (bld.lm). 0x3f800000 is
            * the f32 bit pattern of 1.0.
            */
3123       Temp src = get_alu_src(ctx, instr->src[0]);
3124       assert(src.regClass() == bld.lm);
3125
3126       if (dst.regClass() == s1) {
              /* Scalar result: multiply the scalar condition by the 1.0 pattern. */
3127          src = bool_to_scalar_condition(ctx, src);
3128          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3129       } else if (dst.regClass() == v1) {
              /* Vector result: per-lane select between zero and the 1.0 pattern. */
3130          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3131                       Operand::c32(0x3f800000u), src);
3132       } else {
3133          unreachable("Wrong destination register class for nir_op_b2f32.");
3134       }
3135       break;
3136    }
3137    case nir_op_b2f64: {
           /* Boolean to f64. The source must be a lane mask (bld.lm). */
3138       Temp src = get_alu_src(ctx, instr->src[0]);
3139       assert(src.regClass() == bld.lm);
3140
3141       if (dst.regClass() == s2) {
              /* NOTE(review): the selected value is built with
               * Operand::c32(0x3f800000u), the f32 pattern of 1.0, while the v2 path
               * below uses 0x3FF00000 as the high dword of the double 1.0. Confirm
               * that this operand is interpreted as the 64-bit value 1.0 by the
               * 64-bit scalar select.
               */
3142          src = bool_to_scalar_condition(ctx, src);
3143          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3144                   Operand::zero(), bld.scc(src));
3145       } else if (dst.regClass() == v2) {
              /* Only the high dword depends on the condition; the low dword of the
               * result is always zero.
               */
3146          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3147          Temp upper =
3148             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3149          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3150       } else {
3151          unreachable("Wrong destination register class for nir_op_b2f64.");
3152       }
3153       break;
3154    }
3155    case nir_op_i2i8:
3156    case nir_op_i2i16:
3157    case nir_op_i2i32: {
           /* Signed integer resize to 8/16/32 bits. An SGPR destination with a
            * source narrower than 32 bits is handled by
            * extract_8_16_bit_sgpr_element(); everything else goes through
            * convert_int(), requesting sign-extension only when the result is wider
            * than the source.
            */
3158       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3159          /* no need to do the extract in get_alu_src() */
3160          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3161                                      ? sgpr_extract_sext
3162                                      : sgpr_extract_undef;
3163          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3164       } else {
3165          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3166          const unsigned output_bitsize = instr->def.bit_size;
3167          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3168                      output_bitsize > input_bitsize, dst);
3169       }
3170       break;
3171    }
3172    case nir_op_u2u8:
3173    case nir_op_u2u16:
3174    case nir_op_u2u32: {
           /* Unsigned integer resize to 8/16/32 bits. Same structure as the signed
            * i2i case: zero-extending SGPR extraction when widening a sub-32-bit
            * source into an SGPR, otherwise convert_int() without sign-extension.
            */
3175       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3176          /* no need to do the extract in get_alu_src() */
3177          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3178                                      ? sgpr_extract_zext
3179                                      : sgpr_extract_undef;
3180          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3181       } else {
3182          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3183                      instr->def.bit_size, false, dst);
3184       }
3185       break;
3186    }
3187    case nir_op_b2b32:
3188    case nir_op_b2i8:
3189    case nir_op_b2i16:
3190    case nir_op_b2i32: {
           /* Boolean (lane mask) to integer. An s1 destination is written by
            * bool_to_scalar_condition(); a VGPR destination gets a per-lane select
            * between 0 and 1. All four NIR ops share this code, including the
            * unreachable() message that names b2i32.
            */
3191       Temp src = get_alu_src(ctx, instr->src[0]);
3192       assert(src.regClass() == bld.lm);
3193
3194       if (dst.regClass() == s1) {
3195          bool_to_scalar_condition(ctx, src, dst);
3196       } else if (dst.type() == RegType::vgpr) {
3197          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3198                       src);
3199       } else {
3200          unreachable("Invalid register class for b2i32");
3201       }
3202       break;
3203    }
3204    case nir_op_b2b1: {
3205       Temp src = get_alu_src(ctx, instr->src[0]);
3206       assert(dst.regClass() == bld.lm);
3207
3208       if (src.type() == RegType::vgpr) {
3209          assert(src.regClass() == v1 || src.regClass() == v2);
3210          assert(dst.regClass() == bld.lm);
3211          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3212                   Definition(dst), Operand::zero(), src);
3213       } else {
3214          assert(src.regClass() == s1 || src.regClass() == s2);
3215          Temp tmp;
3216          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
3217             tmp =
3218                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3219                   .def(1)
3220                   .getTemp();
3221          } else {
3222             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3223                            bld.scc(bld.def(s1)), Operand::zero(), src);
3224          }
3225          bool_to_vector_condition(ctx, tmp, dst);
3226       }
3227       break;
3228    }
3229    case nir_op_unpack_64_2x32:
3230    case nir_op_unpack_32_2x16:
3231    case nir_op_unpack_64_4x16:
3232    case nir_op_unpack_32_4x8:
           /* Unpack into a vector: copy the source into dst, then call
            * emit_split_vector() with 4 components for the 4x8 / 4x16 variants and
            * 2 for the others.
            */
3233       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3234       emit_split_vector(
3235          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3236       break;
3237    case nir_op_pack_64_2x32_split: {
           /* Build dst as a two-element vector from the two sources, src0 first. */
3238       Temp src0 = get_alu_src(ctx, instr->src[0]);
3239       Temp src1 = get_alu_src(ctx, instr->src[1]);
3240
3241       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3242       break;
3243    }
3244    case nir_op_unpack_64_2x32_split_x:
           /* Split the source in two; dst is the first definition and the second
            * goes to a throw-away temporary of the same register class.
            */
3245       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3246                  get_alu_src(ctx, instr->src[0]));
3247       break;
3248    case nir_op_unpack_64_2x32_split_y:
           /* Same split, with dst as the second definition. */
3249       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3250                  get_alu_src(ctx, instr->src[0]));
3251       break;
3252    case nir_op_unpack_32_2x16_split_x:
           /* First 16-bit half of a 32-bit value. VGPR: split and keep the first
            * definition. Otherwise the whole source is simply copied to dst.
            */
3253       if (dst.type() == RegType::vgpr) {
3254          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3255                     get_alu_src(ctx, instr->src[0]));
3256       } else {
3257          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3258       }
3259       break;
3260    case nir_op_unpack_32_2x16_split_y:
           /* Second 16-bit half of a 32-bit value. VGPR: split and keep the second
            * definition. Otherwise a p_extract with operands (src, 1, 16, 0) is
            * emitted - presumably element index 1, 16 bits wide, no sign
            * extension; TODO confirm against the p_extract definition.
            */
3261       if (dst.type() == RegType::vgpr) {
3262          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3263                     get_alu_src(ctx, instr->src[0]));
3264       } else {
3265          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3266                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3267                     Operand::zero());
3268       }
3269       break;
3270    case nir_op_pack_32_2x16_split: {
           /* Pack two 16-bit values into one 32-bit value, src0 in the low half. */
3271       Temp src0 = get_alu_src(ctx, instr->src[0]);
3272       Temp src1 = get_alu_src(ctx, instr->src[1]);
3273       if (dst.regClass() == v1) {
              /* Vector path: take element 0 of each source as v2b and build a
               * vector from the two.
               */
3274          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3275          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3276          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3277       } else {
              /* Scalar path: (src0 & 0xFFFF) | (src1 << 16). */
3278          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3279                          Operand::c32(0xFFFFu));
3280          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3281                          Operand::c32(16u));
3282          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3283       }
3284       break;
3285    }
        /* pack_32_4x8: the source is fetched with a size argument of 4 and copied to dst. */
3286    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3287    case nir_op_pack_half_2x16_rtz_split:
3288    case nir_op_pack_half_2x16_split: {
           /* Both NIR ops emit v_cvt_pkrtz_f16_f32. Only a v1 destination is
            * supported. GFX8 and GFX9 use the _e64 opcode through the VOP3A helper;
            * other generations use the VOP2 helper.
            */
3289       if (dst.regClass() == v1) {
3290          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3291             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3292          else
3293             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3294       } else {
3295          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3296       }
3297       break;
3298    }
3299    case nir_op_pack_unorm_2x16:
3300    case nir_op_pack_snorm_2x16: {
           /* Pack two floats as normalized 16-bit integers. The opcode is chosen
            * from the NIR op (unorm -> u16, snorm -> i16) and the effective source
            * bit size.
            */
3301       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3302       /* Only support 16 and 32bit. */
3303       assert(bit_size == 32 || bit_size == 16);
3304
           /* The source is fetched as a 2-component value and its two elements are
            * extracted with the register class matching the bit size.
            */
3305       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3306       Temp src = get_alu_src(ctx, instr->src[0], 2);
3307       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3308       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3309
3310       /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */
3311       if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3312          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3313          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
              /* From here on the operands are f32, so use the f32 opcodes. */
3314          bit_size = 32;
3315       }
3316
3317       aco_opcode opcode;
3318       if (bit_size == 32) {
3319          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3320                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3321       } else {
3322          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3323                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3324       }
3325       bld.vop3(opcode, Definition(dst), src0, src1);
3326       break;
3327    }
3328    case nir_op_pack_uint_2x16:
3329    case nir_op_pack_sint_2x16: {
           /* Pack two 32-bit integers into 16-bit halves. The two v1 elements of
            * the source are fed to v_cvt_pk_u16_u32 for the uint op and
            * v_cvt_pk_i16_i32 for the sint op.
            */
3330       Temp src = get_alu_src(ctx, instr->src[0], 2);
3331       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3332       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3333       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3334                                                              : aco_opcode::v_cvt_pk_i16_i32;
3335       bld.vop3(opcode, Definition(dst), src0, src1);
3336       break;
3337    }
3338    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3339    case nir_op_unpack_half_2x16_split_x: {
           /* Convert the first f16 half of the source to f32. Only a v1
            * destination is supported.
            */
3340       Temp src = get_alu_src(ctx, instr->src[0]);
           /* A v1 source is split into two v2b halves; src becomes the first one.
            * Other source classes are passed to the conversion unchanged.
            */
3341       if (src.regClass() == v1)
3342          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3343       if (dst.regClass() == v1) {
              /* The block's 16/64-bit denorm flushing mode must agree with the
               * variant of the NIR op being selected.
               */
3344          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3345                 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3346          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3347       } else {
3348          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3349       }
3350       break;
3351    }
3352    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3353    case nir_op_unpack_half_2x16_split_y: {
           /* Convert the second f16 half of the source to f32. Only a v1
            * destination is supported.
            */
3354       Temp src = get_alu_src(ctx, instr->src[0]);
           /* s1 source: p_extract with operands (src, 1, 16, 0). Any other source:
            * split into two v2b halves and take the second definition.
            */
3355       if (src.regClass() == s1)
3356          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3357                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3358       else
3359          src =
3360             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3361       if (dst.regClass() == v1) {
              /* The block's 16/64-bit denorm flushing mode must agree with the
               * variant of the NIR op being selected.
               */
3362          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3363                 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3364          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3365       } else {
3366          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3367       }
3368       break;
3369    }
3370    case nir_op_sad_u8x4: {
           /* Emits v_sad_u8 into a v1 destination through the VOP3A helper. The
            * trailing arguments (false, 3u, false) are interpreted by
            * emit_vop3a_instruction(), which is not visible from here.
            */
3371       assert(dst.regClass() == v1);
3372       emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3373       break;
3374    }
3375    case nir_op_fquantize2f16: {
           /* Quantize an f32 through f16 precision: convert to f16, convert back to
            * f32, and replace the result with zero where the f16 value is a
            * denormal.
            */
3376       Temp src = get_alu_src(ctx, instr->src[0]);
3377       Temp f16;
           /* Use the rtne pseudo opcode unless the block's round16_64 mode is
            * already fp_round_ne.
            */
3378       if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3379          f16 = bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, bld.def(v2b), src);
3380       else
3381          f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3382       Temp f32, cmp_res;
3383
3384       if (ctx->program->gfx_level >= GFX8) {
              /* GFX8+: classify the f16 value directly with v_cmp_class_f16. */
3385          Temp mask = bld.copy(
3386             bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3387          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask);
3388          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3389       } else {
3390          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
3391           * so compare the result and flush to 0 if it's smaller.
3392           */
3393          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3394          Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3395          Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3396          tmp0->valu().abs[0] = true;
3397          Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32);
              /* cmp_res = !(|f32| < smallest && f32 != 0). */
3398          cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3399                             tmp0->definitions[0].getTemp(), tmp1);
3400       }
3401
           /* Final select between a zero value and f32 on cmp_res. When signed
            * zero must be preserved, the zero value is computed as 0 * src so that
            * it carries the sign of the input (the local is named copysign_0).
            */
3402       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3403          Temp copysign_0 =
3404             bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3405          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3406       } else {
3407          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3408       }
3409       break;
3410    }
3411    case nir_op_bfm: {
3412       Temp bits = get_alu_src(ctx, instr->src[0]);
3413       Temp offset = get_alu_src(ctx, instr->src[1]);
3414
3415       if (dst.regClass() == s1) {
3416          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3417       } else if (dst.regClass() == v1) {
3418          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3419       } else {
3420          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3421       }
3422       break;
3423    }
3424    case nir_op_bitfield_select: {
3425
3426       /* dst = (insert & bitmask) | (base & ~bitmask) */
3427       if (dst.regClass() == s1) {
3428          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3429          Temp insert = get_alu_src(ctx, instr->src[1]);
3430          Temp base = get_alu_src(ctx, instr->src[2]);
3431          aco_ptr<Instruction> sop2;
3432          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3433          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3434          Operand lhs;
3435          if (const_insert && const_bitmask) {
3436             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3437          } else {
3438             insert =
3439                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3440             lhs = Operand(insert);
3441          }
3442
3443          Operand rhs;
3444          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3445          if (const_base && const_bitmask) {
3446             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3447          } else {
3448             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3449             rhs = Operand(base);
3450          }
3451
3452          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3453
3454       } else if (dst.regClass() == v1) {
3455          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3456       } else {
3457          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3458       }
3459       break;
3460    }
3461    case nir_op_ubfe:
3462    case nir_op_ibfe: {
3463       if (dst.bytes() != 4)
3464          unreachable("Unsupported BFE bit size");
3465
3466       if (dst.type() == RegType::sgpr) {
3467          Temp base = get_alu_src(ctx, instr->src[0]);
3468
3469          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3470          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3471          aco_opcode opcode =
3472             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3473          if (const_offset && const_bits) {
3474             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3475             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3476             break;
3477          }
3478
3479          Temp offset = get_alu_src(ctx, instr->src[1]);
3480          Temp bits = get_alu_src(ctx, instr->src[2]);
3481
3482          if (ctx->program->gfx_level >= GFX9) {
3483             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3484                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3485                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3486             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3487             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3488          } else if (instr->op == nir_op_ubfe) {
3489             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3490             Temp masked =
3491                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3492             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3493          } else {
3494             Operand bits_op = const_bits
3495                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3496                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3497                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3498                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3499                                             Operand::c32(16u));
3500             Operand offset_op = const_offset
3501                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3502                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3503                                               offset, Operand::c32(0x1fu));
3504
3505             Temp extract =
3506                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3507             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3508          }
3509
3510       } else {
3511          aco_opcode opcode =
3512             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3513          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3514       }
3515       break;
3516    }
3517    case nir_op_extract_u8:
3518    case nir_op_extract_i8:
3519    case nir_op_extract_u16:
3520    case nir_op_extract_i16: {
3521       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3522       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3523       uint32_t bits = comp == 4 ? 8 : 16;
3524       unsigned index = nir_src_as_uint(instr->src[1].src);
3525       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3526          assert(index == 0);
3527          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3528       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3529          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3530          unsigned swizzle = instr->src[0].swizzle[0];
3531          if (vec.size() > 1) {
3532             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3533             swizzle = swizzle & 1;
3534          }
3535          index += swizzle * instr->def.bit_size / bits;
3536          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3537                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3538       } else {
3539          Temp src = get_alu_src(ctx, instr->src[0]);
3540          Definition def(dst);
3541          if (dst.bytes() == 8) {
3542             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3543             index %= comp;
3544             def = bld.def(src.type(), 1);
3545          }
3546          assert(def.bytes() <= 4);
3547          if (def.regClass() == s1) {
3548             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3549                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3550          } else {
3551             src = emit_extract_vector(ctx, src, 0, def.regClass());
3552             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3553                        Operand::c32(bits), Operand::c32(is_signed));
3554          }
3555          if (dst.size() == 2)
3556             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3557                        Operand::zero());
3558       }
3559       break;
3560    }
3561    case nir_op_insert_u8:
3562    case nir_op_insert_u16: {
3563       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3564       uint32_t bits = comp == 4 ? 8 : 16;
3565       unsigned index = nir_src_as_uint(instr->src[1].src);
3566       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3567          assert(index == 0);
3568          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3569       } else {
3570          Temp src = get_alu_src(ctx, instr->src[0]);
3571          Definition def(dst);
3572          bool swap = false;
3573          if (dst.bytes() == 8) {
3574             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3575             swap = index >= comp;
3576             index %= comp;
3577             def = bld.def(src.type(), 1);
3578          }
3579          if (def.regClass() == s1) {
3580             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3581                        Operand::c32(index), Operand::c32(bits));
3582          } else {
3583             src = emit_extract_vector(ctx, src, 0, def.regClass());
3584             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3585                        Operand::c32(bits));
3586          }
3587          if (dst.size() == 2 && swap)
3588             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3589                        def.getTemp());
3590          else if (dst.size() == 2)
3591             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3592                        Operand::zero());
3593       }
3594       break;
3595    }
3596    case nir_op_bit_count: {
3597       Temp src = get_alu_src(ctx, instr->src[0]);
3598       if (src.regClass() == s1) {
3599          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3600       } else if (src.regClass() == v1) {
3601          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3602       } else if (src.regClass() == v2) {
3603          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3604                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3605                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3606       } else if (src.regClass() == s2) {
3607          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3608       } else {
3609          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3610       }
3611       break;
3612    }
3613    case nir_op_flt: {
3614       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3615                       aco_opcode::v_cmp_lt_f64);
3616       break;
3617    }
3618    case nir_op_fge: {
3619       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3620                       aco_opcode::v_cmp_ge_f64);
3621       break;
3622    }
3623    case nir_op_feq: {
3624       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3625                       aco_opcode::v_cmp_eq_f64);
3626       break;
3627    }
3628    case nir_op_fneu: {
3629       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3630                       aco_opcode::v_cmp_neq_f64);
3631       break;
3632    }
3633    case nir_op_ilt: {
3634       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3635                       aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3636       break;
3637    }
3638    case nir_op_ige: {
3639       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3640                       aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3641       break;
3642    }
3643    case nir_op_ieq: {
3644       if (instr->src[0].src.ssa->bit_size == 1)
3645          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3646       else
3647          emit_comparison(
3648             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3649             aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3650             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3651       break;
3652    }
3653    case nir_op_ine: {
3654       if (instr->src[0].src.ssa->bit_size == 1)
3655          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3656       else
3657          emit_comparison(
3658             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3659             aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3660             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3661       break;
3662    }
3663    case nir_op_ult: {
3664       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3665                       aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3666       break;
3667    }
3668    case nir_op_uge: {
3669       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3670                       aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3671       break;
3672    }
3673    case nir_op_bitz:
3674    case nir_op_bitnz: {
3675       assert(instr->src[0].src.ssa->bit_size != 1);
3676       bool test0 = instr->op == nir_op_bitz;
3677       Temp src0 = get_alu_src(ctx, instr->src[0]);
3678       Temp src1 = get_alu_src(ctx, instr->src[1]);
3679       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3680       if (!use_valu) {
3681          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3682                                                                : aco_opcode::s_bitcmp1_b32;
3683          if (test0)
3684             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3685                                                        : aco_opcode::s_bitcmp0_b32;
3686          emit_sopc_instruction(ctx, instr, op, dst);
3687          break;
3688       }
3689
3690       /* We do not have a VALU version of s_bitcmp.
3691        * But if the second source is constant, we can use
3692        * v_cmp_class_f32's LUT to check the bit.
3693        * The LUT only has 10 entries, so extract a higher byte if we have to.
3694        * For sign bits, comparison with 0 is better because v_cmp_class
3695        * can't be inverted.
3696        */
3697       if (nir_src_is_const(instr->src[1].src)) {
3698          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3699          bit &= instr->src[0].src.ssa->bit_size - 1;
3700          src0 = as_vgpr(ctx, src0);
3701
3702          if (src0.regClass() == v2) {
3703             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3704             bit &= 31;
3705          }
3706
3707          if (bit == 31) {
3708             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3709                      Operand::c32(0), src0);
3710             break;
3711          }
3712
3713          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3714             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3715                      Operand::c32(0), src0);
3716             break;
3717          }
3718
3719          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
3720          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
3721          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
3722          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
3723          if (use_opsel) {
3724             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
3725                               Operand::c32(16), Operand::c32(0));
3726             bit &= 0xf;
3727          }
3728
3729          /* If we can use sdwa the extract is free, while test0's s_not is not. */
3730          if (bit == 7 && test0 && can_sdwa) {
3731             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3732                               Operand::c32(8), Operand::c32(1));
3733             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3734                      Operand::c32(0), src0);
3735             break;
3736          }
3737
3738          if (bit > max_bit) {
3739             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3740                               Operand::c32(8), Operand::c32(0));
3741             bit &= 0x7;
3742          }
3743
3744          /* denorm and snan/qnan inputs are preserved using all float control modes. */
3745          static const struct {
3746             uint32_t fp32;
3747             uint32_t fp16;
3748             bool negate;
3749          } float_lut[10] = {
3750             {0x7f800001, 0x7c01, false}, /* snan */
3751             {~0u, ~0u, false},           /* qnan */
3752             {0xff800000, 0xfc00, false}, /* -inf */
3753             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
3754             {1, 1, true},                /* -denormal */
3755             {0, 0, true},                /* -0.0 */
3756             {0, 0, false},               /* +0.0 */
3757             {1, 1, false},               /* +denormal */
3758             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
3759             {0x7f800000, 0x7c00, false}, /* +inf */
3760          };
3761
3762          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
3763          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
3764          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
3765                                (ctx->program->gfx_level >= GFX11 && use_opsel);
3766          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
3767          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
3768
3769          VALU_instruction& res =
3770             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
3771          if (float_lut[bit].negate) {
3772             res.format = asVOP3(res.format);
3773             res.neg[0] = true;
3774          }
3775
3776          if (test0)
3777             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
3778
3779          break;
3780       }
3781
3782       Temp res;
3783       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
3784       if (instr->src[0].src.ssa->bit_size == 16) {
3785          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
3786          if (ctx->program->gfx_level < GFX10)
3787             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
3788          else
3789             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
3790
3791          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
3792       } else if (instr->src[0].src.ssa->bit_size == 32) {
3793          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
3794       } else if (instr->src[0].src.ssa->bit_size == 64) {
3795          if (ctx->program->gfx_level < GFX8)
3796             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
3797          else
3798             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
3799
3800          res = emit_extract_vector(ctx, res, 0, v1);
3801          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
3802       } else {
3803          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3804       }
3805       bld.vopc(op, Definition(dst), Operand::c32(0), res);
3806       break;
3807    }
3808    case nir_op_fddx:
3809    case nir_op_fddy:
3810    case nir_op_fddx_fine:
3811    case nir_op_fddy_fine:
3812    case nir_op_fddx_coarse:
3813    case nir_op_fddy_coarse: {
3814       if (!nir_src_is_divergent(instr->src[0].src)) {
3815          /* Source is the same in all lanes, so the derivative is zero.
3816           * This also avoids emitting invalid IR.
3817           */
3818          bld.copy(Definition(dst), Operand::zero());
3819          break;
3820       }
3821
3822       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3823       uint16_t dpp_ctrl1, dpp_ctrl2;
3824       if (instr->op == nir_op_fddx_fine) {
3825          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3826          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3827       } else if (instr->op == nir_op_fddy_fine) {
3828          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3829          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3830       } else {
3831          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3832          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3833             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3834          else
3835             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3836       }
3837
3838       Temp tmp;
3839       if (ctx->program->gfx_level >= GFX8) {
3840          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3841          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3842       } else {
3843          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3844          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3845          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3846       }
3847       emit_wqm(bld, tmp, dst, true);
3848       break;
3849    }
3850    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3851    }
3852 }
3853
3854 void
3855 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3856 {
3857    Temp dst = get_ssa_temp(ctx, &instr->def);
3858
3859    // TODO: we really want to have the resulting type as this would allow for 64bit literals
3860    // which get truncated the lsb if double and msb if int
3861    // for now, we only use s_mov_b64 with 64bit inline constants
3862    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3863    assert(dst.type() == RegType::sgpr);
3864
3865    Builder bld(ctx->program, ctx->block);
3866
3867    if (instr->def.bit_size == 1) {
3868       assert(dst.regClass() == bld.lm);
3869       int val = instr->value[0].b ? -1 : 0;
3870       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3871       bld.copy(Definition(dst), op);
3872    } else if (instr->def.bit_size == 8) {
3873       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3874    } else if (instr->def.bit_size == 16) {
3875       /* sign-extend to use s_movk_i32 instead of a literal */
3876       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3877    } else if (dst.size() == 1) {
3878       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3879    } else {
3880       assert(dst.size() != 1);
3881       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3882          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3883       if (instr->def.bit_size == 64)
3884          for (unsigned i = 0; i < dst.size(); i++)
3885             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3886       else {
3887          for (unsigned i = 0; i < dst.size(); i++)
3888             vec->operands[i] = Operand::c32(instr->value[i].u32);
3889       }
3890       vec->definitions[0] = Definition(dst);
3891       ctx->block->instructions.emplace_back(std::move(vec));
3892    }
3893 }
3894
3895 bool
3896 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
3897                                    unsigned align_, bool support_12_byte)
3898 {
3899    /* Only use byte-align for 8/16-bit loads if we won't have to increase it's size and won't have
3900     * to use unsupported load sizes.
3901     */
3902    assert(util_is_power_of_two_nonzero(align_));
3903    if (align_ < 4) {
3904       assert(component_size < 4);
3905       unsigned load_size = num_components * component_size;
3906       uint32_t new_size = align(load_size + (4 - align_), 4);
3907       return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
3908    }
3909    return true;
3910 }
3911
3912 struct LoadEmitInfo { /* describes one load request handed to emit_load() */
3913    Operand offset; /* base offset: may be undefined, a constant, or a 1- or 2-dword temporary */
3914    Temp dst;       /* destination; also offered to the load callback as a hint */
3915    unsigned num_components;
3916    unsigned component_size; /* bytes per component; total load is num_components * this */
3917    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
3918    Temp idx = Temp(0, v1);      /* buffer index */
3919    unsigned component_stride = 0; /* non-zero: one load covers at most one component (if split) */
3920    unsigned const_offset = 0;     /* constant offset applied on top of `offset` */
3921    unsigned align_mul = 0;        /* 0: emit_load() falls back to component_size */
3922    unsigned align_offset = 0;     /* emit_load() uses this modulo align_mul */
3923    pipe_format format;
3924
3925    bool glc = false;
3926    bool slc = false;
3927    bool split_by_component_stride = true; /* enables the two per-load size caps in emit_load() */
3928    unsigned swizzle_component_size = 0;   /* non-zero: max bytes per load (if split) */
3929    memory_sync_info sync;
3930    Temp soffset = Temp(0, s1);
3931 };
3932
3933 struct EmitLoadParameters { /* callback and limits that steer emit_load() */
3934    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3935                              unsigned bytes_needed, unsigned align, unsigned const_offset,
3936                              Temp dst_hint); /* returning info.dst means dst was written directly */
3937
3938    Callback callback;              /* invoked by emit_load() for each load it emits */
3939    bool byte_align_loads;          /* emit_load() aligns the offset down and shifts the result */
3940    bool supports_8bit_16bit_loads; /* if false, misaligned small loads are widened to dwords */
3941    unsigned max_const_offset_plus_one; /* const offsets >= this are moved into the offset */
3942 };
3943
3944 void
3945 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3946           const EmitLoadParameters& params)
3947 {
3948    unsigned load_size = info.num_components * info.component_size;
3949    unsigned component_size = info.component_size;
3950
3951    unsigned num_vals = 0;
3952    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3953
3954    unsigned const_offset = info.const_offset;
3955
3956    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3957    unsigned align_offset = info.align_offset % align_mul;
3958
3959    unsigned bytes_read = 0;
3960    while (bytes_read < load_size) {
3961       unsigned bytes_needed = load_size - bytes_read;
3962
3963       /* add buffer for unaligned loads */
3964       int byte_align = 0;
3965       if (params.byte_align_loads) {
3966          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3967       }
3968
3969       if (byte_align) {
3970          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3971              !params.supports_8bit_16bit_loads) {
3972             if (info.component_stride) {
3973                assert(params.supports_8bit_16bit_loads && "unimplemented");
3974                bytes_needed = 2;
3975                byte_align = 0;
3976             } else {
3977                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3978                bytes_needed = align(bytes_needed, 4);
3979             }
3980          } else {
3981             byte_align = 0;
3982          }
3983       }
3984
3985       if (info.split_by_component_stride) {
3986          if (info.swizzle_component_size)
3987             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3988          if (info.component_stride)
3989             bytes_needed = MIN2(bytes_needed, info.component_size);
3990       }
3991
3992       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3993
3994       /* reduce constant offset */
3995       Operand offset = info.offset;
3996       unsigned reduced_const_offset = const_offset;
3997       bool remove_const_offset_completely = need_to_align_offset;
3998       if (const_offset &&
3999           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4000          unsigned to_add = const_offset;
4001          if (remove_const_offset_completely) {
4002             reduced_const_offset = 0;
4003          } else {
4004             to_add =
4005                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4006             reduced_const_offset %= params.max_const_offset_plus_one;
4007          }
4008          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4009          if (offset.isConstant()) {
4010             offset = Operand::c32(offset.constantValue() + to_add);
4011          } else if (offset.isUndefined()) {
4012             offset = Operand::c32(to_add);
4013          } else if (offset_tmp.regClass() == s1) {
4014             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4015                               Operand::c32(to_add));
4016          } else if (offset_tmp.regClass() == v1) {
4017             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4018          } else {
4019             Temp lo = bld.tmp(offset_tmp.type(), 1);
4020             Temp hi = bld.tmp(offset_tmp.type(), 1);
4021             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4022
4023             if (offset_tmp.regClass() == s2) {
4024                Temp carry = bld.tmp(s1);
4025                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4026                              Operand::c32(to_add));
4027                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4028                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4029             } else {
4030                Temp new_lo = bld.tmp(v1);
4031                Temp carry =
4032                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4033                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4034                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4035             }
4036          }
4037       }
4038
4039       /* align offset down if needed */
4040       Operand aligned_offset = offset;
4041       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4042       if (need_to_align_offset) {
4043          align = 4;
4044          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4045          if (offset.isConstant()) {
4046             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4047          } else if (offset.isUndefined()) {
4048             aligned_offset = Operand::zero();
4049          } else if (offset_tmp.regClass() == s1) {
4050             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4051                                       Operand::c32(0xfffffffcu), offset_tmp);
4052          } else if (offset_tmp.regClass() == s2) {
4053             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4054                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4055          } else if (offset_tmp.regClass() == v1) {
4056             aligned_offset =
4057                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4058          } else if (offset_tmp.regClass() == v2) {
4059             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4060             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4061             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4062             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4063          }
4064       }
4065       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4066                                 : aligned_offset.isConstant()
4067                                    ? bld.copy(bld.def(s1), aligned_offset)
4068                                    : Temp(0, s1);
4069
4070       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4071                                  reduced_const_offset, byte_align ? Temp() : info.dst);
4072
4073       /* the callback wrote directly to dst */
4074       if (val == info.dst) {
4075          assert(num_vals == 0);
4076          emit_split_vector(ctx, info.dst, info.num_components);
4077          return;
4078       }
4079
4080       /* shift result right if needed */
4081       if (params.byte_align_loads && info.component_size < 4) {
4082          Operand byte_align_off = Operand::c32(byte_align);
4083          if (byte_align == -1) {
4084             if (offset.isConstant())
4085                byte_align_off = Operand::c32(offset.constantValue() % 4u);
4086             else if (offset.isUndefined())
4087                byte_align_off = Operand::zero();
4088             else if (offset.size() == 2)
4089                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4090                                                             RegClass(offset.getTemp().type(), 1)));
4091             else
4092                byte_align_off = offset;
4093          }
4094
4095          assert(val.bytes() >= load_size && "unimplemented");
4096          if (val.type() == RegType::sgpr)
4097             byte_align_scalar(ctx, val, byte_align_off, info.dst);
4098          else
4099             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4100          return;
4101       }
4102
4103       /* add result to list and advance */
4104       if (info.component_stride) {
4105          assert(val.bytes() % info.component_size == 0);
4106          unsigned num_loaded_components = val.bytes() / info.component_size;
4107          unsigned advance_bytes = info.component_stride * num_loaded_components;
4108          const_offset += advance_bytes;
4109          align_offset = (align_offset + advance_bytes) % align_mul;
4110       } else {
4111          const_offset += val.bytes();
4112          align_offset = (align_offset + val.bytes()) % align_mul;
4113       }
4114       bytes_read += val.bytes();
4115       vals[num_vals++] = val;
4116    }
4117
4118    /* create array of components */
4119    unsigned components_split = 0;
4120    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4121    bool has_vgprs = false;
4122    for (unsigned i = 0; i < num_vals;) {
4123       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4124       unsigned num_tmps = 0;
4125       unsigned tmp_size = 0;
4126       RegType reg_type = RegType::sgpr;
4127       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4128          if (vals[i].type() == RegType::vgpr)
4129             reg_type = RegType::vgpr;
4130          tmp_size += vals[i].bytes();
4131          tmp[num_tmps++] = vals[i++];
4132       }
4133       if (num_tmps > 1) {
4134          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4135             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4136          for (unsigned j = 0; j < num_tmps; j++)
4137             vec->operands[j] = Operand(tmp[j]);
4138          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4139          vec->definitions[0] = Definition(tmp[0]);
4140          bld.insert(std::move(vec));
4141       }
4142
4143       if (tmp[0].bytes() % component_size) {
4144          /* trim tmp[0] */
4145          assert(i == num_vals);
4146          RegClass new_rc =
4147             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4148          tmp[0] =
4149             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4150       }
4151
4152       RegClass elem_rc = RegClass::get(reg_type, component_size);
4153
4154       unsigned start = components_split;
4155
4156       if (tmp_size == elem_rc.bytes()) {
4157          allocated_vec[components_split++] = tmp[0];
4158       } else {
4159          assert(tmp_size % elem_rc.bytes() == 0);
4160          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
4161             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
4162          for (auto& def : split->definitions) {
4163             Temp component = bld.tmp(elem_rc);
4164             allocated_vec[components_split++] = component;
4165             def = Definition(component);
4166          }
4167          split->operands[0] = Operand(tmp[0]);
4168          bld.insert(std::move(split));
4169       }
4170
4171       /* try to p_as_uniform early so we can create more optimizable code and
4172        * also update allocated_vec */
4173       for (unsigned j = start; j < components_split; j++) {
4174          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
4175             allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4176          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4177       }
4178    }
4179
4180    /* concatenate components and p_as_uniform() result if needed */
4181    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4182       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4183
4184    int padding_bytes =
4185       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4186
4187    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4188       aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
4189    for (unsigned i = 0; i < info.num_components; i++)
4190       vec->operands[i] = Operand(allocated_vec[i]);
4191    if (padding_bytes)
4192       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4193    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4194       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4195       vec->definitions[0] = Definition(tmp);
4196       bld.insert(std::move(vec));
4197       bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4198    } else {
4199       vec->definitions[0] = Definition(info.dst);
4200       bld.insert(std::move(vec));
4201    }
4202 }
4203
4204 Operand
4205 load_lds_size_m0(Builder& bld)
4206 {
4207    /* m0 does not need to be initialized on GFX9+ */
4208    if (bld.program->gfx_level >= GFX9)
4209       return Operand(s1);
4210
4211    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4212 }
4213
4214 Temp
4215 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4216                   unsigned align, unsigned const_offset, Temp dst_hint)
4217 {
4218    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4219
4220    Operand m = load_lds_size_m0(bld);
4221
4222    bool large_ds_read = bld.program->gfx_level >= GFX7;
4223    bool usable_read2 = bld.program->gfx_level >= GFX7;
4224
4225    bool read2 = false;
4226    unsigned size = 0;
4227    aco_opcode op;
4228    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4229       size = 16;
4230       op = aco_opcode::ds_read_b128;
4231    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4232       size = 16;
4233       read2 = true;
4234       op = aco_opcode::ds_read2_b64;
4235    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4236       size = 12;
4237       op = aco_opcode::ds_read_b96;
4238    } else if (bytes_needed >= 8 && align % 8 == 0) {
4239       size = 8;
4240       op = aco_opcode::ds_read_b64;
4241    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4242       size = 8;
4243       read2 = true;
4244       op = aco_opcode::ds_read2_b32;
4245    } else if (bytes_needed >= 4 && align % 4 == 0) {
4246       size = 4;
4247       op = aco_opcode::ds_read_b32;
4248    } else if (bytes_needed >= 2 && align % 2 == 0) {
4249       size = 2;
4250       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4251    } else {
4252       size = 1;
4253       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4254    }
4255
4256    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4257    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4258
4259    if (const_offset > (const_offset_range - const_offset_unit)) {
4260       unsigned excess = const_offset - (const_offset % const_offset_range);
4261       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4262       const_offset -= excess;
4263    }
4264
4265    const_offset /= const_offset_unit;
4266
4267    RegClass rc = RegClass::get(RegType::vgpr, size);
4268    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4269    Instruction* instr;
4270    if (read2)
4271       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4272    else
4273       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4274    instr->ds().sync = info.sync;
4275
4276    if (m.isUndefined())
4277       instr->operands.pop_back();
4278
4279    return val;
4280 }
4281
/* Load parameters that route emit_load() through lds_load_callback (used by load_lds()).
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4283
4284 Temp
4285 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4286                    unsigned align, unsigned const_offset, Temp dst_hint)
4287 {
4288    assert(align >= 4u);
4289
4290    bld.program->has_smem_buffer_or_global_loads = true;
4291
4292    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4293    Temp addr = info.resource;
4294    if (!buffer && !addr.id()) {
4295       addr = offset;
4296       offset = Temp();
4297    }
4298
4299    bytes_needed = MIN2(bytes_needed, 64);
4300    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4301    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4302    /* Only round-up global loads if it's aligned so that it won't cross pages */
4303    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4304
4305    aco_opcode op;
4306    if (bytes_needed <= 4) {
4307       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4308    } else if (bytes_needed <= 8) {
4309       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4310    } else if (bytes_needed <= 16) {
4311       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4312    } else if (bytes_needed <= 32) {
4313       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4314    } else {
4315       assert(bytes_needed == 64);
4316       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4317    }
4318
4319    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4320    if (buffer) {
4321       if (const_offset)
4322          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4323                            Operand::c32(const_offset));
4324       load->operands[0] = Operand(info.resource);
4325       load->operands[1] = Operand(offset);
4326    } else {
4327       load->operands[0] = Operand(addr);
4328       if (offset.id() && const_offset)
4329          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4330                                       Operand::c32(const_offset));
4331       else if (offset.id())
4332          load->operands[1] = Operand(offset);
4333       else
4334          load->operands[1] = Operand::c32(const_offset);
4335    }
4336    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4337    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4338    load->definitions[0] = Definition(val);
4339    load->glc = info.glc;
4340    load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4341    load->sync = info.sync;
4342    bld.insert(std::move(load));
4343    return val;
4344 }
4345
/* Load parameters built around smem_load_callback.
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4347
4348 Temp
4349 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4350                     unsigned align_, unsigned const_offset, Temp dst_hint)
4351 {
4352    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4353    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4354
4355    if (info.soffset.id()) {
4356       if (soffset.isTemp())
4357          vaddr = bld.copy(bld.def(v1), soffset);
4358       soffset = Operand(info.soffset);
4359    }
4360
4361    if (soffset.isUndefined())
4362       soffset = Operand::zero();
4363
4364    bool offen = !vaddr.isUndefined();
4365    bool idxen = info.idx.id();
4366
4367    if (offen && idxen)
4368       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4369    else if (idxen)
4370       vaddr = Operand(info.idx);
4371
4372    unsigned bytes_size = 0;
4373    aco_opcode op;
4374    if (bytes_needed == 1 || align_ % 2) {
4375       bytes_size = 1;
4376       op = aco_opcode::buffer_load_ubyte;
4377    } else if (bytes_needed == 2 || align_ % 4) {
4378       bytes_size = 2;
4379       op = aco_opcode::buffer_load_ushort;
4380    } else if (bytes_needed <= 4) {
4381       bytes_size = 4;
4382       op = aco_opcode::buffer_load_dword;
4383    } else if (bytes_needed <= 8) {
4384       bytes_size = 8;
4385       op = aco_opcode::buffer_load_dwordx2;
4386    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4387       bytes_size = 12;
4388       op = aco_opcode::buffer_load_dwordx3;
4389    } else {
4390       bytes_size = 16;
4391       op = aco_opcode::buffer_load_dwordx4;
4392    }
4393    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4394    mubuf->operands[0] = Operand(info.resource);
4395    mubuf->operands[1] = vaddr;
4396    mubuf->operands[2] = soffset;
4397    mubuf->offen = offen;
4398    mubuf->idxen = idxen;
4399    mubuf->glc = info.glc;
4400    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4401    mubuf->slc = info.slc;
4402    mubuf->sync = info.sync;
4403    mubuf->offset = const_offset;
4404    mubuf->swizzled = info.swizzle_component_size != 0;
4405    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4406    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4407    mubuf->definitions[0] = Definition(val);
4408    bld.insert(std::move(mubuf));
4409
4410    return val;
4411 }
4412
/* Load parameters built around mubuf_load_callback.
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4414
4415 Temp
4416 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4417                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4418                            Temp dst_hint)
4419 {
4420    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4421    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4422
4423    if (info.soffset.id()) {
4424       if (soffset.isTemp())
4425          vaddr = bld.copy(bld.def(v1), soffset);
4426       soffset = Operand(info.soffset);
4427    }
4428
4429    if (soffset.isUndefined())
4430       soffset = Operand::zero();
4431
4432    bool offen = !vaddr.isUndefined();
4433    bool idxen = info.idx.id();
4434
4435    if (offen && idxen)
4436       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4437    else if (idxen)
4438       vaddr = Operand(info.idx);
4439
4440    aco_opcode op = aco_opcode::num_opcodes;
4441    if (info.component_size == 2) {
4442       switch (bytes_needed) {
4443       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4444       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4445       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4446       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4447       default: unreachable("invalid buffer load format size"); break;
4448       }
4449    } else {
4450       assert(info.component_size == 4);
4451       switch (bytes_needed) {
4452       case 4: op = aco_opcode::buffer_load_format_x; break;
4453       case 8: op = aco_opcode::buffer_load_format_xy; break;
4454       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4455       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4456       default: unreachable("invalid buffer load format size"); break;
4457       }
4458    }
4459
4460    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4461    mubuf->operands[0] = Operand(info.resource);
4462    mubuf->operands[1] = vaddr;
4463    mubuf->operands[2] = soffset;
4464    mubuf->offen = offen;
4465    mubuf->idxen = idxen;
4466    mubuf->glc = info.glc;
4467    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4468    mubuf->slc = info.slc;
4469    mubuf->sync = info.sync;
4470    mubuf->offset = const_offset;
4471    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4472    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4473    mubuf->definitions[0] = Definition(val);
4474    bld.insert(std::move(mubuf));
4475
4476    return val;
4477 }
4478
/* Load parameters built around mubuf_load_format_callback.
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4480
4481 Temp
4482 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4483                       unsigned align_, unsigned const_offset, Temp dst_hint)
4484 {
4485    unsigned bytes_size = 0;
4486    aco_opcode op;
4487    if (bytes_needed == 1 || align_ % 2u) {
4488       bytes_size = 1;
4489       op = aco_opcode::scratch_load_ubyte;
4490    } else if (bytes_needed == 2 || align_ % 4u) {
4491       bytes_size = 2;
4492       op = aco_opcode::scratch_load_ushort;
4493    } else if (bytes_needed <= 4) {
4494       bytes_size = 4;
4495       op = aco_opcode::scratch_load_dword;
4496    } else if (bytes_needed <= 8) {
4497       bytes_size = 8;
4498       op = aco_opcode::scratch_load_dwordx2;
4499    } else if (bytes_needed <= 12) {
4500       bytes_size = 12;
4501       op = aco_opcode::scratch_load_dwordx3;
4502    } else {
4503       bytes_size = 16;
4504       op = aco_opcode::scratch_load_dwordx4;
4505    }
4506    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4507    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4508    aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
4509    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4510    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4511    flat->sync = info.sync;
4512    flat->offset = const_offset;
4513    flat->definitions[0] = Definition(val);
4514    bld.insert(std::move(flat));
4515
4516    return val;
4517 }
4518
/* Two parameter sets for scratch loads: one emitting MUBUF instructions through
 * mubuf_load_callback, one emitting SCRATCH-format instructions through
 * scratch_load_callback.
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4521
4522 Temp
4523 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4524 {
4525    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4526                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4527
4528    if (addr.type() == RegType::vgpr)
4529       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4530                         Operand::c32(-1u), Operand::c32(rsrc_conf));
4531    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4532                      Operand::c32(rsrc_conf));
4533 }
4534
4535 Temp
4536 add64_32(Builder& bld, Temp src0, Temp src1)
4537 {
4538    Temp src00 = bld.tmp(src0.type(), 1);
4539    Temp src01 = bld.tmp(src0.type(), 1);
4540    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4541
4542    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4543       Temp dst0 = bld.tmp(v1);
4544       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4545       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4546       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4547    } else {
4548       Temp carry = bld.tmp(s1);
4549       Temp dst0 =
4550          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4551       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4552       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4553    }
4554 }
4555
4556 void
4557 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4558                      uint32_t* const_offset_inout, Temp* offset_inout)
4559 {
4560    Temp address = *address_inout;
4561    uint64_t const_offset = *const_offset_inout + offset_in;
4562    Temp offset = *offset_inout;
4563
4564    uint64_t max_const_offset_plus_one =
4565       1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4566    if (bld.program->gfx_level >= GFX9)
4567       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4568    else if (bld.program->gfx_level == GFX6)
4569       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4570    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4571    const_offset %= max_const_offset_plus_one;
4572
4573    if (!offset.id()) {
4574       while (unlikely(excess_offset > UINT32_MAX)) {
4575          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4576          excess_offset -= UINT32_MAX;
4577       }
4578       if (excess_offset)
4579          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4580    } else {
4581       /* If we add to "offset", we would transform the indended
4582        * "address + u2u64(offset) + u2u64(const_offset)" into
4583        * "address + u2u64(offset + const_offset)", so add to the address.
4584        * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4585        * but that should be really rare.
4586        */
4587       while (excess_offset) {
4588          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4589          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4590          excess_offset -= src2;
4591       }
4592    }
4593
4594    if (bld.program->gfx_level == GFX6) {
4595       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4596       if (offset.type() != RegType::sgpr) {
4597          address = add64_32(bld, address, offset);
4598          offset = Temp();
4599       }
4600       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4601    } else if (bld.program->gfx_level <= GFX8) {
4602       /* GFX7,8 (FLAT): VGPR address */
4603       if (offset.id()) {
4604          address = add64_32(bld, address, offset);
4605          offset = Temp();
4606       }
4607       address = as_vgpr(bld, address);
4608    } else {
4609       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4610       if (address.type() == RegType::vgpr && offset.id()) {
4611          address = add64_32(bld, address, offset);
4612          offset = Temp();
4613       } else if (address.type() == RegType::sgpr && offset.id()) {
4614          offset = as_vgpr(bld, offset);
4615       }
4616       if (address.type() == RegType::sgpr && !offset.id())
4617          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4618    }
4619
4620    *address_inout = address;
4621    *const_offset_inout = const_offset;
4622    *offset_inout = offset;
4623 }
4624
4625 Temp
4626 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4627                      unsigned align_, unsigned const_offset, Temp dst_hint)
4628 {
4629    Temp addr = info.resource;
4630    if (!addr.id()) {
4631       addr = offset;
4632       offset = Temp();
4633    }
4634    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4635
4636    unsigned bytes_size = 0;
4637    bool use_mubuf = bld.program->gfx_level == GFX6;
4638    bool global = bld.program->gfx_level >= GFX9;
4639    aco_opcode op;
4640    if (bytes_needed == 1 || align_ % 2u) {
4641       bytes_size = 1;
4642       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4643            : global  ? aco_opcode::global_load_ubyte
4644                      : aco_opcode::flat_load_ubyte;
4645    } else if (bytes_needed == 2 || align_ % 4u) {
4646       bytes_size = 2;
4647       op = use_mubuf ? aco_opcode::buffer_load_ushort
4648            : global  ? aco_opcode::global_load_ushort
4649                      : aco_opcode::flat_load_ushort;
4650    } else if (bytes_needed <= 4) {
4651       bytes_size = 4;
4652       op = use_mubuf ? aco_opcode::buffer_load_dword
4653            : global  ? aco_opcode::global_load_dword
4654                      : aco_opcode::flat_load_dword;
4655    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4656       bytes_size = 8;
4657       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4658            : global  ? aco_opcode::global_load_dwordx2
4659                      : aco_opcode::flat_load_dwordx2;
4660    } else if (bytes_needed <= 12 && !use_mubuf) {
4661       bytes_size = 12;
4662       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4663    } else {
4664       bytes_size = 16;
4665       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4666            : global  ? aco_opcode::global_load_dwordx4
4667                      : aco_opcode::flat_load_dwordx4;
4668    }
4669    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4670    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4671    if (use_mubuf) {
4672       aco_ptr<MUBUF_instruction> mubuf{
4673          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4674       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4675       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4676       mubuf->operands[2] = Operand(offset);
4677       mubuf->glc = info.glc;
4678       mubuf->dlc = false;
4679       mubuf->offset = const_offset;
4680       mubuf->addr64 = addr.type() == RegType::vgpr;
4681       mubuf->disable_wqm = false;
4682       mubuf->sync = info.sync;
4683       mubuf->definitions[0] = Definition(val);
4684       bld.insert(std::move(mubuf));
4685    } else {
4686       aco_ptr<FLAT_instruction> flat{
4687          create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4688       if (addr.regClass() == s2) {
4689          assert(global && offset.id() && offset.type() == RegType::vgpr);
4690          flat->operands[0] = Operand(offset);
4691          flat->operands[1] = Operand(addr);
4692       } else {
4693          assert(addr.type() == RegType::vgpr && !offset.id());
4694          flat->operands[0] = Operand(addr);
4695          flat->operands[1] = Operand(s1);
4696       }
4697       flat->glc = info.glc;
4698       flat->dlc =
4699          info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4700       flat->sync = info.sync;
4701       assert(global || !const_offset);
4702       flat->offset = const_offset;
4703       flat->definitions[0] = Definition(val);
4704       bld.insert(std::move(flat));
4705    }
4706
4707    return val;
4708 }
4709
/* Load parameters built around global_load_callback.
 * NOTE(review): the remaining initializers are positional; check the
 * EmitLoadParameters declaration for their meaning. */
const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
4711
4712 Temp
4713 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4714          Temp address, unsigned base_offset, unsigned align)
4715 {
4716    assert(util_is_power_of_two_nonzero(align));
4717
4718    Builder bld(ctx->program, ctx->block);
4719
4720    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4721    info.align_mul = align;
4722    info.align_offset = 0;
4723    info.sync = memory_sync_info(storage_shared);
4724    info.const_offset = base_offset;
4725    emit_load(ctx, bld, info, lds_load_params);
4726
4727    return dst;
4728 }
4729
4730 void
4731 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4732                  Temp src)
4733 {
4734    if (!count)
4735       return;
4736
4737    Builder bld(ctx->program, ctx->block);
4738
4739    /* count == 1 fast path */
4740    if (count == 1) {
4741       if (dst_type == RegType::sgpr)
4742          dst[0] = bld.as_uniform(src);
4743       else
4744          dst[0] = as_vgpr(ctx, src);
4745       return;
4746    }
4747
4748    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4749    unsigned elem_size_bytes =
4750       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4751
4752    ASSERTED bool is_subdword = elem_size_bytes < 4;
4753    assert(!is_subdword || dst_type == RegType::vgpr);
4754
4755    for (unsigned i = 0; i < count; i++)
4756       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4757
4758    std::vector<Temp> temps;
4759    /* use allocated_vec if possible */
4760    auto it = ctx->allocated_vec.find(src.id());
4761    if (it != ctx->allocated_vec.end()) {
4762       if (!it->second[0].id())
4763          goto split;
4764       unsigned elem_size = it->second[0].bytes();
4765       assert(src.bytes() % elem_size == 0);
4766
4767       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4768          if (!it->second[i].id())
4769             goto split;
4770       }
4771       if (elem_size_bytes % elem_size)
4772          goto split;
4773
4774       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4775       elem_size_bytes = elem_size;
4776    }
4777
4778 split:
4779    /* split src if necessary */
4780    if (temps.empty()) {
4781       if (is_subdword && src.type() == RegType::sgpr)
4782          src = as_vgpr(ctx, src);
4783       if (dst_type == RegType::sgpr)
4784          src = bld.as_uniform(src);
4785
4786       unsigned num_elems = src.bytes() / elem_size_bytes;
4787       aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4788          aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4789       split->operands[0] = Operand(src);
4790       for (unsigned i = 0; i < num_elems; i++) {
4791          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4792          split->definitions[i] = Definition(temps.back());
4793       }
4794       bld.insert(std::move(split));
4795    }
4796
4797    unsigned idx = 0;
4798    for (unsigned i = 0; i < count; i++) {
4799       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4800       if (op_count == 1) {
4801          if (dst_type == RegType::sgpr)
4802             dst[i] = bld.as_uniform(temps[idx++]);
4803          else
4804             dst[i] = as_vgpr(ctx, temps[idx++]);
4805          continue;
4806       }
4807
4808       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4809                                                                       Format::PSEUDO, op_count, 1)};
4810       for (unsigned j = 0; j < op_count; j++) {
4811          Temp tmp = temps[idx++];
4812          if (dst_type == RegType::sgpr)
4813             tmp = bld.as_uniform(tmp);
4814          vec->operands[j] = Operand(tmp);
4815       }
4816       vec->definitions[0] = Definition(dst[i]);
4817       bld.insert(std::move(vec));
4818    }
4819    return;
4820 }
4821
4822 bool
4823 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4824 {
4825    unsigned start_elem = ffs(todo_mask) - 1;
4826    bool skip = !(mask & (1 << start_elem));
4827    if (skip)
4828       mask = ~mask & todo_mask;
4829
4830    mask &= todo_mask;
4831
4832    u_bit_scan_consecutive_range(&mask, start, count);
4833
4834    return !skip;
4835 }
4836
4837 void
4838 advance_write_mask(uint32_t* todo_mask, int start, int count)
4839 {
4840    *todo_mask &= ~u_bit_consecutive(0, count) << start;
4841 }
4842
4843 void
4844 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4845           unsigned base_offset, unsigned align)
4846 {
4847    assert(util_is_power_of_two_nonzero(align));
4848    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4849
4850    Builder bld(ctx->program, ctx->block);
4851    bool large_ds_write = ctx->options->gfx_level >= GFX7;
4852    bool usable_write2 = ctx->options->gfx_level >= GFX7;
4853
4854    unsigned write_count = 0;
4855    Temp write_datas[32];
4856    unsigned offsets[32];
4857    unsigned bytes[32];
4858    aco_opcode opcodes[32];
4859
4860    wrmask = util_widen_mask(wrmask, elem_size_bytes);
4861
4862    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
4863    uint32_t todo = u_bit_consecutive(0, data.bytes());
4864
4865    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
4866       todo = MIN2(todo, wrmask);
4867
4868    while (todo) {
4869       int offset, byte;
4870       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4871          offsets[write_count] = offset;
4872          bytes[write_count] = byte;
4873          opcodes[write_count] = aco_opcode::num_opcodes;
4874          write_count++;
4875          advance_write_mask(&todo, offset, byte);
4876          continue;
4877       }
4878
4879       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4880       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4881       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4882       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4883
4884       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4885       aco_opcode op = aco_opcode::num_opcodes;
4886       if (byte >= 16 && aligned16 && large_ds_write) {
4887          op = aco_opcode::ds_write_b128;
4888          byte = 16;
4889       } else if (byte >= 12 && aligned16 && large_ds_write) {
4890          op = aco_opcode::ds_write_b96;
4891          byte = 12;
4892       } else if (byte >= 8 && aligned8) {
4893          op = aco_opcode::ds_write_b64;
4894          byte = 8;
4895       } else if (byte >= 4 && aligned4) {
4896          op = aco_opcode::ds_write_b32;
4897          byte = 4;
4898       } else if (byte >= 2 && aligned2) {
4899          op = aco_opcode::ds_write_b16;
4900          byte = 2;
4901       } else if (byte >= 1) {
4902          op = aco_opcode::ds_write_b8;
4903          byte = 1;
4904       } else {
4905          assert(false);
4906       }
4907
4908       offsets[write_count] = offset;
4909       bytes[write_count] = byte;
4910       opcodes[write_count] = op;
4911       write_count++;
4912       advance_write_mask(&todo, offset, byte);
4913    }
4914
4915    Operand m = load_lds_size_m0(bld);
4916
4917    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4918
4919    for (unsigned i = 0; i < write_count; i++) {
4920       aco_opcode op = opcodes[i];
4921       if (op == aco_opcode::num_opcodes)
4922          continue;
4923
4924       Temp split_data = write_datas[i];
4925
4926       unsigned second = write_count;
4927       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4928          for (second = i + 1; second < write_count; second++) {
4929             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4930                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4931                opcodes[second] = aco_opcode::num_opcodes;
4932                break;
4933             }
4934          }
4935       }
4936
4937       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4938       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4939
4940       unsigned inline_offset = base_offset + offsets[i];
4941       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4942       Temp address_offset = address;
4943       if (inline_offset > max_offset) {
4944          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4945          inline_offset = offsets[i];
4946       }
4947
4948       /* offsets[i] shouldn't be large enough for this to happen */
4949       assert(inline_offset <= max_offset);
4950
4951       Instruction* instr;
4952       if (write2) {
4953          Temp second_data = write_datas[second];
4954          inline_offset /= split_data.bytes();
4955          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4956                         inline_offset + write2_off);
4957       } else {
4958          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4959       }
4960       instr->ds().sync = memory_sync_info(storage_shared);
4961
4962       if (m.isUndefined())
4963          instr->operands.pop_back();
4964    }
4965 }
4966
4967 aco_opcode
4968 get_buffer_store_op(unsigned bytes)
4969 {
4970    switch (bytes) {
4971    case 1: return aco_opcode::buffer_store_byte;
4972    case 2: return aco_opcode::buffer_store_short;
4973    case 4: return aco_opcode::buffer_store_dword;
4974    case 8: return aco_opcode::buffer_store_dwordx2;
4975    case 12: return aco_opcode::buffer_store_dwordx3;
4976    case 16: return aco_opcode::buffer_store_dwordx4;
4977    }
4978    unreachable("Unexpected store size");
4979    return aco_opcode::num_opcodes;
4980 }
4981
4982 void
4983 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4984                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4985                    Temp* write_datas, unsigned* offsets)
4986 {
4987    unsigned write_count_with_skips = 0;
4988    bool skips[16];
4989    unsigned bytes[16];
4990
4991    /* determine how to split the data */
4992    unsigned todo = u_bit_consecutive(0, data.bytes());
4993    while (todo) {
4994       int offset, byte;
4995       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4996       offsets[write_count_with_skips] = offset;
4997       if (skips[write_count_with_skips]) {
4998          bytes[write_count_with_skips] = byte;
4999          advance_write_mask(&todo, offset, byte);
5000          write_count_with_skips++;
5001          continue;
5002       }
5003
5004       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
5005        * larger than swizzle_element_size */
5006       byte = MIN2(byte, swizzle_element_size);
5007       if (byte % 4)
5008          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5009
5010       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5011       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5012          byte = 8;
5013
5014       /* dword or larger stores have to be dword-aligned */
5015       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5016       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5017       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5018       if (!dword_aligned)
5019          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5020
5021       bytes[write_count_with_skips] = byte;
5022       advance_write_mask(&todo, offset, byte);
5023       write_count_with_skips++;
5024    }
5025
5026    /* actually split data */
5027    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5028
5029    /* remove skips */
5030    for (unsigned i = 0; i < write_count_with_skips; i++) {
5031       if (skips[i])
5032          continue;
5033       write_datas[*write_count] = write_datas[i];
5034       offsets[*write_count] = offsets[i];
5035       (*write_count)++;
5036    }
5037 }
5038
5039 Temp
5040 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5041                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5042 {
5043    Builder bld(ctx->program, ctx->block);
5044    unsigned dword_size = elem_size_bytes / 4;
5045
5046    if (!dst.id())
5047       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5048
5049    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5050    aco_ptr<Pseudo_instruction> instr{
5051       create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5052    instr->definitions[0] = Definition(dst);
5053
5054    for (unsigned i = 0; i < cnt; ++i) {
5055       if (arr[i].id()) {
5056          assert(arr[i].size() == dword_size);
5057          allocated_vec[i] = arr[i];
5058          instr->operands[i] = Operand(arr[i]);
5059       } else {
5060          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5061                               Operand::zero(dword_size == 2 ? 8 : 4));
5062          allocated_vec[i] = zero;
5063          instr->operands[i] = Operand(zero);
5064       }
5065    }
5066
5067    bld.insert(std::move(instr));
5068
5069    if (split_cnt)
5070       emit_split_vector(ctx, dst, split_cnt);
5071    else
5072       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5073
5074    return dst;
5075 }
5076
5077 inline unsigned
5078 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5079 {
5080    if (const_offset >= 4096) {
5081       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5082       const_offset %= 4096u;
5083
5084       if (!voffset.id())
5085          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5086       else if (unlikely(voffset.regClass() == s1))
5087          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5088                             Operand::c32(excess_const_offset), Operand(voffset));
5089       else if (likely(voffset.regClass() == v1))
5090          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5091       else
5092          unreachable("Unsupported register class of voffset");
5093    }
5094
5095    return const_offset;
5096 }
5097
5098 void
5099 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5100                         Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc,
5101                         bool slc, bool swizzled)
5102 {
5103    assert(vdata.id());
5104    assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5105    assert(vdata.size() >= 1 && vdata.size() <= 4);
5106
5107    Builder bld(ctx->program, ctx->block);
5108    aco_opcode op = get_buffer_store_op(vdata.bytes());
5109    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
5110
5111    bool offen = voffset.id();
5112    bool idxen = idx.id();
5113
5114    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
5115    glc &= ctx->program->gfx_level < GFX11;
5116
5117    Operand vaddr_op(v1);
5118    if (offen && idxen)
5119       vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
5120    else if (offen)
5121       vaddr_op = Operand(voffset);
5122    else if (idxen)
5123       vaddr_op = Operand(idx);
5124
5125    Builder::Result r =
5126       bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen,
5127                 swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
5128                 /* dlc*/ false, slc);
5129
5130    r->mubuf().sync = sync;
5131 }
5132
5133 void
5134 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5135                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
5136                  bool swizzled, memory_sync_info sync, bool glc, bool slc)
5137 {
5138    Builder bld(ctx->program, ctx->block);
5139    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
5140           elem_size_bytes == 8);
5141    assert(write_mask);
5142    write_mask = util_widen_mask(write_mask, elem_size_bytes);
5143
5144    unsigned write_count = 0;
5145    Temp write_datas[32];
5146    unsigned offsets[32];
5147    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
5148                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
5149                       write_datas, offsets);
5150
5151    for (unsigned i = 0; i < write_count; i++) {
5152       unsigned const_offset = offsets[i] + base_const_offset;
5153       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset,
5154                               sync, glc, slc, swizzled);
5155    }
5156 }
5157
5158 Temp
5159 wave_id_in_threadgroup(isel_context* ctx)
5160 {
5161    Builder bld(ctx->program, ctx->block);
5162    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5163                    get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5164 }
5165
5166 Temp
5167 thread_id_in_threadgroup(isel_context* ctx)
5168 {
5169    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5170
5171    Builder bld(ctx->program, ctx->block);
5172    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5173
5174    if (ctx->program->workgroup_size <= ctx->program->wave_size)
5175       return tid_in_wave;
5176
5177    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5178    Temp num_pre_threads =
5179       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5180                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5181    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5182 }
5183
5184 bool
5185 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5186 {
5187    unsigned write_mask = nir_intrinsic_write_mask(instr);
5188    unsigned component = nir_intrinsic_component(instr);
5189    nir_src offset = *nir_get_io_offset_src(instr);
5190
5191    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5192       return false;
5193
5194    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5195
5196    if (instr->src[0].ssa->bit_size == 64)
5197       write_mask = util_widen_mask(write_mask, 2);
5198
5199    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5200
5201    /* Use semantic location as index. radv already uses it as intrinsic base
5202     * but radeonsi does not. We need to make LS output and TCS input index
5203     * match each other, so need to use semantic location explicitly. Also for
5204     * TCS epilog to index tess factor temps using semantic location directly.
5205     */
5206    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5207    unsigned base = sem.location + sem.dual_source_blend_index;
5208    unsigned idx = base * 4u + component;
5209
5210    for (unsigned i = 0; i < 8; ++i) {
5211       if (write_mask & (1 << i)) {
5212          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5213          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5214       }
5215       idx++;
5216    }
5217
5218    if (ctx->stage == fragment_fs && ctx->program->info.has_epilog) {
5219       unsigned index = base - FRAG_RESULT_DATA0;
5220
5221       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5222          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5223       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5224          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5225       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5226          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5227       }
5228    }
5229
5230    return true;
5231 }
5232
5233 bool
5234 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5235 {
5236    /* Only TCS per-vertex inputs are supported by this function.
5237     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
5238     * is the same.
5239     */
5240    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5241       return false;
5242
5243    nir_src* off_src = nir_get_io_offset_src(instr);
5244    nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5245    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5246    bool can_use_temps =
5247       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5248       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5249
5250    if (!can_use_temps)
5251       return false;
5252
5253    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5254
5255    unsigned idx =
5256       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5257    Temp* src = &ctx->inputs.temps[idx];
5258    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5259
5260    return true;
5261 }
5262
5263 void
5264 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5265 {
5266    /* LS pass output to TCS by temp if they have same in/out patch size. */
5267    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5268                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5269
5270    bool tcs_need_output = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL &&
5271                           ctx->program->info.has_epilog &&
5272                           ctx->program->info.tcs.pass_tessfactors_by_reg;
5273
5274    bool ps_need_output = ctx->stage == fragment_fs;
5275
5276    if (ls_need_output || tcs_need_output || ps_need_output) {
5277       bool stored_to_temps = store_output_to_temps(ctx, instr);
5278       if (!stored_to_temps) {
5279          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5280          abort();
5281       }
5282    } else {
5283       unreachable("Shader stage not implemented");
5284    }
5285 }
5286
5287 bool
5288 in_exec_divergent_or_in_loop(isel_context* ctx)
5289 {
5290    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5291           ctx->cf_info.had_divergent_discard;
5292 }
5293
5294 void
5295 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5296                         Temp prim_mask)
5297 {
5298    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5299    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5300
5301    Builder bld(ctx->program, ctx->block);
5302
5303    if (in_exec_divergent_or_in_loop(ctx)) {
5304       Operand prim_mask_op = bld.m0(prim_mask);
5305       prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5306       Operand coord2_op(coord2);
5307       coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
5308       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5309                  Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op);
5310       return;
5311    }
5312
5313    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5314
5315    Temp res;
5316    if (dst.regClass() == v2b) {
5317       Temp p10 =
5318          bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
5319       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
5320    } else {
5321       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5322       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), p, coord2, p10);
5323    }
5324    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5325    if (dst.regClass() != v2b)
5326       emit_wqm(bld, res, dst, true);
5327    else
5328       emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
5329 }
5330
5331 void
5332 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5333                   Temp prim_mask)
5334 {
5335    if (ctx->options->gfx_level >= GFX11) {
5336       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask);
5337       return;
5338    }
5339
5340    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5341    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5342
5343    Builder bld(ctx->program, ctx->block);
5344
5345    if (dst.regClass() == v2b) {
5346       if (ctx->program->dev.has_16bank_lds) {
5347          assert(ctx->options->gfx_level <= GFX8);
5348          Builder::Result interp_p1 =
5349             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5350                        bld.m0(prim_mask), idx, component);
5351          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
5352                                 bld.m0(prim_mask), interp_p1, idx, component);
5353          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5354                     interp_p1, idx, component);
5355       } else {
5356          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5357
5358          if (ctx->options->gfx_level == GFX8)
5359             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5360
5361          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5362                                                 bld.m0(prim_mask), idx, component);
5363          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5364                     component);
5365       }
5366    } else {
5367       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5368                                              bld.m0(prim_mask), idx, component);
5369
5370       if (ctx->program->dev.has_16bank_lds)
5371          interp_p1->operands[0].setLateKill(true);
5372
5373       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5374                  idx, component);
5375    }
5376 }
5377
5378 void
5379 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5380                       Temp dst, Temp prim_mask)
5381 {
5382    Builder bld(ctx->program, ctx->block);
5383    if (ctx->options->gfx_level >= GFX11) {
5384       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5385       if (in_exec_divergent_or_in_loop(ctx)) {
5386          Operand prim_mask_op = bld.m0(prim_mask);
5387          prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5388          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5389                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5390                     prim_mask_op);
5391       } else {
5392          Temp p =
5393             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5394          Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
5395
5396          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5397          if (dst.regClass() != v2b)
5398             emit_wqm(bld, res, dst, true);
5399          else
5400             emit_extract_vector(ctx, emit_wqm(bld, res, Temp(0, s1), true), 0, dst);
5401       }
5402    } else {
5403       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
5404                  bld.m0(prim_mask), idx, component);
5405    }
5406 }
5407
5408 void
5409 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5410 {
5411    Builder bld(ctx->program, ctx->block);
5412
5413    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5414       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5415    for (unsigned i = 0; i < num_components; i++) {
5416       if (ctx->args->frag_pos[i].used)
5417          vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5418       else
5419          vec->operands[i] = Operand(v1);
5420    }
5421    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5422       assert(num_components == 4);
5423       vec->operands[3] =
5424          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5425    }
5426
5427    for (Operand& op : vec->operands)
5428       op = op.isUndefined() ? Operand::zero() : op;
5429
5430    vec->definitions[0] = Definition(dst);
5431    ctx->block->instructions.emplace_back(std::move(vec));
5432    emit_split_vector(ctx, dst, num_components);
5433    return;
5434 }
5435
5436 void
5437 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5438 {
5439    Builder bld(ctx->program, ctx->block);
5440    Temp cond;
5441
5442    /* VRS Rate X = Ancillary[2:3]
5443     * VRS Rate Y = Ancillary[4:5]
5444     */
5445    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5446                           Operand::c32(2u), Operand::c32(2u));
5447    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5448                           Operand::c32(4u), Operand::c32(2u));
5449
5450    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5451    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5452    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5453                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5454
5455    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5456    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5457    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5458                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5459
5460    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5461 }
5462
5463 void
5464 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5465 {
5466    Temp dst = get_ssa_temp(ctx, &instr->def);
5467    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5468    unsigned idx = nir_intrinsic_base(instr);
5469    unsigned component = nir_intrinsic_component(instr);
5470    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5471
5472    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5473
5474    if (instr->def.num_components == 1) {
5475       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5476    } else {
5477       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5478          aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
5479       for (unsigned i = 0; i < instr->def.num_components; i++) {
5480          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5481          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5482          vec->operands[i] = Operand(tmp);
5483       }
5484       vec->definitions[0] = Definition(dst);
5485       ctx->block->instructions.emplace_back(std::move(vec));
5486    }
5487 }
5488
5489 Temp
5490 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5491                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5492 {
5493    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5494    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5495
5496    if (info.soffset.id()) {
5497       if (soffset.isTemp())
5498          vaddr = bld.copy(bld.def(v1), soffset);
5499       soffset = Operand(info.soffset);
5500    }
5501
5502    if (soffset.isUndefined())
5503       soffset = Operand::zero();
5504
5505    const bool offen = !vaddr.isUndefined();
5506    const bool idxen = info.idx.id();
5507
5508    if (offen && idxen)
5509       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5510    else if (idxen)
5511       vaddr = Operand(info.idx);
5512
5513    /* Determine number of fetched components.
5514     * Note, ACO IR works with GFX6-8 nfmt + dfmt fields, these are later converted for GFX10+.
5515     */
5516    const struct ac_vtx_format_info* vtx_info =
5517       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5518    /* The number of channels in the format determines the memory range. */
5519    const unsigned max_components = vtx_info->num_channels;
5520    /* Calculate maximum number of components loaded according to alignment. */
5521    unsigned max_fetched_components = bytes_needed / info.component_size;
5522    max_fetched_components =
5523       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5524                              alignment, max_fetched_components);
5525    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5526    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5527     * If a larger format is selected, it's still OK to load a smaller amount from it.
5528     */
5529    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5530    unsigned bytes_size = 0;
5531    const unsigned bit_size = info.component_size * 8;
5532    aco_opcode op = aco_opcode::num_opcodes;
5533
5534    if (bytes_needed == 2) {
5535       bytes_size = 2;
5536       op = aco_opcode::tbuffer_load_format_d16_x;
5537    } else if (bytes_needed <= 4) {
5538       bytes_size = 4;
5539       if (bit_size == 16)
5540          op = aco_opcode::tbuffer_load_format_d16_xy;
5541       else
5542          op = aco_opcode::tbuffer_load_format_x;
5543    } else if (bytes_needed <= 6) {
5544       bytes_size = 6;
5545       if (bit_size == 16)
5546          op = aco_opcode::tbuffer_load_format_d16_xyz;
5547       else
5548          op = aco_opcode::tbuffer_load_format_xy;
5549    } else if (bytes_needed <= 8) {
5550       bytes_size = 8;
5551       if (bit_size == 16)
5552          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5553       else
5554          op = aco_opcode::tbuffer_load_format_xy;
5555    } else if (bytes_needed <= 12) {
5556       bytes_size = 12;
5557       op = aco_opcode::tbuffer_load_format_xyz;
5558    } else {
5559       bytes_size = 16;
5560       op = aco_opcode::tbuffer_load_format_xyzw;
5561    }
5562
5563    /* Abort when suitable opcode wasn't found so we don't compile buggy shaders. */
5564    if (op == aco_opcode::num_opcodes) {
5565       aco_err(bld.program, "unsupported bit size for typed buffer load");
5566       abort();
5567    }
5568
5569    aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
5570    mtbuf->operands[0] = Operand(info.resource);
5571    mtbuf->operands[1] = vaddr;
5572    mtbuf->operands[2] = soffset;
5573    mtbuf->offen = offen;
5574    mtbuf->idxen = idxen;
5575    mtbuf->glc = info.glc;
5576    mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
5577    mtbuf->slc = info.slc;
5578    mtbuf->sync = info.sync;
5579    mtbuf->offset = const_offset;
5580    mtbuf->dfmt = fetch_fmt & 0xf;
5581    mtbuf->nfmt = fetch_fmt >> 4;
5582    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5583    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5584    mtbuf->definitions[0] = Definition(val);
5585    bld.insert(std::move(mtbuf));
5586
5587    return val;
5588 }
5589
/* Load parameters for typed buffer (MTBUF) loads; the individual load
 * instructions are created by mtbuf_load_callback.
 * NOTE(review): presumably handed to emit_load() in the same way as
 * smem_load_params / mubuf_load_params. The remaining positional initializers
 * (false, true, 4096) are EmitLoadParameters fields declared outside this
 * chunk - confirm their meaning against the struct definition.
 */
const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5591
5592 void
5593 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5594 {
5595    Builder bld(ctx->program, ctx->block);
5596    Temp dst = get_ssa_temp(ctx, &instr->def);
5597    nir_src offset = *nir_get_io_offset_src(instr);
5598
5599    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5600       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5601
5602    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5603
5604    unsigned idx = nir_intrinsic_base(instr);
5605    unsigned component = nir_intrinsic_component(instr);
5606    unsigned vertex_id = 0; /* P0 */
5607
5608    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5609       vertex_id = nir_src_as_uint(instr->src[0]);
5610
5611    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5612       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask);
5613    } else {
5614       unsigned num_components = instr->def.num_components;
5615       if (instr->def.bit_size == 64)
5616          num_components *= 2;
5617       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5618          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5619       for (unsigned i = 0; i < num_components; i++) {
5620          unsigned chan_component = (component + i) % 4;
5621          unsigned chan_idx = idx + (component + i) / 4;
5622          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5623          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5624                                prim_mask);
5625       }
5626       vec->definitions[0] = Definition(dst);
5627       bld.insert(std::move(vec));
5628    }
5629 }
5630
5631 void
5632 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5633 {
5634    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5635
5636    Builder bld(ctx->program, ctx->block);
5637    Temp dst = get_ssa_temp(ctx, &instr->def);
5638
5639    if (load_input_from_temps(ctx, instr, dst))
5640       return;
5641
5642    unreachable("LDS-based TCS input should have been lowered in NIR.");
5643 }
5644
5645 void
5646 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5647 {
5648    switch (ctx->shader->info.stage) {
5649    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5650    default: unreachable("Unimplemented shader stage");
5651    }
5652 }
5653
5654 void
5655 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5656 {
5657    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5658
5659    Builder bld(ctx->program, ctx->block);
5660    Temp dst = get_ssa_temp(ctx, &instr->def);
5661
5662    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5663    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5664    Operand tes_w = Operand::zero();
5665
5666    if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5667       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5668       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5669       tes_w = Operand(tmp);
5670    }
5671
5672    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5673    emit_split_vector(ctx, tess_coord, 3);
5674 }
5675
5676 void
5677 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5678             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5679             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5680 {
5681    Builder bld(ctx->program, ctx->block);
5682
5683    bool use_smem =
5684       dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5685    if (use_smem)
5686       offset = bld.as_uniform(offset);
5687    else {
5688       /* GFX6-7 are affected by a hw bug that prevents address clamping to
5689        * work correctly when the SGPR offset is used.
5690        */
5691       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5692          offset = as_vgpr(ctx, offset);
5693    }
5694
5695    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5696    info.glc = glc;
5697    info.sync = sync;
5698    info.align_mul = align_mul;
5699    info.align_offset = align_offset;
5700    if (use_smem)
5701       emit_load(ctx, bld, info, smem_load_params);
5702    else
5703       emit_load(ctx, bld, info, mubuf_load_params);
5704 }
5705
5706 void
5707 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5708 {
5709    Temp dst = get_ssa_temp(ctx, &instr->def);
5710    Builder bld(ctx->program, ctx->block);
5711    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5712
5713    unsigned size = instr->def.bit_size / 8;
5714    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5715                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5716 }
5717
5718 void
5719 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5720 {
5721    Builder bld(ctx->program, ctx->block);
5722    Temp dst = get_ssa_temp(ctx, &instr->def);
5723    unsigned offset = nir_intrinsic_base(instr);
5724    unsigned count = instr->def.num_components;
5725    nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5726
5727    if (instr->def.bit_size == 64)
5728       count *= 2;
5729
5730    if (index_cv && instr->def.bit_size >= 32) {
5731       unsigned start = (offset + index_cv->u32) / 4u;
5732       uint64_t mask = BITFIELD64_MASK(count) << start;
5733       if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
5734           start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
5735          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5736          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5737             aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5738          unsigned arg_index =
5739             util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
5740          for (unsigned i = 0; i < count; ++i) {
5741             elems[i] = get_arg(ctx, ctx->args->inline_push_consts[arg_index++]);
5742             vec->operands[i] = Operand{elems[i]};
5743          }
5744          vec->definitions[0] = Definition(dst);
5745          ctx->block->instructions.emplace_back(std::move(vec));
5746          ctx->allocated_vec.emplace(dst.id(), elems);
5747          return;
5748       }
5749    }
5750
5751    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5752    if (offset != 0) // TODO check if index != 0 as well
5753       index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5754                              Operand::c32(offset), index);
5755    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->push_constants));
5756    Temp vec = dst;
5757    bool trim = false;
5758    bool aligned = true;
5759
5760    if (instr->def.bit_size == 8) {
5761       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5762       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5763       if (!aligned)
5764          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5765    } else if (instr->def.bit_size == 16) {
5766       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5767       if (!aligned)
5768          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5769    }
5770
5771    aco_opcode op;
5772
5773    switch (vec.size()) {
5774    case 1: op = aco_opcode::s_load_dword; break;
5775    case 2: op = aco_opcode::s_load_dwordx2; break;
5776    case 3:
5777       vec = bld.tmp(s4);
5778       trim = true;
5779       FALLTHROUGH;
5780    case 4: op = aco_opcode::s_load_dwordx4; break;
5781    case 6:
5782       vec = bld.tmp(s8);
5783       trim = true;
5784       FALLTHROUGH;
5785    case 8: op = aco_opcode::s_load_dwordx8; break;
5786    default: unreachable("unimplemented or forbidden load_push_constant.");
5787    }
5788
5789    bld.smem(op, Definition(vec), ptr, index);
5790
5791    if (!aligned) {
5792       Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5793       byte_align_scalar(ctx, vec, byte_offset, dst);
5794       return;
5795    }
5796
5797    if (trim) {
5798       emit_split_vector(ctx, vec, 4);
5799       RegClass rc = dst.size() == 3 ? s1 : s2;
5800       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5801                  emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5802    }
5803    emit_split_vector(ctx, dst, instr->def.num_components);
5804 }
5805
5806 void
5807 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5808 {
5809    Temp dst = get_ssa_temp(ctx, &instr->def);
5810
5811    Builder bld(ctx->program, ctx->block);
5812
5813    uint32_t desc_type =
5814       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5815       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5816    if (ctx->options->gfx_level >= GFX10) {
5817       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5818                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5819                    S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5820    } else {
5821       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5822                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5823    }
5824
5825    unsigned base = nir_intrinsic_base(instr);
5826    unsigned range = nir_intrinsic_range(instr);
5827
5828    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5829    if (base && offset.type() == RegType::sgpr)
5830       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5831                               Operand::c32(base));
5832    else if (base && offset.type() == RegType::vgpr)
5833       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5834
5835    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5836                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5837                                      Operand::c32(ctx->constant_data_offset)),
5838                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5839                           Operand::c32(desc_type));
5840    unsigned size = instr->def.bit_size / 8;
5841    // TODO: get alignment information for subdword constants
5842    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5843 }
5844
5845 /* Packs multiple Temps of different sizes in to a vector of v1 Temps.
5846  * The byte count of each input Temp must be a multiple of 2.
5847  */
5848 static std::vector<Temp>
5849 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5850 {
5851    Builder bld(ctx->program, ctx->block);
5852    std::vector<Temp> packed;
5853    Temp low = Temp();
5854    for (Temp tmp : unpacked) {
5855       assert(tmp.bytes() % 2 == 0);
5856       unsigned byte_idx = 0;
5857       while (byte_idx < tmp.bytes()) {
5858          if (low != Temp()) {
5859             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5860             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
5861             low = Temp();
5862             packed.push_back(dword);
5863             byte_idx += 2;
5864          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
5865             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5866             byte_idx += 4;
5867          } else {
5868             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5869             byte_idx += 2;
5870          }
5871       }
5872    }
5873    if (low != Temp()) {
5874       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
5875       packed.push_back(dword);
5876    }
5877    return packed;
5878 }
5879
5880 static bool
5881 should_declare_array(ac_image_dim dim)
5882 {
5883    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5884           dim == ac_image_2darraymsaa;
5885 }
5886
5887 static int
5888 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5889 {
5890    switch (dim) {
5891    case GLSL_SAMPLER_DIM_BUF: return 1;
5892    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5893    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5894    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
5895    case GLSL_SAMPLER_DIM_3D:
5896    case GLSL_SAMPLER_DIM_CUBE: return 3;
5897    case GLSL_SAMPLER_DIM_RECT:
5898    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5899    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
5900    default: break;
5901    }
5902    return 0;
5903 }
5904
/* Builds a MIMG instruction and inserts it through the builder.
 *
 * Operand layout of the created instruction:
 *   [0] rsrc, [1] samp, [2] vdata, [3..] address operands taken from coords.
 * The first nsa_size coordinates are kept as individual operands (passed
 * through as_vgpr()); all coordinates from index nsa_size on are merged into a
 * single vector operand.
 *
 * dst may be a null Temp (id 0), in which case the instruction has no
 * definition. If needs_wqm is set (requires a dst), the instruction writes a
 * temporary that is forwarded to dst via emit_wqm().
 *
 * coords is taken by value because it is modified here; it must not be empty
 * since coords[0] is read unconditionally.
 *
 * Returns the raw pointer to the instruction (taken before it is moved into
 * the builder) so that callers can set further MIMG fields.
 */
static MIMG_instruction*
emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
          bool needs_wqm = false, Operand vdata = Operand(v1))
{
   /* Below GFX11, separate address operands are only used when all coords fit
    * into max_nsa_vgprs; otherwise nsa_size becomes 0 and everything is packed
    * into one vector below.
    */
   size_t nsa_size = bld.program->dev.max_nsa_vgprs;
   nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;

   /* A linear VGPR as first coordinate selects strict WQM; then every
    * coordinate keeps its own operand.
    */
   const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
   if (strict_wqm)
      nsa_size = coords.size();

   for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
      /* Skip null Temps. */
      if (!coords[i].id())
         continue;

      coords[i] = as_vgpr(bld, coords[i]);
   }

   if (nsa_size < coords.size()) {
      /* Merge coords[nsa_size..] into one operand. */
      Temp coord = coords[nsa_size];
      if (coords.size() - nsa_size > 1) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};

         /* Sum up the sizes of the merged coordinates to size the vector. */
         unsigned coord_size = 0;
         for (unsigned i = nsa_size; i < coords.size(); i++) {
            vec->operands[i - nsa_size] = Operand(coords[i]);
            coord_size += coords[i].size();
         }

         coord = bld.tmp(RegType::vgpr, coord_size);
         vec->definitions[0] = Definition(coord);
         bld.insert(std::move(vec));
      } else {
         coord = as_vgpr(bld, coord);
      }

      coords[nsa_size] = coord;
      coords.resize(nsa_size + 1);
   }

   bool has_dst = dst.id() != 0;
   assert(!needs_wqm || has_dst);
   /* With needs_wqm, load into a temporary and let emit_wqm() produce dst. */
   Temp tmp_dst = needs_wqm ? bld.tmp(dst.regClass()) : dst;

   aco_ptr<MIMG_instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
   if (has_dst)
      mimg->definitions[0] = Definition(tmp_dst);
   mimg->operands[0] = Operand(rsrc);
   mimg->operands[1] = samp;
   mimg->operands[2] = vdata;
   for (unsigned i = 0; i < coords.size(); i++)
      mimg->operands[3 + i] = Operand(coords[i]);
   mimg->strict_wqm = strict_wqm;

   /* Keep a raw pointer for the caller; ownership moves into the builder. */
   MIMG_instruction* res = mimg.get();
   bld.insert(std::move(mimg));
   if (needs_wqm)
      emit_wqm(bld, tmp_dst, dst, true);
   return res;
}
5967
5968 void
5969 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5970 {
5971    Builder bld(ctx->program, ctx->block);
5972    Temp dst = get_ssa_temp(ctx, &instr->def);
5973    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5974    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5975    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5976    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5977    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5978    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5979
5980    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
5981     * There are five smaller vector groups:
5982     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
5983     * These directly match the NIR intrinsic sources.
5984     */
5985    std::vector<Temp> args = {
5986       node, tmax, origin, dir, inv_dir,
5987    };
5988
5989    if (bld.program->gfx_level == GFX10_3) {
5990       std::vector<Temp> scalar_args;
5991       for (Temp tmp : args) {
5992          for (unsigned i = 0; i < tmp.size(); i++)
5993             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
5994       }
5995       args = std::move(scalar_args);
5996    }
5997
5998    MIMG_instruction* mimg =
5999       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6000    mimg->dim = ac_image_1d;
6001    mimg->dmask = 0xf;
6002    mimg->unrm = true;
6003    mimg->r128 = true;
6004
6005    emit_split_vector(ctx, dst, instr->def.num_components);
6006 }
6007
/* Collects the address operands for an image intrinsic.
 *
 * In order, the list contains: the coordinates from src[1] (with an extra
 * zero inserted after x for 1D images on GFX9), the base layer / LOD select
 * for the 2D-view-of-3D workaround, the sample index from src[2] for
 * multisampled images (except fragment mask loads), and the LOD if it is not
 * a constant zero. 16-bit coordinates (a16) are handled as v2b values.
 * The list is returned packed into dwords by emit_pack_v1().
 */
static std::vector<Temp>
get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
{

   Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
   bool a16 = instr->src[1].ssa->bit_size == 16;
   RegClass rc = a16 ? v2b : v1;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);
   ASSERTED bool add_frag_pos =
      (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
   assert(!add_frag_pos && "Input attachments should be lowered.");
   bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
   bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
   int count = image_type_to_components_count(dim, is_array);
   std::vector<Temp> coords;
   Builder bld(ctx->program, ctx->block);

   if (gfx9_1d) {
      /* 1D on GFX9: x, a zero second coordinate, then the layer for arrays. */
      coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
      coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
      if (is_array)
         coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
   } else {
      for (int i = 0; i < count; i++)
         coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
   }

   bool has_lod = false;
   Temp lod;

   /* Only plain/sparse loads and stores carry a LOD source: src[4] for stores,
    * src[3] for loads. A constant zero LOD is treated as "no LOD".
    */
   if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
       instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
       instr->intrinsic == nir_intrinsic_bindless_image_store) {
      int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
      assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
      has_lod =
         !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;

      if (has_lod)
         lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
   }

   if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
      /* The hw can't bind a slice of a 3D image as a 2D image, because it
       * ignores BASE_ARRAY if the target is 3D. The workaround is to read
       * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
       */
      assert(ctx->options->gfx_level == GFX9);
      Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
      Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
      /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
      Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
                                  Operand::c32(13u));

      if (has_lod) {
         /* If there's a lod parameter, it matters whether the image is 3D or
          * 2D because the hw reads either the fourth or the third component as
          * lod. So detect 3D images and keep the base layer as third component
          * for them; otherwise place the lod at the third component.
          * For non 3D descriptors we effectively add lod twice to coords,
          * but the hw will only read the first one, the second is ignored.
          */
         Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
         Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
                              Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
         Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
                                   Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
         first_layer =
            bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
      }

      /* With 16-bit addresses only the low half of the dword is used. */
      if (a16)
         coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
      else
         coords.emplace_back(first_layer);
   }

   /* Multisampled images take the sample index from src[2]. */
   if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
      assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
      coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
   }

   if (has_lod)
      coords.emplace_back(lod);

   return emit_pack_v1(ctx, coords);
}
6095
6096 memory_sync_info
6097 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6098 {
6099    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6100    if (semantics & semantic_atomicrmw)
6101       return memory_sync_info(storage, semantics);
6102
6103    unsigned access = nir_intrinsic_access(instr);
6104
6105    if (access & ACCESS_VOLATILE)
6106       semantics |= semantic_volatile;
6107    if (access & ACCESS_CAN_REORDER)
6108       semantics |= semantic_can_reorder | semantic_private;
6109
6110    return memory_sync_info(storage, semantics);
6111 }
6112
6113 Operand
6114 emit_tfe_init(Builder& bld, Temp dst)
6115 {
6116    Temp tmp = bld.tmp(dst.regClass());
6117
6118    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6119       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6120    for (unsigned i = 0; i < dst.size(); i++)
6121       vec->operands[i] = Operand::zero();
6122    vec->definitions[0] = Definition(tmp);
6123    /* Since this is fixed to an instruction's definition register, any CSE will
6124     * just create copies. Copying costs about the same as zero-initialization,
6125     * but these copies can break up clauses.
6126     */
6127    vec->definitions[0].setNoCSE(true);
6128    bld.insert(std::move(vec));
6129
6130    return Operand(tmp);
6131 }
6132
/* Selects an image load.
 *
 * Buffer images (GLSL_SAMPLER_DIM_BUF) use MUBUF buffer_load_format_* with the
 * first coordinate as index; all other dimensions use MIMG image_load /
 * image_load_mip with the coordinates from get_image_coords().
 * Only the components that are actually read are loaded (dmask); the result
 * is expanded to the full destination with expand_vector().
 * Sparse loads set TFE and return one extra dword after the data.
 */
void
visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);
   bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
   Temp dst = get_ssa_temp(ctx, &instr->def);

   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
   unsigned access = nir_intrinsic_access(instr);

   /* For sparse loads the last NIR component is not image data. */
   unsigned result_size = instr->def.num_components - is_sparse;
   unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
   expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
   /* Buffer loads fetch a contiguous run of components up to the last one read. */
   if (dim == GLSL_SAMPLER_DIM_BUF)
      expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
   unsigned dmask = expand_mask;
   if (instr->def.bit_size == 64) {
      expand_mask &= 0x9;
      /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
      dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
   }
   /* Always keep the extra sparse component. */
   if (is_sparse)
      expand_mask |= 1 << result_size;

   bool d16 = instr->def.bit_size == 16;
   assert(!d16 || !is_sparse);

   /* 2 bytes per loaded channel with d16, 4 otherwise, plus one dword for sparse loads. */
   unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;

   /* Load directly into dst when it is a VGPR of matching size. */
   Temp tmp;
   if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
      tmp = dst;
   else
      tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));

   Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));

   if (dim == GLSL_SAMPLER_DIM_BUF) {
      Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);

      /* Pick the opcode by the number of loaded channels. */
      aco_opcode opcode;
      if (!d16) {
         switch (util_bitcount(dmask)) {
         case 1: opcode = aco_opcode::buffer_load_format_x; break;
         case 2: opcode = aco_opcode::buffer_load_format_xy; break;
         case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
         case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
         default: unreachable(">4 channel buffer image load");
         }
      } else {
         switch (util_bitcount(dmask)) {
         case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
         case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
         case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
         case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
         default: unreachable(">4 channel buffer image load");
         }
      }
      /* Sparse loads take a fourth operand: the zero-initialized TFE data. */
      aco_ptr<MUBUF_instruction> load{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
      load->operands[0] = Operand(resource);
      load->operands[1] = Operand(vindex);
      load->operands[2] = Operand::c32(0);
      load->definitions[0] = Definition(tmp);
      load->idxen = true;
      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
      load->dlc =
         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
      load->sync = sync;
      load->tfe = is_sparse;
      if (load->tfe)
         load->operands[3] = emit_tfe_init(bld, tmp);
      ctx->block->instructions.emplace_back(std::move(load));
   } else {
      std::vector<Temp> coords = get_image_coords(ctx, instr);

      /* Fragment mask loads have no LOD source; other loads use image_load_mip
       * unless the LOD (src[3]) is a constant zero.
       */
      aco_opcode opcode;
      if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
         opcode = aco_opcode::image_load;
      } else {
         bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
         opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
      }

      Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
      MIMG_instruction* load =
         emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, false, vdata);
      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
      load->dlc =
         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
      load->a16 = instr->src[1].ssa->bit_size == 16;
      load->d16 = d16;
      load->dmask = dmask;
      load->unrm = true;
      load->tfe = is_sparse;

      if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
         /* Fragment mask loads always use a 2D (array) dim and default sync info. */
         load->dim = is_array ? ac_image_2darray : ac_image_2d;
         load->da = is_array;
         load->sync = memory_sync_info();
      } else {
         ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
         load->dim = sdim;
         load->da = should_declare_array(sdim);
         load->sync = sync;
      }
   }

   if (is_sparse && instr->def.bit_size == 64) {
      /* The result components are 64-bit but the sparse residency code is
       * 32-bit. So add a zero to the end so expand_vector() works correctly.
       */
      tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
                       Operand::zero());
   }

   expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
}
6253
6254 void
6255 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6256 {
        /* Emits the store for an image store intrinsic.
         *
         * Buffer images (GLSL_SAMPLER_DIM_BUF) become one MUBUF
         * buffer_store_format_* instruction and return early; every other
         * dimension becomes a MIMG image_store / image_store_mip instruction.
         *
         * Sources as used below: src[0] is the resource, the first component of
         * src[1] is the buffer index for buffer images (src[1] presumably holds
         * the coordinates in general), src[3] is the data to store and src[4] is
         * only tested for being a constant zero.
         */
6257    Builder bld(ctx->program, ctx->block);
6258    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6259    bool is_array = nir_intrinsic_image_array(instr);
6260    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6261    bool d16 = instr->src[3].ssa->bit_size == 16;
6262
6263    /* only R64_UINT and R64_SINT supported */
6264    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6265       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6266    data = as_vgpr(ctx, data);
6267
        /* 16-bit data counts NIR components; anything else uses data.size(). */
6268    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6269
6270    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6271    unsigned access = nir_intrinsic_access(instr);
        /* glc is always set on GFX6; on later generations only for volatile or
         * coherent access, and never on GFX11+.
         */
6272    bool glc = ctx->options->gfx_level == GFX6 ||
6273               ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
6274
        /* Buffer images: a single indexed (idxen) MUBUF format store. */
6275    if (dim == GLSL_SAMPLER_DIM_BUF) {
6276       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6277       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6278       aco_opcode opcode;
6279       if (!d16) {
6280          switch (num_components) {
6281          case 1: opcode = aco_opcode::buffer_store_format_x; break;
6282          case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6283          case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6284          case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6285          default: unreachable(">4 channel buffer image store");
6286          }
6287       } else {
6288          switch (num_components) {
6289          case 1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6290          case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6291          case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6292          case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6293          default: unreachable(">4 channel buffer image store");
6294          }
6295       }
6296       aco_ptr<MUBUF_instruction> store{
6297          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6298       store->operands[0] = Operand(rsrc);
6299       store->operands[1] = Operand(vindex);
6300       store->operands[2] = Operand::c32(0);
6301       store->operands[3] = Operand(data);
6302       store->idxen = true;
6303       store->glc = glc;
6304       store->dlc = false;
6305       store->disable_wqm = true;
6306       store->sync = sync;
6307       ctx->program->needs_exact = true;
6308       ctx->block->instructions.emplace_back(std::move(store));
6309       return;
6310    }
6311
6312    assert(data.type() == RegType::vgpr);
6313    std::vector<Temp> coords = get_image_coords(ctx, instr);
6314    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6315
        /* A constant-zero src[4] selects image_store; anything else needs
         * image_store_mip.
         */
6316    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6317    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6318
6319    uint32_t dmask = BITFIELD_MASK(num_components);
6320    /* remove zero/undef elements from data, components which aren't in dmask
6321     * are zeroed anyway
6322     */
6323    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6324       for (uint32_t i = 0; i < instr->num_components; i++) {
6325          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6326          if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
6327              nir_scalar_is_undef(comp))
6328             dmask &= ~BITFIELD_BIT(i);
6329       }
6330
6331       /* dmask cannot be 0, at least one vgpr is always read */
6332       if (dmask == 0)
6333          dmask = 1;
6334
           /* Some components were dropped: repack data so that it only holds
            * the components still enabled in dmask, in ascending bit order.
            */
6335       if (dmask != BITFIELD_MASK(num_components)) {
6336          uint32_t dmask_count = util_bitcount(dmask);
6337          RegClass rc = d16 ? v2b : v1;
6338          if (dmask_count == 1) {
6339             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6340          } else {
6341             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6342                aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6343             uint32_t index = 0;
6344             u_foreach_bit (bit, dmask) {
6345                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6346             }
6347             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6348             vec->definitions[0] = Definition(data);
6349             bld.insert(std::move(vec));
6350          }
6351       }
6352    }
6353
        /* The MIMG store has no result (placeholder destination Temp(0, v1)) and
         * takes an undefined s4 operand where emit_mimg() expects its third
         * operand.
         */
6354    MIMG_instruction* store =
6355       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, false, Operand(data));
6356    store->glc = glc;
6357    store->dlc = false;
        /* a16 follows the bit size of src[1], d16 the bit size of the data. */
6358    store->a16 = instr->src[1].ssa->bit_size == 16;
6359    store->d16 = d16;
6360    store->dmask = dmask;
6361    store->unrm = true;
6362    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6363    store->dim = sdim;
6364    store->da = should_declare_array(sdim);
6365    store->disable_wqm = true;
6366    store->sync = sync;
6367    ctx->program->needs_exact = true;
6368    return;
6369 }
6370
6371 void
6372 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6373                                  aco_opcode* image_op)
6374 {
        /* Maps a NIR atomic op to three hardware opcodes, written through the
         * out-pointers: the 32-bit buffer atomic (buf_op), the 64-bit buffer
         * atomic (buf_op64) and the image atomic (image_op).
         *
         * aco_opcode::num_opcodes is stored as a placeholder where this table has
         * no opcode (the 64-bit buffer and image variants of fadd); callers must
         * not emit it. Any op that is not listed hits unreachable().
         */
6375    switch (op) {
6376    case nir_atomic_op_iadd:
6377       *buf_op = aco_opcode::buffer_atomic_add;
6378       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6379       *image_op = aco_opcode::image_atomic_add;
6380       break;
6381    case nir_atomic_op_umin:
6382       *buf_op = aco_opcode::buffer_atomic_umin;
6383       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6384       *image_op = aco_opcode::image_atomic_umin;
6385       break;
6386    case nir_atomic_op_imin:
6387       *buf_op = aco_opcode::buffer_atomic_smin;
6388       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6389       *image_op = aco_opcode::image_atomic_smin;
6390       break;
6391    case nir_atomic_op_umax:
6392       *buf_op = aco_opcode::buffer_atomic_umax;
6393       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6394       *image_op = aco_opcode::image_atomic_umax;
6395       break;
6396    case nir_atomic_op_imax:
6397       *buf_op = aco_opcode::buffer_atomic_smax;
6398       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6399       *image_op = aco_opcode::image_atomic_smax;
6400       break;
6401    case nir_atomic_op_iand:
6402       *buf_op = aco_opcode::buffer_atomic_and;
6403       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6404       *image_op = aco_opcode::image_atomic_and;
6405       break;
6406    case nir_atomic_op_ior:
6407       *buf_op = aco_opcode::buffer_atomic_or;
6408       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6409       *image_op = aco_opcode::image_atomic_or;
6410       break;
6411    case nir_atomic_op_ixor:
6412       *buf_op = aco_opcode::buffer_atomic_xor;
6413       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6414       *image_op = aco_opcode::image_atomic_xor;
6415       break;
6416    case nir_atomic_op_xchg:
6417       *buf_op = aco_opcode::buffer_atomic_swap;
6418       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6419       *image_op = aco_opcode::image_atomic_swap;
6420       break;
6421    case nir_atomic_op_cmpxchg:
6422       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6423       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6424       *image_op = aco_opcode::image_atomic_cmpswap;
6425       break;
6426    case nir_atomic_op_inc_wrap:
6427       *buf_op = aco_opcode::buffer_atomic_inc;
6428       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6429       *image_op = aco_opcode::image_atomic_inc;
6430       break;
6431    case nir_atomic_op_dec_wrap:
6432       *buf_op = aco_opcode::buffer_atomic_dec;
6433       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6434       *image_op = aco_opcode::image_atomic_dec;
6435       break;
        /* fadd only has a 32-bit buffer opcode in this table. */
6436    case nir_atomic_op_fadd:
6437       *buf_op = aco_opcode::buffer_atomic_add_f32;
6438       *buf_op64 = aco_opcode::num_opcodes;
6439       *image_op = aco_opcode::num_opcodes;
6440       break;
6441    case nir_atomic_op_fmin:
6442       *buf_op = aco_opcode::buffer_atomic_fmin;
6443       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6444       *image_op = aco_opcode::image_atomic_fmin;
6445       break;
6446    case nir_atomic_op_fmax:
6447       *buf_op = aco_opcode::buffer_atomic_fmax;
6448       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6449       *image_op = aco_opcode::image_atomic_fmax;
6450       break;
6451    default: unreachable("unsupported atomic operation");
6452    }
6453 }
6454
6455 void
6456 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6457 {
6458    bool return_previous = !nir_def_is_unused(&instr->def);
6459    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6460    bool is_array = nir_intrinsic_image_array(instr);
6461    Builder bld(ctx->program, ctx->block);
6462
6463    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6464    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6465
6466    aco_opcode buf_op, buf_op64, image_op;
6467    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6468
6469    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6470    bool is_64bit = data.bytes() == 8;
6471    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6472
6473    if (cmpswap)
6474       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6475                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6476
6477    Temp dst = get_ssa_temp(ctx, &instr->def);
6478    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6479
6480    if (dim == GLSL_SAMPLER_DIM_BUF) {
6481       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6482       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6483       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6484       // implemented.");
6485       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6486          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6487       mubuf->operands[0] = Operand(resource);
6488       mubuf->operands[1] = Operand(vindex);
6489       mubuf->operands[2] = Operand::c32(0);
6490       mubuf->operands[3] = Operand(data);
6491       Definition def =
6492          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6493       if (return_previous)
6494          mubuf->definitions[0] = def;
6495       mubuf->offset = 0;
6496       mubuf->idxen = true;
6497       mubuf->glc = return_previous;
6498       mubuf->dlc = false; /* Not needed for atomics */
6499       mubuf->disable_wqm = true;
6500       mubuf->sync = sync;
6501       ctx->program->needs_exact = true;
6502       ctx->block->instructions.emplace_back(std::move(mubuf));
6503       if (return_previous && cmpswap)
6504          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6505       return;
6506    }
6507
6508    std::vector<Temp> coords = get_image_coords(ctx, instr);
6509    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6510    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6511    MIMG_instruction* mimg =
6512       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, false, Operand(data));
6513    mimg->glc = return_previous;
6514    mimg->dlc = false; /* Not needed for atomics */
6515    mimg->dmask = (1 << data.size()) - 1;
6516    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6517    mimg->unrm = true;
6518    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6519    mimg->dim = sdim;
6520    mimg->da = should_declare_array(sdim);
6521    mimg->disable_wqm = true;
6522    mimg->sync = sync;
6523    ctx->program->needs_exact = true;
6524    if (return_previous && cmpswap)
6525       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6526    return;
6527 }
6528
6529 void
6530 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6531 {
        /* Emits an SSBO load by forwarding to load_buffer(). src[0] is the buffer
         * resource (made uniform) and src[1] the offset passed to load_buffer().
         */
6532    Builder bld(ctx->program, ctx->block);
6533    unsigned num_components = instr->num_components;
6534
6535    Temp dst = get_ssa_temp(ctx, &instr->def);
6536    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6537
6538    unsigned access = nir_intrinsic_access(instr);
6539    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
        /* Component size in bytes. */
6540    unsigned size = instr->def.bit_size / 8;
6541
        /* SMEM is only offered to load_buffer() for loads marked as reorderable. */
6542    bool allow_smem = access & ACCESS_CAN_REORDER;
6543
6544    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6545                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6546                get_memory_sync_info(instr, storage_buffer, 0));
6547 }
6548
6549 void
6550 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6551 {
        /* Emits an SSBO store as one MUBUF store per piece produced by
         * split_buffer_store(). src[0] is the data, src[1] the buffer resource
         * (made uniform) and src[2] the offset.
         */
6552    Builder bld(ctx->program, ctx->block);
6553    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6554    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
        /* Widen the per-component write mask by the element size in bytes. */
6555    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6556    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6557
6558    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6559
6560    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
        /* glc only for volatile/coherent access, and never on GFX11+. */
6561    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6562               ctx->program->gfx_level < GFX11;
6563
6564    unsigned write_count = 0;
6565    Temp write_datas[32];
6566    unsigned offsets[32];
6567    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6568                       write_datas, offsets);
6569
6570    /* GFX6-7 are affected by a hw bug that prevents address clamping to work
6571     * correctly when the SGPR offset is used.
6572     */
6573    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6574       offset = as_vgpr(ctx, offset);
6575
6576    for (unsigned i = 0; i < write_count; i++) {
6577       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6578
6579       aco_ptr<MUBUF_instruction> store{
6580          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6581       store->operands[0] = Operand(rsrc);
           /* The offset goes into operand 1 when it is a VGPR (offen) and into
            * operand 2 when it is an SGPR; the unused slot gets an undefined v1
            * or a constant zero.
            */
6582       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6583       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6584       store->operands[3] = Operand(write_datas[i]);
6585       store->offset = offsets[i];
6586       store->offen = (offset.type() == RegType::vgpr);
6587       store->glc = glc;
6588       store->dlc = false;
6589       store->disable_wqm = true;
6590       store->sync = sync;
6591       ctx->program->needs_exact = true;
6592       ctx->block->instructions.emplace_back(std::move(store));
6593    }
6594 }
6595
6596 void
6597 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6598 {
6599    Builder bld(ctx->program, ctx->block);
6600    bool return_previous = !nir_def_is_unused(&instr->def);
6601    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6602
6603    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6604    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6605
6606    aco_opcode op32, op64, image_op;
6607    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6608
6609    if (cmpswap)
6610       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6611                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6612
6613    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6614    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6615    Temp dst = get_ssa_temp(ctx, &instr->def);
6616
6617    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6618    aco_ptr<MUBUF_instruction> mubuf{
6619       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6620    mubuf->operands[0] = Operand(rsrc);
6621    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6622    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6623    mubuf->operands[3] = Operand(data);
6624    Definition def =
6625       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6626    if (return_previous)
6627       mubuf->definitions[0] = def;
6628    mubuf->offset = 0;
6629    mubuf->offen = (offset.type() == RegType::vgpr);
6630    mubuf->glc = return_previous;
6631    mubuf->dlc = false; /* Not needed for atomics */
6632    mubuf->disable_wqm = true;
6633    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6634    ctx->program->needs_exact = true;
6635    ctx->block->instructions.emplace_back(std::move(mubuf));
6636    if (return_previous && cmpswap)
6637       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6638 }
6639
6640 void
6641 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6642              Temp* offset)
6643 {
        /* Splits the addressing of a global memory intrinsic into its parts:
         *  - *address: src[1] for nir_intrinsic_store_global_amd, src[0] for
         *    every other intrinsic.
         *  - *const_offset: the intrinsic's base index.
         *  - *offset: the last source, or an empty Temp() when that source is a
         *    constant zero.
         */
6644    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6645    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6646
6647    *const_offset = nir_intrinsic_base(intrin);
6648
6649    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6650    nir_src offset_src = intrin->src[num_src - 1];
6651    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6652       *offset = get_ssa_temp(ctx, offset_src.ssa);
6653    else
6654       *offset = Temp();
6655 }
6656
6657 void
6658 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6659 {
        /* Emits a global memory load through emit_load(), choosing between
         * global_load_params and smem_load_params. The address parts come from
         * parse_global().
         */
6660    Builder bld(ctx->program, ctx->block);
6661    unsigned num_components = instr->num_components;
6662    unsigned component_size = instr->def.bit_size / 8;
6663
6664    Temp addr, offset;
6665    uint32_t const_offset;
6666    parse_global(ctx, instr, &addr, &const_offset, &offset);
6667
6668    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6669                         component_size};
        /* With a variable offset, the address is passed as info.resource and
         * the variable part replaces info.offset.
         */
6670    if (offset.id()) {
6671       info.resource = addr;
6672       info.offset = Operand(offset);
6673    }
6674    info.const_offset = const_offset;
6675    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6676    info.align_mul = nir_intrinsic_align_mul(instr);
6677    info.align_offset = nir_intrinsic_align_offset(instr);
6678    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6679
6680    /* Don't expand global loads when they use MUBUF or SMEM.
6681     * Global loads don't have the bounds checking that buffer loads have that
6682     * makes this safe.
6683     */
6684    unsigned align = nir_intrinsic_align(instr);
6685    bool byte_align_for_smem_mubuf =
6686       can_use_byte_align_for_global_load(num_components, component_size, align, false);
6687
6688    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6689     * it's safe to use SMEM */
6690    bool can_use_smem =
6691       (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
        /* The global_load_params path is taken for a VGPR destination, for glc
         * loads before GFX8, or when SMEM was ruled out above.
         */
6692    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6693        !can_use_smem) {
6694       EmitLoadParameters params = global_load_params;
6695       params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6696       emit_load(ctx, bld, info, params);
6697    } else {
           /* SMEM path: resource and offset are made uniform first. */
6698       if (info.resource.id())
6699          info.resource = bld.as_uniform(info.resource);
6700       info.offset = Operand(bld.as_uniform(info.offset));
6701       emit_load(ctx, bld, info, smem_load_params);
6702    }
6703 }
6704
6705 void
6706 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6707 {
        /* Emits a global memory store. The data (src[0]) is split into pieces by
         * split_buffer_store(); each piece is stored with a GLOBAL instruction
         * on GFX9+, a FLAT instruction on GFX7-8, or a MUBUF instruction on GFX6.
         */
6708    Builder bld(ctx->program, ctx->block);
6709    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6710    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6711
6712    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6713    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
        /* glc only for volatile/coherent access, and never on GFX11+. */
6714    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6715               ctx->program->gfx_level < GFX11;
6716
6717    unsigned write_count = 0;
6718    Temp write_datas[32];
6719    unsigned offsets[32];
6720    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6721                       write_datas, offsets);
6722
6723    Temp addr, offset;
6724    uint32_t const_offset;
6725    parse_global(ctx, instr, &addr, &const_offset, &offset);
6726
6727    for (unsigned i = 0; i < write_count; i++) {
           /* Each piece starts from the parsed address parts and lets
            * lower_global_address() fold in the piece's byte offset.
            */
6728       Temp write_address = addr;
6729       uint32_t write_const_offset = const_offset;
6730       Temp write_offset = offset;
6731       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6732
6733       if (ctx->options->gfx_level >= GFX7) {
6734          bool global = ctx->options->gfx_level >= GFX9;
6735          aco_opcode op;
6736          switch (write_datas[i].bytes()) {
6737          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6738          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6739          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6740          case 8:
6741             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6742             break;
6743          case 12:
6744             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6745             break;
6746          case 16:
6747             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6748             break;
6749          default: unreachable("store_global not implemented for this size.");
6750          }
6751
6752          aco_ptr<FLAT_instruction> flat{
6753             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
              /* An s2 address is paired with a VGPR offset in operand 0;
               * otherwise the VGPR address goes in operand 0 and operand 1 is
               * an undefined s1.
               */
6754          if (write_address.regClass() == s2) {
6755             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6756             flat->operands[0] = Operand(write_offset);
6757             flat->operands[1] = Operand(write_address);
6758          } else {
6759             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6760             flat->operands[0] = Operand(write_address);
6761             flat->operands[1] = Operand(s1);
6762          }
6763          flat->operands[2] = Operand(write_datas[i]);
6764          flat->glc = glc;
6765          flat->dlc = false;
              /* A non-zero constant offset is only expected with GLOBAL. */
6766          assert(global || !write_const_offset);
6767          flat->offset = write_const_offset;
6768          flat->disable_wqm = true;
6769          flat->sync = sync;
6770          ctx->program->needs_exact = true;
6771          ctx->block->instructions.emplace_back(std::move(flat));
6772       } else {
6773          assert(ctx->options->gfx_level == GFX6);
6774
6775          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6776
6777          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6778
6779          aco_ptr<MUBUF_instruction> mubuf{
6780             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6781          mubuf->operands[0] = Operand(rsrc);
              /* A VGPR address is passed in operand 1 with addr64 set. */
6782          mubuf->operands[1] =
6783             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6784          mubuf->operands[2] = Operand(write_offset);
6785          mubuf->operands[3] = Operand(write_datas[i]);
6786          mubuf->glc = glc;
6787          mubuf->dlc = false;
6788          mubuf->offset = write_const_offset;
6789          mubuf->addr64 = write_address.type() == RegType::vgpr;
6790          mubuf->disable_wqm = true;
6791          mubuf->sync = sync;
6792          ctx->program->needs_exact = true;
6793          ctx->block->instructions.emplace_back(std::move(mubuf));
6794       }
6795    }
6796 }
6797
6798 void
6799 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6800 {
6801    Builder bld(ctx->program, ctx->block);
6802    bool return_previous = !nir_def_is_unused(&instr->def);
6803    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6804
6805    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6806    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6807
6808    if (cmpswap)
6809       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6810                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6811
6812    Temp dst = get_ssa_temp(ctx, &instr->def);
6813
6814    aco_opcode op32, op64;
6815
6816    Temp addr, offset;
6817    uint32_t const_offset;
6818    parse_global(ctx, instr, &addr, &const_offset, &offset);
6819    lower_global_address(bld, 0, &addr, &const_offset, &offset);
6820
6821    if (ctx->options->gfx_level >= GFX7) {
6822       bool global = ctx->options->gfx_level >= GFX9;
6823       switch (nir_op) {
6824       case nir_atomic_op_iadd:
6825          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6826          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6827          break;
6828       case nir_atomic_op_imin:
6829          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6830          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6831          break;
6832       case nir_atomic_op_umin:
6833          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6834          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6835          break;
6836       case nir_atomic_op_imax:
6837          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6838          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6839          break;
6840       case nir_atomic_op_umax:
6841          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6842          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6843          break;
6844       case nir_atomic_op_iand:
6845          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6846          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6847          break;
6848       case nir_atomic_op_ior:
6849          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6850          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6851          break;
6852       case nir_atomic_op_ixor:
6853          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6854          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6855          break;
6856       case nir_atomic_op_xchg:
6857          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6858          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6859          break;
6860       case nir_atomic_op_cmpxchg:
6861          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6862          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6863          break;
6864       case nir_atomic_op_fadd:
6865          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
6866          op64 = aco_opcode::num_opcodes;
6867          break;
6868       case nir_atomic_op_fmin:
6869          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6870          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6871          break;
6872       case nir_atomic_op_fmax:
6873          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6874          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6875          break;
6876       default: unreachable("unsupported atomic operation");
6877       }
6878
6879       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6880       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6881          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6882       if (addr.regClass() == s2) {
6883          assert(global && offset.id() && offset.type() == RegType::vgpr);
6884          flat->operands[0] = Operand(offset);
6885          flat->operands[1] = Operand(addr);
6886       } else {
6887          assert(addr.type() == RegType::vgpr && !offset.id());
6888          flat->operands[0] = Operand(addr);
6889          flat->operands[1] = Operand(s1);
6890       }
6891       flat->operands[2] = Operand(data);
6892       if (return_previous)
6893          flat->definitions[0] = Definition(dst);
6894       flat->glc = return_previous;
6895       flat->dlc = false; /* Not needed for atomics */
6896       assert(global || !const_offset);
6897       flat->offset = const_offset;
6898       flat->disable_wqm = true;
6899       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6900       ctx->program->needs_exact = true;
6901       ctx->block->instructions.emplace_back(std::move(flat));
6902    } else {
6903       assert(ctx->options->gfx_level == GFX6);
6904
6905       UNUSED aco_opcode image_op;
6906       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6907
6908       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6909
6910       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6911
6912       aco_ptr<MUBUF_instruction> mubuf{
6913          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6914       mubuf->operands[0] = Operand(rsrc);
6915       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6916       mubuf->operands[2] = Operand(offset);
6917       mubuf->operands[3] = Operand(data);
6918       Definition def =
6919          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6920       if (return_previous)
6921          mubuf->definitions[0] = def;
6922       mubuf->glc = return_previous;
6923       mubuf->dlc = false;
6924       mubuf->offset = const_offset;
6925       mubuf->addr64 = addr.type() == RegType::vgpr;
6926       mubuf->disable_wqm = true;
6927       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6928       ctx->program->needs_exact = true;
6929       ctx->block->instructions.emplace_back(std::move(mubuf));
6930       if (return_previous && cmpswap)
6931          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6932    }
6933 }
6934
6935 unsigned
6936 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
6937 {
        /* Translates a bitmask of nir_var_* memory modes into the matching
         * bitmask of storage_* flags. Modes not tested below contribute nothing,
         * so the result can be storage_none.
         */
6938    unsigned storage = storage_none;
6939
6940    if (mem_mode & nir_var_shader_out)
6941       storage |= storage_vmem_output;
        /* SSBO and global memory both map to storage_buffer. */
6942    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
6943       storage |= storage_buffer;
6944    if (mem_mode & nir_var_mem_task_payload)
6945       storage |= storage_task_payload;
6946    if (mem_mode & nir_var_mem_shared)
6947       storage |= storage_shared;
6948    if (mem_mode & nir_var_image)
6949       storage |= storage_image;
6950
6951    return storage;
6952 }
6953
6954 void
6955 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6956 {
6957    Builder bld(ctx->program, ctx->block);
6958
6959    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6960    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6961    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6962                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
6963    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
6964    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
6965
6966    Temp dst = get_ssa_temp(ctx, &intrin->def);
6967    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
6968    Temp v_offset =
6969       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
6970    Temp s_offset =
6971       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
6972    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
6973
6974    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
6975    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
6976
6977    unsigned const_offset = nir_intrinsic_base(intrin);
6978    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
6979    unsigned num_components = intrin->def.num_components;
6980
6981    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6982    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
6983
6984    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
6985    info.idx = idx;
6986    info.glc = glc;
6987    info.slc = slc;
6988    info.soffset = s_offset;
6989    info.const_offset = const_offset;
6990    info.sync = sync;
6991
6992    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
6993       const pipe_format format = nir_intrinsic_format(intrin);
6994       const struct ac_vtx_format_info* vtx_info =
6995          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
6996       const struct util_format_description* f = util_format_description(format);
6997       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
6998       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
6999
7000       /* Avoid splitting:
7001        * - non-array formats because that would result in incorrect code
7002        * - when element size is same as component size (to reduce instruction count)
7003        */
7004       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7005
7006       info.align_mul = align_mul;
7007       info.align_offset = align_offset;
7008       info.format = format;
7009       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7010       info.split_by_component_stride = false;
7011
7012       emit_load(ctx, bld, info, mtbuf_load_params);
7013    } else {
7014       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7015
7016       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7017          assert(!swizzled);
7018
7019          emit_load(ctx, bld, info, mubuf_load_format_params);
7020       } else {
7021          const unsigned swizzle_element_size =
7022             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7023
7024          info.component_stride = swizzle_element_size;
7025          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7026          info.align_mul = MIN2(elem_size_bytes, 4);
7027          info.align_offset = 0;
7028
7029          emit_load(ctx, bld, info, mubuf_load_params);
7030       }
7031    }
7032 }
7033
7034 void
7035 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7036 {
7037    Builder bld(ctx->program, ctx->block);
7038
7039    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7040    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7041    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7042                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7043    bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7044    bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
7045
7046    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7047    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7048    Temp v_offset =
7049       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
7050    Temp s_offset =
7051       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7052    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7053
7054    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7055    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7056
7057    unsigned const_offset = nir_intrinsic_base(intrin);
7058    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7059    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7060
7061    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7062    /* GS outputs are only written once. */
7063    const bool written_once =
7064       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7065    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7066                          written_once ? semantic_can_reorder : semantic_none);
7067
7068    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset,
7069                     elem_size_bytes, write_mask, swizzled, sync, glc, slc);
7070 }
7071
7072 void
7073 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7074 {
7075    Builder bld(ctx->program, ctx->block);
7076    Temp dst = get_ssa_temp(ctx, &instr->def);
7077    Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7078    Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7079
7080    /* If base address is 32bit, convert to 64bit with the high 32bit part. */
7081    if (base.bytes() == 4) {
7082       base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7083                         Operand::c32(ctx->options->address32_hi));
7084    }
7085
7086    aco_opcode opcode = aco_opcode::s_load_dword;
7087    unsigned size = 1;
7088
7089    assert(dst.bytes() <= 64);
7090
7091    if (dst.bytes() > 32) {
7092       opcode = aco_opcode::s_load_dwordx16;
7093       size = 16;
7094    } else if (dst.bytes() > 16) {
7095       opcode = aco_opcode::s_load_dwordx8;
7096       size = 8;
7097    } else if (dst.bytes() > 8) {
7098       opcode = aco_opcode::s_load_dwordx4;
7099       size = 4;
7100    } else if (dst.bytes() > 4) {
7101       opcode = aco_opcode::s_load_dwordx2;
7102       size = 2;
7103    }
7104
7105    if (dst.size() != size) {
7106       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7107                  bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7108    } else {
7109       bld.smem(opcode, Definition(dst), base, offset);
7110    }
7111    emit_split_vector(ctx, dst, instr->def.num_components);
7112 }
7113
7114 sync_scope
7115 translate_nir_scope(mesa_scope scope)
7116 {
7117    switch (scope) {
7118    case SCOPE_NONE:
7119    case SCOPE_INVOCATION: return scope_invocation;
7120    case SCOPE_SUBGROUP: return scope_subgroup;
7121    case SCOPE_WORKGROUP: return scope_workgroup;
7122    case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7123    case SCOPE_DEVICE: return scope_device;
7124    case SCOPE_SHADER_CALL: return scope_invocation;
7125    }
7126    unreachable("invalid scope");
7127 }
7128
7129 void
7130 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7131 {
7132    Builder bld(ctx->program, ctx->block);
7133
7134    unsigned storage_allowed = storage_buffer | storage_image;
7135    unsigned semantics = 0;
7136    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7137    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7138
7139    /* We use shared storage for the following:
7140     * - compute shaders expose it in their API
7141     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7142     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7143     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7144     */
7145    bool shared_storage_used =
7146       ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7147       ctx->stage.hw == AC_HW_HULL_SHADER ||
7148       (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7149       ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7150
7151    if (shared_storage_used)
7152       storage_allowed |= storage_shared;
7153
7154    /* Task payload: Task Shader output, Mesh Shader input */
7155    if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7156       storage_allowed |= storage_task_payload;
7157
7158    /* Allow VMEM output for all stages that can have outputs. */
7159    if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7160        ctx->stage.has(SWStage::TS))
7161       storage_allowed |= storage_vmem_output;
7162
7163    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7164     * They are allowed in CS, TCS, and in any NGG shader.
7165     */
7166    ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7167                                            ctx->stage.hw == AC_HW_HULL_SHADER ||
7168                                            ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7169
7170    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7171    unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7172    storage &= storage_allowed;
7173
7174    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7175    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7176       semantics |= semantic_acquire | semantic_release;
7177    if (nir_semantics & NIR_MEMORY_RELEASE)
7178       semantics |= semantic_acquire | semantic_release;
7179
7180    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7181    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7182
7183    bld.barrier(aco_opcode::p_barrier,
7184                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7185                exec_scope);
7186 }
7187
7188 void
7189 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7190 {
7191    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7192    Temp dst = get_ssa_temp(ctx, &instr->def);
7193    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7194    Builder bld(ctx->program, ctx->block);
7195
7196    unsigned elem_size_bytes = instr->def.bit_size / 8;
7197    unsigned num_components = instr->def.num_components;
7198    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7199    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7200 }
7201
7202 void
7203 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7204 {
7205    unsigned writemask = nir_intrinsic_write_mask(instr);
7206    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7207    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7208    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7209
7210    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7211    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7212 }
7213
7214 void
7215 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7216 {
7217    unsigned offset = nir_intrinsic_base(instr);
7218    Builder bld(ctx->program, ctx->block);
7219    Operand m = load_lds_size_m0(bld);
7220    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7221    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7222
7223    unsigned num_operands = 3;
7224    aco_opcode op32, op64, op32_rtn, op64_rtn;
7225    switch (nir_intrinsic_atomic_op(instr)) {
7226    case nir_atomic_op_iadd:
7227       op32 = aco_opcode::ds_add_u32;
7228       op64 = aco_opcode::ds_add_u64;
7229       op32_rtn = aco_opcode::ds_add_rtn_u32;
7230       op64_rtn = aco_opcode::ds_add_rtn_u64;
7231       break;
7232    case nir_atomic_op_imin:
7233       op32 = aco_opcode::ds_min_i32;
7234       op64 = aco_opcode::ds_min_i64;
7235       op32_rtn = aco_opcode::ds_min_rtn_i32;
7236       op64_rtn = aco_opcode::ds_min_rtn_i64;
7237       break;
7238    case nir_atomic_op_umin:
7239       op32 = aco_opcode::ds_min_u32;
7240       op64 = aco_opcode::ds_min_u64;
7241       op32_rtn = aco_opcode::ds_min_rtn_u32;
7242       op64_rtn = aco_opcode::ds_min_rtn_u64;
7243       break;
7244    case nir_atomic_op_imax:
7245       op32 = aco_opcode::ds_max_i32;
7246       op64 = aco_opcode::ds_max_i64;
7247       op32_rtn = aco_opcode::ds_max_rtn_i32;
7248       op64_rtn = aco_opcode::ds_max_rtn_i64;
7249       break;
7250    case nir_atomic_op_umax:
7251       op32 = aco_opcode::ds_max_u32;
7252       op64 = aco_opcode::ds_max_u64;
7253       op32_rtn = aco_opcode::ds_max_rtn_u32;
7254       op64_rtn = aco_opcode::ds_max_rtn_u64;
7255       break;
7256    case nir_atomic_op_iand:
7257       op32 = aco_opcode::ds_and_b32;
7258       op64 = aco_opcode::ds_and_b64;
7259       op32_rtn = aco_opcode::ds_and_rtn_b32;
7260       op64_rtn = aco_opcode::ds_and_rtn_b64;
7261       break;
7262    case nir_atomic_op_ior:
7263       op32 = aco_opcode::ds_or_b32;
7264       op64 = aco_opcode::ds_or_b64;
7265       op32_rtn = aco_opcode::ds_or_rtn_b32;
7266       op64_rtn = aco_opcode::ds_or_rtn_b64;
7267       break;
7268    case nir_atomic_op_ixor:
7269       op32 = aco_opcode::ds_xor_b32;
7270       op64 = aco_opcode::ds_xor_b64;
7271       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7272       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7273       break;
7274    case nir_atomic_op_xchg:
7275       op32 = aco_opcode::ds_write_b32;
7276       op64 = aco_opcode::ds_write_b64;
7277       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7278       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7279       break;
7280    case nir_atomic_op_cmpxchg:
7281       op32 = aco_opcode::ds_cmpst_b32;
7282       op64 = aco_opcode::ds_cmpst_b64;
7283       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7284       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7285       num_operands = 4;
7286       break;
7287    case nir_atomic_op_fadd:
7288       op32 = aco_opcode::ds_add_f32;
7289       op32_rtn = aco_opcode::ds_add_rtn_f32;
7290       op64 = aco_opcode::num_opcodes;
7291       op64_rtn = aco_opcode::num_opcodes;
7292       break;
7293    case nir_atomic_op_fmin:
7294       op32 = aco_opcode::ds_min_f32;
7295       op32_rtn = aco_opcode::ds_min_rtn_f32;
7296       op64 = aco_opcode::ds_min_f64;
7297       op64_rtn = aco_opcode::ds_min_rtn_f64;
7298       break;
7299    case nir_atomic_op_fmax:
7300       op32 = aco_opcode::ds_max_f32;
7301       op32_rtn = aco_opcode::ds_max_rtn_f32;
7302       op64 = aco_opcode::ds_max_f64;
7303       op64_rtn = aco_opcode::ds_max_rtn_f64;
7304       break;
7305    default: unreachable("Unhandled shared atomic intrinsic");
7306    }
7307
7308    bool return_previous = !nir_def_is_unused(&instr->def);
7309
7310    aco_opcode op;
7311    if (data.size() == 1) {
7312       assert(instr->def.bit_size == 32);
7313       op = return_previous ? op32_rtn : op32;
7314    } else {
7315       assert(instr->def.bit_size == 64);
7316       op = return_previous ? op64_rtn : op64;
7317    }
7318
7319    if (offset > 65535) {
7320       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7321       offset = 0;
7322    }
7323
7324    aco_ptr<DS_instruction> ds;
7325    ds.reset(
7326       create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7327    ds->operands[0] = Operand(address);
7328    ds->operands[1] = Operand(data);
7329    if (num_operands == 4) {
7330       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7331       ds->operands[2] = Operand(data2);
7332       if (bld.program->gfx_level >= GFX11)
7333          std::swap(ds->operands[1], ds->operands[2]);
7334    }
7335    ds->operands[num_operands - 1] = m;
7336    ds->offset0 = offset;
7337    if (return_previous)
7338       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7339    ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7340
7341    if (m.isUndefined())
7342       ds->operands.pop_back();
7343
7344    ctx->block->instructions.emplace_back(std::move(ds));
7345 }
7346
7347 void
7348 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7349 {
7350    bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7351    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7352    Builder bld(ctx->program, ctx->block);
7353
7354    assert(bld.program->gfx_level >= GFX7);
7355
7356    bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7357    uint8_t offset0 = nir_intrinsic_offset0(instr);
7358    uint8_t offset1 = nir_intrinsic_offset1(instr);
7359    bool st64 = nir_intrinsic_st64(instr);
7360
7361    Operand m = load_lds_size_m0(bld);
7362    Instruction* ds;
7363    if (is_store) {
7364       aco_opcode op = st64
7365                          ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7366                          : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7367       Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7368       RegClass comp_rc = is64bit ? v2 : v1;
7369       Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7370       Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7371       ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7372    } else {
7373       Temp dst = get_ssa_temp(ctx, &instr->def);
7374       Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7375       aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7376                            : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7377       ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7378    }
7379    ds->ds().sync = memory_sync_info(storage_shared);
7380    if (m.isUndefined())
7381       ds->operands.pop_back();
7382
7383    if (!is_store) {
7384       Temp dst = get_ssa_temp(ctx, &instr->def);
7385       if (dst.type() == RegType::sgpr) {
7386          emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7387          Temp comp[4];
7388          /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7389          for (unsigned i = 0; i < dst.size(); i++)
7390             comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7391          if (is64bit) {
7392             Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7393             Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7394             ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7395             ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7396             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7397             ctx->allocated_vec[dst.id()] = {comp0, comp1};
7398          } else {
7399             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7400          }
7401       }
7402
7403       emit_split_vector(ctx, dst, 2);
7404    }
7405 }
7406
7407 Temp
7408 get_scratch_resource(isel_context* ctx)
7409 {
7410    Builder bld(ctx->program, ctx->block);
7411    Temp scratch_addr = ctx->program->private_segment_buffer;
7412    if (!scratch_addr.bytes()) {
7413       Temp addr_lo =
7414          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7415       Temp addr_hi =
7416          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7417       scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7418    } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7419       scratch_addr =
7420          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7421    }
7422
7423    uint32_t rsrc_conf =
7424       S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7425
7426    if (ctx->program->gfx_level >= GFX10) {
7427       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7428                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
7429                    S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
7430    } else if (ctx->program->gfx_level <=
7431               GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7432       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7433                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7434    }
7435
7436    /* older generations need element size = 4 bytes. element size removed in GFX9 */
7437    if (ctx->program->gfx_level <= GFX8)
7438       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7439
7440    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7441                      Operand::c32(rsrc_conf));
7442 }
7443
7444 void
7445 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7446 {
7447    Builder bld(ctx->program, ctx->block);
7448    Temp dst = get_ssa_temp(ctx, &instr->def);
7449
7450    LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7451    info.align_mul = nir_intrinsic_align_mul(instr);
7452    info.align_offset = nir_intrinsic_align_offset(instr);
7453    info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7454    info.sync = memory_sync_info(storage_scratch, semantic_private);
7455    if (ctx->program->gfx_level >= GFX9) {
7456       if (nir_src_is_const(instr->src[0])) {
7457          uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7458          info.offset =
7459             bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7460          info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7461       } else {
7462          info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7463       }
7464       EmitLoadParameters params = scratch_flat_load_params;
7465       params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7466       emit_load(ctx, bld, info, params);
7467    } else {
7468       info.resource = get_scratch_resource(ctx);
7469       info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7470       info.soffset = ctx->program->scratch_offset;
7471       emit_load(ctx, bld, info, scratch_mubuf_load_params);
7472    }
7473 }
7474
7475 void
7476 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7477 {
7478    Builder bld(ctx->program, ctx->block);
7479    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7480    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7481
7482    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7483    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7484
7485    unsigned write_count = 0;
7486    Temp write_datas[32];
7487    unsigned offsets[32];
7488    unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7489    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7490                       &write_count, write_datas, offsets);
7491
7492    if (ctx->program->gfx_level >= GFX9) {
7493       uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7494       offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7495       uint32_t base_const_offset =
7496          nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7497
7498       for (unsigned i = 0; i < write_count; i++) {
7499          aco_opcode op;
7500          switch (write_datas[i].bytes()) {
7501          case 1: op = aco_opcode::scratch_store_byte; break;
7502          case 2: op = aco_opcode::scratch_store_short; break;
7503          case 4: op = aco_opcode::scratch_store_dword; break;
7504          case 8: op = aco_opcode::scratch_store_dwordx2; break;
7505          case 12: op = aco_opcode::scratch_store_dwordx3; break;
7506          case 16: op = aco_opcode::scratch_store_dwordx4; break;
7507          default: unreachable("Unexpected store size");
7508          }
7509
7510          uint32_t const_offset = base_const_offset + offsets[i];
7511          assert(const_offset < max || offset.id() == 0);
7512
7513          Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7514          Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7515          if (offset.id() == 0)
7516             saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7517
7518          bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7519                      memory_sync_info(storage_scratch, semantic_private));
7520       }
7521    } else {
7522       Temp rsrc = get_scratch_resource(ctx);
7523       offset = as_vgpr(ctx, offset);
7524       for (unsigned i = 0; i < write_count; i++) {
7525          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7526          Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7527                                         write_datas[i], offsets[i], true, true);
7528          mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7529       }
7530    }
7531 }
7532
7533 Temp
7534 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7535 {
7536    Builder bld(ctx->program, ctx->block);
7537
7538    if (cluster_size == 1) {
7539       return src;
7540    }
7541    if (op == nir_op_iand && cluster_size == 4) {
7542       /* subgroupClusteredAnd(val, 4) -> ~wqm(~val & exec) */
7543       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7544       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
7545       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7546                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7547    } else if (op == nir_op_ior && cluster_size == 4) {
7548       /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7549       return bld.sop1(
7550          Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7551          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7552    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7553       /* subgroupAnd(val) -> (~val & exec) == 0 */
7554       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7555       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
7556                .def(1)
7557                .getTemp();
7558       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7559       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7560    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7561       /* subgroupOr(val) -> (val & exec) != 0 */
7562       Temp tmp =
7563          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7564             .def(1)
7565             .getTemp();
7566       return bool_to_vector_condition(ctx, tmp);
7567    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7568       /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7569       Temp tmp =
7570          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7571       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7572       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7573                .def(1)
7574                .getTemp();
7575       return bool_to_vector_condition(ctx, tmp);
7576    } else {
7577       /* subgroupClustered{And,Or,Xor}(val, n):
7578        *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7579        *   cluster_offset = ~(n - 1) & lane_id cluster_mask = ((1 << n) - 1)
7580        * subgroupClusteredAnd():
7581        *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7582        * subgroupClusteredOr():
7583        *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7584        * subgroupClusteredXor():
7585        *   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7586        */
7587       Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7588       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7589                                      Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7590
7591       Temp tmp;
7592       if (op == nir_op_iand)
7593          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7594                         Operand(exec, bld.lm));
7595       else
7596          tmp =
7597             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7598
7599       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7600
7601       if (ctx->program->gfx_level <= GFX7)
7602          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7603       else if (ctx->program->wave_size == 64)
7604          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7605       else
7606          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7607       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7608       if (cluster_mask != 0xffffffff)
7609          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7610
7611       if (op == nir_op_iand) {
7612          return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::c32(cluster_mask),
7613                          tmp);
7614       } else if (op == nir_op_ior) {
7615          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
7616       } else if (op == nir_op_ixor) {
7617          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7618                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7619          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
7620       }
7621       assert(false);
7622       return Temp();
7623    }
7624 }
7625
7626 Temp
7627 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7628 {
7629    Builder bld(ctx->program, ctx->block);
7630    assert(src.regClass() == bld.lm);
7631
7632    /* subgroupExclusiveAnd(val) -> mbcnt(~val & exec) == 0
7633     * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7634     * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7635     */
7636    if (op == nir_op_iand)
7637       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
7638
7639    Temp tmp =
7640       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7641
7642    Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7643
7644    if (op == nir_op_iand)
7645       return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
7646    else if (op == nir_op_ior)
7647       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), mbcnt);
7648    else if (op == nir_op_ixor)
7649       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(),
7650                       bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7651
7652    assert(false);
7653    return Temp();
7654 }
7655
7656 Temp
7657 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7658 {
7659    Builder bld(ctx->program, ctx->block);
7660
7661    /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7662     * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7663     * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7664     */
7665    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7666    if (op == nir_op_iand)
7667       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7668    else if (op == nir_op_ior)
7669       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7670    else if (op == nir_op_ixor)
7671       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7672
7673    assert(false);
7674    return Temp();
7675 }
7676
7677 ReduceOp
7678 get_reduce_op(nir_op op, unsigned bit_size)
7679 {
7680    switch (op) {
7681 #define CASEI(name)                                                                                \
7682    case nir_op_##name:                                                                             \
7683       return (bit_size == 32)   ? name##32                                                         \
7684              : (bit_size == 16) ? name##16                                                         \
7685              : (bit_size == 8)  ? name##8                                                          \
7686                                 : name##64;
7687 #define CASEF(name)                                                                                \
7688    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7689       CASEI(iadd)
7690       CASEI(imul)
7691       CASEI(imin)
7692       CASEI(umin)
7693       CASEI(imax)
7694       CASEI(umax)
7695       CASEI(iand)
7696       CASEI(ior)
7697       CASEI(ixor)
7698       CASEF(fadd)
7699       CASEF(fmul)
7700       CASEF(fmin)
7701       CASEF(fmax)
7702    default: unreachable("unknown reduction op");
7703 #undef CASEI
7704 #undef CASEF
7705    }
7706 }
7707
7708 void
7709 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7710 {
7711    Builder bld(ctx->program, ctx->block);
7712    Definition dst(get_ssa_temp(ctx, &instr->def));
7713    assert(dst.regClass().type() != RegType::vgpr);
7714    if (src.regClass().type() == RegType::vgpr)
7715       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7716    else
7717       bld.copy(dst, src);
7718 }
7719
7720 void
7721 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7722 {
7723    Builder bld(ctx->program, ctx->block);
7724    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7725
7726    if (op == nir_op_fadd) {
7727       src_tmp = as_vgpr(ctx, src_tmp);
7728       Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7729                                       : dst.getTemp();
7730
7731       if (src.ssa->bit_size == 16) {
7732          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7733          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7734       } else {
7735          assert(src.ssa->bit_size == 32);
7736          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7737          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7738       }
7739
7740       if (tmp != dst.getTemp())
7741          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7742
7743       return;
7744    }
7745
7746    if (dst.regClass() == s1)
7747       src_tmp = bld.as_uniform(src_tmp);
7748
7749    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7750       count =
7751          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7752    else if (op == nir_op_ixor)
7753       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7754
7755    assert(dst.getTemp().type() == count.type());
7756
7757    if (nir_src_is_const(src)) {
7758       if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7759          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7760       else if (nir_src_as_uint(src) == 1)
7761          bld.copy(dst, count);
7762       else if (nir_src_as_uint(src) == 0)
7763          bld.copy(dst, Operand::zero(dst.bytes()));
7764       else if (count.type() == RegType::vgpr)
7765          bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7766       else
7767          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7768    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7769       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7770    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7771       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7772    } else if (dst.getTemp().type() == RegType::vgpr) {
7773       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7774    } else {
7775       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7776    }
7777 }
7778
7779 bool
7780 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7781 {
7782    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7783    if (op == nir_op_imul || op == nir_op_fmul)
7784       return false;
7785
7786    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7787       Builder bld(ctx->program, ctx->block);
7788       Definition dst(get_ssa_temp(ctx, &instr->def));
7789       unsigned bit_size = instr->src[0].ssa->bit_size;
7790       if (bit_size > 32)
7791          return false;
7792
7793       Temp thread_count =
7794          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7795       thread_count = emit_wqm(bld, thread_count, Temp(0, s1), nir_intrinsic_include_helpers(instr));
7796
7797       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7798    } else {
7799       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7800    }
7801
7802    return true;
7803 }
7804
7805 bool
7806 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7807 {
7808    Builder bld(ctx->program, ctx->block);
7809    Definition dst(get_ssa_temp(ctx, &instr->def));
7810    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7811    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7812
7813    if (op == nir_op_imul || op == nir_op_fmul)
7814       return false;
7815
7816    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7817       if (instr->src[0].ssa->bit_size > 32)
7818          return false;
7819
7820       Temp packed_tid;
7821       if (inc)
7822          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7823       else
7824          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7825       packed_tid = emit_wqm(bld, packed_tid);
7826
7827       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7828       return true;
7829    }
7830
7831    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7832           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7833
7834    if (inc) {
7835       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7836       return true;
7837    }
7838
7839    /* Copy the source and write the reduction operation identity to the first lane. */
7840    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7841    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7842    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7843    if (dst.bytes() == 8) {
7844       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7845       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7846       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7847       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7848
7849       lo =
7850          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7851       hi =
7852          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7853       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7854    } else {
7855       uint32_t identity = get_reduction_identity(reduce_op, 0);
7856       bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7857                     as_vgpr(ctx, src));
7858    }
7859
7860    return true;
7861 }
7862
7863 Temp
7864 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7865                      Definition dst, Temp src)
7866 {
7867    assert(src.bytes() <= 8);
7868    assert(src.type() == RegType::vgpr);
7869
7870    Builder bld(ctx->program, ctx->block);
7871
7872    unsigned num_defs = 0;
7873    Definition defs[5];
7874    defs[num_defs++] = dst;
7875    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7876
7877    /* scalar identity temporary */
7878    bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7879                      aco_op != aco_opcode::p_reduce;
7880    if (aco_op == aco_opcode::p_exclusive_scan) {
7881       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7882                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7883                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7884                      op == fmul64);
7885    }
7886    if (need_sitmp)
7887       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7888
7889    /* scc clobber */
7890    defs[num_defs++] = bld.def(s1, scc);
7891
7892    /* vcc clobber */
7893    bool clobber_vcc = false;
7894    if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7895       clobber_vcc = true;
7896    if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7897       clobber_vcc = true;
7898    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7899       clobber_vcc = true;
7900
7901    if (clobber_vcc)
7902       defs[num_defs++] = bld.def(bld.lm, vcc);
7903
7904    Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7905       aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7906    reduce->operands[0] = Operand(src);
7907    /* setup_reduce_temp will update these undef operands if needed */
7908    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7909    reduce->operands[2] = Operand(v1.as_linear());
7910    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7911
7912    reduce->reduce_op = op;
7913    reduce->cluster_size = cluster_size;
7914    bld.insert(std::move(reduce));
7915
7916    return dst.getTemp();
7917 }
7918
7919 Temp
7920 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Temp scan, Temp src)
7921 {
7922    Builder bld(ctx->program, ctx->block);
7923
7924    switch (op) {
7925    case iadd8:
7926    case iadd16:
7927    case iadd32: return bld.vsub32(bld.def(scan.regClass()), scan, src);
7928    case ixor64:
7929    case iadd64: {
7930       Temp src00 = bld.tmp(v1);
7931       Temp src01 = bld.tmp(v1);
7932       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
7933       Temp src10 = bld.tmp(v1);
7934       Temp src11 = bld.tmp(v1);
7935       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
7936
7937       Temp lower = bld.tmp(v1);
7938       Temp upper = bld.tmp(v1);
7939       if (op == iadd64) {
7940          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
7941          bld.vsub32(Definition(upper), src01, src11, false, borrow);
7942       } else {
7943          bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
7944          bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
7945       }
7946       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lower, upper);
7947    }
7948    case ixor8:
7949    case ixor16:
7950    case ixor32: return bld.vop2(aco_opcode::v_xor_b32, bld.def(scan.regClass()), scan, src);
7951    default: unreachable("Unsupported op");
7952    }
7953 }
7954
7955 void
7956 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7957 {
7958    Builder bld(ctx->program, ctx->block);
7959    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7960    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7961
7962    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7963    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7964    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7965    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7966
7967    /* Build DD X/Y */
7968    if (ctx->program->gfx_level >= GFX8) {
7969       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7970       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7971       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7972       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7973       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7974       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7975    } else {
7976       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7977       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7978       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7979       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7980       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
7981
7982       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7983       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7984       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
7985       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7986       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7987    }
7988
7989    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7990    aco_opcode mad =
7991       ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7992    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7993    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7994    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7995    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7996    Temp wqm1 = bld.tmp(v1);
7997    emit_wqm(bld, tmp1, wqm1, true);
7998    Temp wqm2 = bld.tmp(v1);
7999    emit_wqm(bld, tmp2, wqm2, true);
8000    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
8001    return;
8002 }
8003
8004 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8005 Temp lanecount_to_mask(isel_context* ctx, Temp count);
8006 void pops_await_overlapped_waves(isel_context* ctx);
8007
8008 Temp
8009 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
8010 {
8011    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
8012    if (intrin == nir_intrinsic_load_barycentric_pixel ||
8013        intrin == nir_intrinsic_load_barycentric_at_offset) {
8014       return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
8015    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
8016       return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
8017    } else {
8018       assert(intrin == nir_intrinsic_load_barycentric_sample);
8019       return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
8020    }
8021 }
8022
8023 void
8024 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
8025                          unsigned wave_done, unsigned* offset0, unsigned* offset1)
8026 {
8027    unsigned ordered_count_index = index_operand & 0x3f;
8028    unsigned count_dword = (index_operand >> 24) & 0xf;
8029
8030    assert(ctx->options->gfx_level >= GFX10);
8031    assert(count_dword >= 1 && count_dword <= 4);
8032
8033    *offset0 = ordered_count_index << 2;
8034    *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
8035
8036    if (ctx->options->gfx_level < GFX11)
8037       *offset1 |= 3 /* GS shader type */ << 2;
8038 }
8039
8040 struct aco_export_mrt {
8041    Operand out[4];
8042    unsigned enabled_channels;
8043    unsigned target;
8044    bool compr;
8045 };
8046
8047 static void
8048 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8049                                 const struct aco_export_mrt* mrt1)
8050 {
8051    Builder bld(ctx->program, ctx->block);
8052
8053    aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
8054       aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8055    for (unsigned i = 0; i < 4; i++) {
8056       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8057       exp->operands[i].setLateKill(true);
8058       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8059       exp->operands[i + 4].setLateKill(true);
8060    }
8061
8062    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8063    exp->definitions[0] = bld.def(type); /* mrt0 */
8064    exp->definitions[1] = bld.def(type); /* mrt1 */
8065    exp->definitions[2] = bld.def(v1);
8066    exp->definitions[3] = bld.def(bld.lm);
8067    exp->definitions[4] = bld.def(bld.lm, vcc);
8068    exp->definitions[5] = bld.def(s1, scc);
8069    ctx->block->instructions.emplace_back(std::move(exp));
8070
8071    ctx->program->has_color_exports = true;
8072 }
8073
8074 void
8075 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8076 {
8077    Builder bld(ctx->program, ctx->block);
8078    switch (instr->intrinsic) {
8079    case nir_intrinsic_load_barycentric_sample:
8080    case nir_intrinsic_load_barycentric_pixel:
8081    case nir_intrinsic_load_barycentric_centroid: {
8082       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8083       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8084       assert(bary.size() == 2);
8085       Temp dst = get_ssa_temp(ctx, &instr->def);
8086       bld.copy(Definition(dst), bary);
8087       emit_split_vector(ctx, dst, 2);
8088       break;
8089    }
8090    case nir_intrinsic_load_barycentric_model: {
8091       Temp model = get_arg(ctx, ctx->args->pull_model);
8092       assert(model.size() == 3);
8093       Temp dst = get_ssa_temp(ctx, &instr->def);
8094       bld.copy(Definition(dst), model);
8095       emit_split_vector(ctx, dst, 3);
8096       break;
8097    }
8098    case nir_intrinsic_load_barycentric_at_offset: {
8099       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8100       RegClass rc = RegClass(offset.type(), 1);
8101       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8102       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8103       Temp bary = get_interp_param(ctx, instr->intrinsic,
8104                                    (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8105       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8106       break;
8107    }
8108    case nir_intrinsic_load_front_face: {
8109       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8110                Operand::zero(), get_arg(ctx, ctx->args->front_face));
8111       break;
8112    }
8113    case nir_intrinsic_load_view_index: {
8114       Temp dst = get_ssa_temp(ctx, &instr->def);
8115       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8116       break;
8117    }
8118    case nir_intrinsic_load_frag_coord: {
8119       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8120       break;
8121    }
8122    case nir_intrinsic_load_frag_shading_rate:
8123       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8124       break;
8125    case nir_intrinsic_load_sample_pos: {
8126       Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8127       Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8128       bld.pseudo(
8129          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8130          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8131          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8132       break;
8133    }
8134    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8135    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8136    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8137    case nir_intrinsic_load_input:
8138    case nir_intrinsic_load_input_vertex:
8139       if (ctx->program->stage == fragment_fs)
8140          visit_load_fs_input(ctx, instr);
8141       else
8142          isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8143       break;
8144    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8145    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8146    case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8147    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8148    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8149    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8150    case nir_intrinsic_shared_atomic:
8151    case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8152    case nir_intrinsic_load_shared2_amd:
8153    case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8154    case nir_intrinsic_bindless_image_load:
8155    case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8156    case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8157    case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8158    case nir_intrinsic_bindless_image_atomic:
8159    case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8160    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8161    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8162    case nir_intrinsic_load_typed_buffer_amd:
8163    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8164    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8165    case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8166    case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8167    case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8168    case nir_intrinsic_global_atomic_amd:
8169    case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8170    case nir_intrinsic_ssbo_atomic:
8171    case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8172    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8173    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8174    case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8175    case nir_intrinsic_load_num_workgroups: {
8176       Temp dst = get_ssa_temp(ctx, &instr->def);
8177       if (ctx->options->load_grid_size_from_user_sgpr) {
8178          bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8179       } else {
8180          Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8181          assert(addr.regClass() == s2);
8182          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8183                     bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8184                     bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8185       }
8186       emit_split_vector(ctx, dst, 3);
8187       break;
8188    }
8189    case nir_intrinsic_load_ray_launch_size: {
8190       Temp dst = get_ssa_temp(ctx, &instr->def);
8191       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_size)));
8192       emit_split_vector(ctx, dst, 3);
8193       break;
8194    }
8195    case nir_intrinsic_load_ray_launch_id: {
8196       Temp dst = get_ssa_temp(ctx, &instr->def);
8197       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_id)));
8198       emit_split_vector(ctx, dst, 3);
8199       break;
8200    }
8201    case nir_intrinsic_load_ray_launch_size_addr_amd: {
8202       Temp dst = get_ssa_temp(ctx, &instr->def);
8203       Temp addr = get_arg(ctx, ctx->args->rt.launch_size_addr);
8204       assert(addr.regClass() == s2);
8205       bld.copy(Definition(dst), Operand(addr));
8206       break;
8207    }
8208    case nir_intrinsic_load_local_invocation_id: {
8209       Temp dst = get_ssa_temp(ctx, &instr->def);
8210       if (ctx->options->gfx_level >= GFX11) {
8211          Temp local_ids[3];
8212
8213          /* Thread IDs are packed in VGPR0, 10 bits per component. */
8214          for (uint32_t i = 0; i < 3; i++) {
8215             local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8216                                     get_arg(ctx, ctx->args->local_invocation_ids),
8217                                     Operand::c32(i * 10u), Operand::c32(10u));
8218          }
8219
8220          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8221                     local_ids[2]);
8222       } else {
8223          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8224       }
8225       emit_split_vector(ctx, dst, 3);
8226       break;
8227    }
8228    case nir_intrinsic_load_workgroup_id: {
8229       Temp dst = get_ssa_temp(ctx, &instr->def);
8230       if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8231          const struct ac_arg* ids = ctx->args->workgroup_ids;
8232          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8233                     ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
8234                     ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
8235                     ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
8236          emit_split_vector(ctx, dst, 3);
8237       } else {
8238          isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8239       }
8240       break;
8241    }
8242    case nir_intrinsic_load_local_invocation_index: {
8243       if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8244          if (ctx->options->gfx_level >= GFX11) {
8245             /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
8246             Temp wave_id =
8247                bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8248                         get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8249
8250             Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8251                                  Operand::c32(ctx->program->wave_size));
8252             emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8253          } else {
8254             bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8255                      get_arg(ctx, ctx->args->vs_rel_patch_id));
8256          }
8257          break;
8258       } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8259                  ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8260          bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8261          break;
8262       } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8263          emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8264          break;
8265       }
8266
8267       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8268
8269       /* The tg_size bits [6:11] contain the subgroup id,
8270        * we need this multiplied by the wave size, and then OR the thread id to it.
8271        */
8272       if (ctx->program->wave_size == 64) {
8273          /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8274           * feed that to v_or */
8275          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8276                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8277          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8278       } else {
8279          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8280          Temp tg_num =
8281             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8282                      get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8283          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8284                   Operand::c32(0x5u), id);
8285       }
8286       break;
8287    }
8288    case nir_intrinsic_load_subgroup_invocation: {
8289       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8290       break;
8291    }
8292    case nir_intrinsic_ballot: {
8293       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8294       Temp dst = get_ssa_temp(ctx, &instr->def);
8295
8296       if (instr->src[0].ssa->bit_size == 1) {
8297          assert(src.regClass() == bld.lm);
8298       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8299          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8300       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8301          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8302       } else {
8303          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8304       }
8305
8306       /* Make sure that all inactive lanes return zero.
8307        * Value-numbering might remove the comparison above */
8308       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8309       if (dst.size() != bld.lm.size()) {
8310          /* Wave32 with ballot size set to 64 */
8311          src =
8312             bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8313       }
8314
8315       emit_wqm(bld, src, dst);
8316       break;
8317    }
8318    case nir_intrinsic_shuffle:
8319    case nir_intrinsic_read_invocation: {
8320       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8321       if (!nir_src_is_divergent(instr->src[0])) {
8322          emit_uniform_subgroup(ctx, instr, src);
8323       } else {
8324          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8325          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8326              !nir_src_is_divergent(instr->src[1]))
8327             tid = bld.as_uniform(tid);
8328          Temp dst = get_ssa_temp(ctx, &instr->def);
8329
8330          if (instr->def.bit_size != 1)
8331             src = as_vgpr(ctx, src);
8332
8333          if (src.regClass() == v1b || src.regClass() == v2b) {
8334             Temp tmp = bld.tmp(v1);
8335             tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8336             if (dst.type() == RegType::vgpr)
8337                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8338                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8339             else
8340                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8341          } else if (src.regClass() == v1) {
8342             emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8343          } else if (src.regClass() == v2) {
8344             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8345             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8346             lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8347             hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8348             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8349             emit_split_vector(ctx, dst, 2);
8350          } else if (instr->def.bit_size == 1 && tid.regClass() == s1) {
8351             assert(src.regClass() == bld.lm);
8352             Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8353             bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8354          } else if (instr->def.bit_size == 1 && tid.regClass() == v1) {
8355             assert(src.regClass() == bld.lm);
8356             Temp tmp;
8357             if (ctx->program->gfx_level <= GFX7)
8358                tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8359             else if (ctx->program->wave_size == 64)
8360                tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8361             else
8362                tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8363             tmp = emit_extract_vector(ctx, tmp, 0, v1);
8364             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8365             emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8366                      dst);
8367          } else {
8368             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8369          }
8370       }
8371       break;
8372    }
8373    case nir_intrinsic_load_sample_id: {
8374       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8375                get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8376       break;
8377    }
8378    case nir_intrinsic_read_first_invocation: {
8379       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8380       Temp dst = get_ssa_temp(ctx, &instr->def);
8381       if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8382          emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8383       } else if (src.regClass() == v2) {
8384          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8385          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8386          lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8387          hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8388          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8389          emit_split_vector(ctx, dst, 2);
8390       } else if (instr->def.bit_size == 1) {
8391          assert(src.regClass() == bld.lm);
8392          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8393                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8394          bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8395       } else {
8396          bld.copy(Definition(dst), src);
8397       }
8398       break;
8399    }
8400    case nir_intrinsic_vote_all: {
8401       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8402       Temp dst = get_ssa_temp(ctx, &instr->def);
8403       assert(src.regClass() == bld.lm);
8404       assert(dst.regClass() == bld.lm);
8405
8406       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8407       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8408                .def(1)
8409                .getTemp();
8410       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8411       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8412       break;
8413    }
8414    case nir_intrinsic_vote_any: {
8415       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8416       Temp dst = get_ssa_temp(ctx, &instr->def);
8417       assert(src.regClass() == bld.lm);
8418       assert(dst.regClass() == bld.lm);
8419
8420       Temp tmp = bool_to_scalar_condition(ctx, src);
8421       bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8422       break;
8423    }
8424    case nir_intrinsic_reduce:
8425    case nir_intrinsic_inclusive_scan:
8426    case nir_intrinsic_exclusive_scan: {
8427       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8428       Temp dst = get_ssa_temp(ctx, &instr->def);
8429       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8430       unsigned cluster_size =
8431          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8432       cluster_size = util_next_power_of_two(
8433          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8434       bool create_helpers =
8435          instr->intrinsic == nir_intrinsic_reduce && nir_intrinsic_include_helpers(instr);
8436
8437       if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8438           instr->def.bit_size != 1) {
8439          /* We use divergence analysis to assign the regclass, so check if it's
8440           * working as expected */
8441          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8442          if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8443             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8444          assert(instr->def.divergent == expected_divergent);
8445
8446          if (instr->intrinsic == nir_intrinsic_reduce) {
8447             if (emit_uniform_reduce(ctx, instr))
8448                break;
8449          } else if (emit_uniform_scan(ctx, instr)) {
8450             break;
8451          }
8452       }
8453
8454       if (instr->def.bit_size == 1) {
8455          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8456             op = nir_op_iand;
8457          else if (op == nir_op_iadd)
8458             op = nir_op_ixor;
8459          else if (op == nir_op_umax || op == nir_op_imax)
8460             op = nir_op_ior;
8461          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8462
8463          switch (instr->intrinsic) {
8464          case nir_intrinsic_reduce:
8465             emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst, create_helpers);
8466             break;
8467          case nir_intrinsic_exclusive_scan:
8468             emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8469             break;
8470          case nir_intrinsic_inclusive_scan:
8471             emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8472             break;
8473          default: assert(false);
8474          }
8475       } else if (cluster_size == 1) {
8476          bld.copy(Definition(dst), src);
8477       } else {
8478          unsigned bit_size = instr->src[0].ssa->bit_size;
8479
8480          src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8481
8482          ReduceOp reduce_op = get_reduce_op(op, bit_size);
8483
8484          aco_opcode aco_op;
8485          switch (instr->intrinsic) {
8486          case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8487          case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8488          case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8489          default: unreachable("unknown reduce intrinsic");
8490          }
8491
8492          /* Avoid whole wave shift. */
8493          const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8494                                                   (op == nir_op_iadd || op == nir_op_ixor) &&
8495                                                   dst.type() == RegType::vgpr;
8496          if (use_inclusive_for_exclusive)
8497             aco_op = aco_opcode::p_inclusive_scan;
8498
8499          Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8500                                              bld.def(dst.regClass()), src);
8501
8502          if (use_inclusive_for_exclusive)
8503             tmp_dst = inclusive_scan_to_exclusive(ctx, reduce_op, tmp_dst, src);
8504
8505          emit_wqm(bld, tmp_dst, dst, create_helpers);
8506       }
8507       break;
8508    }
8509    case nir_intrinsic_quad_broadcast:
8510    case nir_intrinsic_quad_swap_horizontal:
8511    case nir_intrinsic_quad_swap_vertical:
8512    case nir_intrinsic_quad_swap_diagonal:
8513    case nir_intrinsic_quad_swizzle_amd: {
8514       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8515
8516       if (!instr->def.divergent) {
8517          emit_uniform_subgroup(ctx, instr, src);
8518          break;
8519       }
8520
8521       /* Quad broadcast lane. */
8522       unsigned lane = 0;
8523       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8524       bool bool_use_valu = instr->def.bit_size == 1;
8525
8526       uint16_t dpp_ctrl = 0;
8527
8528       switch (instr->intrinsic) {
8529       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8530       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8531       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8532       case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8533       case nir_intrinsic_quad_broadcast:
8534          lane = nir_src_as_const_value(instr->src[1])->u32;
8535          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8536          bool_use_valu = false;
8537          break;
8538       default: break;
8539       }
8540
8541       Temp dst = get_ssa_temp(ctx, &instr->def);
8542       Temp tmp(dst);
8543
8544       /* Setup source. */
8545       if (bool_use_valu)
8546          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8547                             Operand::c32(-1), src);
8548       else if (instr->def.bit_size != 1)
8549          src = as_vgpr(ctx, src);
8550
8551       /* Setup temporary destination. */
8552       if (bool_use_valu)
8553          tmp = bld.tmp(v1);
8554       else if (ctx->program->stage == fragment_fs)
8555          tmp = bld.tmp(dst.regClass());
8556
8557       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8558          /* Special case for quad broadcast using SALU only. */
8559          assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
8560
8561          uint32_t half_mask = 0x11111111u << lane;
8562          Operand mask_tmp = bld.lm.bytes() == 4
8563                                ? Operand::c32(half_mask)
8564                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8565                                             Operand::c32(half_mask), Operand::c32(half_mask));
8566
8567          src =
8568             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8569          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8570          bld.sop1(Builder::s_wqm, Definition(tmp), src);
8571       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8572          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8573          Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
8574
8575          if (ctx->program->gfx_level >= GFX8)
8576             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
8577          else
8578             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8579
8580          if (excess_bytes)
8581             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
8582                        bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
8583       } else if (instr->def.bit_size == 64) {
8584          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8585          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8586
8587          if (ctx->program->gfx_level >= GFX8) {
8588             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
8589             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
8590          } else {
8591             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8592             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8593          }
8594
8595          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
8596          emit_split_vector(ctx, tmp, 2);
8597       } else {
8598          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8599       }
8600
8601       if (tmp.id() != dst.id()) {
8602          if (bool_use_valu)
8603             tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
8604
8605          /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8606          emit_wqm(bld, tmp, dst, true);
8607       }
8608
8609       break;
8610    }
8611    case nir_intrinsic_masked_swizzle_amd: {
8612       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8613       if (!instr->def.divergent) {
8614          emit_uniform_subgroup(ctx, instr, src);
8615          break;
8616       }
8617       Temp dst = get_ssa_temp(ctx, &instr->def);
8618       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8619
8620       if (instr->def.bit_size != 1)
8621          src = as_vgpr(ctx, src);
8622
8623       if (instr->def.bit_size == 1) {
8624          assert(src.regClass() == bld.lm);
8625          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8626                             Operand::c32(-1), src);
8627          src = emit_masked_swizzle(ctx, bld, src, mask);
8628          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8629          emit_wqm(bld, tmp, dst);
8630       } else if (dst.regClass() == v1b) {
8631          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8632          emit_extract_vector(ctx, tmp, 0, dst);
8633       } else if (dst.regClass() == v2b) {
8634          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8635          emit_extract_vector(ctx, tmp, 0, dst);
8636       } else if (dst.regClass() == v1) {
8637          emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8638       } else if (dst.regClass() == v2) {
8639          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8640          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8641          lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8642          hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8643          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8644          emit_split_vector(ctx, dst, 2);
8645       } else {
8646          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8647       }
8648       break;
8649    }
8650    case nir_intrinsic_write_invocation_amd: {
8651       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8652       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8653       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8654       Temp dst = get_ssa_temp(ctx, &instr->def);
8655       if (dst.regClass() == v1) {
8656          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8657          emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8658       } else if (dst.regClass() == v2) {
8659          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8660          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8661          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8662          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8663          Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
8664          Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8665          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8666          emit_split_vector(ctx, dst, 2);
8667       } else {
8668          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8669       }
8670       break;
8671    }
8672    case nir_intrinsic_mbcnt_amd: {
8673       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8674       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8675       Temp dst = get_ssa_temp(ctx, &instr->def);
8676       /* Fit 64-bit mask for wave32 */
8677       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8678       Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8679       emit_wqm(bld, wqm_tmp, dst);
8680       break;
8681    }
8682    case nir_intrinsic_lane_permute_16_amd: {
8683       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8684       Temp dst = get_ssa_temp(ctx, &instr->def);
8685       assert(ctx->program->gfx_level >= GFX10);
8686
8687       if (src.regClass() == s1) {
8688          bld.copy(Definition(dst), src);
8689       } else if (dst.regClass() == v1 && src.regClass() == v1) {
8690          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8691                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8692                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8693       } else {
8694          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8695       }
8696       break;
8697    }
8698    case nir_intrinsic_load_helper_invocation:
8699    case nir_intrinsic_is_helper_invocation: {
8700       /* load_helper() after demote() get lowered to is_helper().
8701        * Otherwise, these two behave the same. */
8702       Temp dst = get_ssa_temp(ctx, &instr->def);
8703       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8704       ctx->block->kind |= block_kind_needs_lowering;
8705       ctx->program->needs_exact = true;
8706       break;
8707    }
8708    case nir_intrinsic_demote:
8709    case nir_intrinsic_demote_if: {
8710       Operand cond = Operand::c32(-1u);
8711       if (instr->intrinsic == nir_intrinsic_demote_if) {
8712          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8713          assert(src.regClass() == bld.lm);
8714          cond =
8715             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8716       }
8717
8718       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8719
8720       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8721          ctx->cf_info.exec_potentially_empty_discard = true;
8722
8723       ctx->block->kind |= block_kind_uses_discard;
8724       ctx->program->needs_exact = true;
8725       break;
8726    }
8727    case nir_intrinsic_terminate:
8728    case nir_intrinsic_terminate_if:
8729    case nir_intrinsic_discard:
8730    case nir_intrinsic_discard_if: {
8731       Operand cond = Operand::c32(-1u);
8732       if (instr->intrinsic == nir_intrinsic_discard_if ||
8733           instr->intrinsic == nir_intrinsic_terminate_if) {
8734          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8735          assert(src.regClass() == bld.lm);
8736          cond =
8737             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8738
8739          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
8740       }
8741
8742       bld.pseudo(aco_opcode::p_discard_if, cond);
8743
8744       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8745          ctx->cf_info.exec_potentially_empty_discard = true;
8746       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8747       ctx->block->kind |= block_kind_uses_discard;
8748       ctx->program->needs_exact = true;
8749       break;
8750    }
8751    case nir_intrinsic_first_invocation: {
8752       emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8753                get_ssa_temp(ctx, &instr->def));
8754       break;
8755    }
8756    case nir_intrinsic_last_invocation: {
8757       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8758       Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8759                            Operand::c32(ctx->program->wave_size - 1u), flbit);
8760       emit_wqm(bld, last, get_ssa_temp(ctx, &instr->def));
8761       break;
8762    }
8763    case nir_intrinsic_elect: {
8764       /* p_elect is lowered in aco_insert_exec_mask.
8765        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8766        * two p_elect with different exec masks as the same.
8767        */
8768       Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
8769       emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->def));
8770       ctx->block->kind |= block_kind_needs_lowering;
8771       break;
8772    }
8773    case nir_intrinsic_shader_clock: {
8774       Temp dst = get_ssa_temp(ctx, &instr->def);
8775       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8776           ctx->options->gfx_level >= GFX10_3) {
8777          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8778          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8779          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8780       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8781                  ctx->options->gfx_level >= GFX11) {
8782          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8783                   Operand::c32(sendmsg_rtn_get_realtime));
8784       } else {
8785          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8786                                 ? aco_opcode::s_memrealtime
8787                                 : aco_opcode::s_memtime;
8788          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8789       }
8790       emit_split_vector(ctx, dst, 2);
8791       break;
8792    }
8793    case nir_intrinsic_load_vertex_id_zero_base: {
8794       Temp dst = get_ssa_temp(ctx, &instr->def);
8795       bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
8796       break;
8797    }
8798    case nir_intrinsic_load_first_vertex: {
8799       Temp dst = get_ssa_temp(ctx, &instr->def);
8800       bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
8801       break;
8802    }
8803    case nir_intrinsic_load_base_instance: {
8804       Temp dst = get_ssa_temp(ctx, &instr->def);
8805       bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
8806       break;
8807    }
8808    case nir_intrinsic_load_instance_id: {
8809       Temp dst = get_ssa_temp(ctx, &instr->def);
8810       bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
8811       break;
8812    }
8813    case nir_intrinsic_load_draw_id: {
8814       Temp dst = get_ssa_temp(ctx, &instr->def);
8815       bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
8816       break;
8817    }
8818    case nir_intrinsic_load_invocation_id: {
8819       Temp dst = get_ssa_temp(ctx, &instr->def);
8820
8821       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8822          if (ctx->options->gfx_level >= GFX10)
8823             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8824                          get_arg(ctx, ctx->args->gs_invocation_id));
8825          else
8826             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
8827       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8828          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
8829                   Operand::c32(8u), Operand::c32(5u));
8830       } else {
8831          unreachable("Unsupported stage for load_invocation_id");
8832       }
8833
8834       break;
8835    }
8836    case nir_intrinsic_load_primitive_id: {
8837       Temp dst = get_ssa_temp(ctx, &instr->def);
8838
8839       switch (ctx->shader->info.stage) {
8840       case MESA_SHADER_GEOMETRY:
8841          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8842          break;
8843       case MESA_SHADER_TESS_CTRL:
8844          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
8845          break;
8846       case MESA_SHADER_TESS_EVAL:
8847          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
8848          break;
8849       default:
8850          if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
8851             /* In case of NGG, the GS threads always have the primitive ID
8852              * even if there is no SW GS. */
8853             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
8854             break;
8855          } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
8856             bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
8857             break;
8858          }
8859          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8860       }
8861
8862       break;
8863    }
8864    case nir_intrinsic_sendmsg_amd: {
8865       unsigned imm = nir_intrinsic_base(instr);
8866       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8867       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), -1, imm);
8868       break;
8869    }
8870    case nir_intrinsic_load_gs_wave_id_amd: {
8871       Temp dst = get_ssa_temp(ctx, &instr->def);
8872       if (ctx->args->merged_wave_info.used)
8873          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
8874                     get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
8875                     Operand::zero());
8876       else if (ctx->args->gs_wave_id.used)
8877          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
8878       else
8879          unreachable("Shader doesn't have GS wave ID.");
8880       break;
8881    }
8882    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
8883       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8884       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
8885       break;
8886    }
8887    case nir_intrinsic_gds_atomic_add_amd: {
8888       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8889       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8890       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8891       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8892       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8893              true);
8894       break;
8895    }
8896    case nir_intrinsic_load_sbt_base_amd: {
8897       Temp dst = get_ssa_temp(ctx, &instr->def);
8898       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
8899       assert(addr.regClass() == s2);
8900       bld.copy(Definition(dst), Operand(addr));
8901       break;
8902    }
8903    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8904    case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
8905       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8906                get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
8907       break;
8908    case nir_intrinsic_load_resume_shader_address_amd: {
8909       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
8910                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
8911       break;
8912    }
8913    case nir_intrinsic_overwrite_vs_arguments_amd: {
8914       ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8915       ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8916       break;
8917    }
8918    case nir_intrinsic_overwrite_tes_arguments_amd: {
8919       ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
8920       ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
8921       ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
8922       ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
8923       break;
8924    }
8925    case nir_intrinsic_load_scalar_arg_amd:
8926    case nir_intrinsic_load_vector_arg_amd: {
8927       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
8928       Temp dst = get_ssa_temp(ctx, &instr->def);
8929       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
8930       assert(src.id());
8931       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
8932                                                                                   : RegType::vgpr));
8933       bld.copy(Definition(dst), src);
8934       emit_split_vector(ctx, dst, dst.size());
8935       break;
8936    }
8937    case nir_intrinsic_ordered_xfb_counter_add_amd: {
8938       Temp dst = get_ssa_temp(ctx, &instr->def);
8939       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
8940       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
8941
8942       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8943       unsigned offset0, offset1;
8944       Instruction* ds_instr;
8945       Operand m;
8946
8947       /* Lock a GDS mutex. */
8948       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
8949       m = bld.m0(bld.as_uniform(ordered_id));
8950       ds_instr =
8951          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8952       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8953
8954       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
8955          aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
8956       unsigned write_mask = nir_intrinsic_write_mask(instr);
8957
8958       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
8959
8960       for (unsigned i = 0; i < instr->num_components; i++) {
8961          if (write_mask & (1 << i)) {
8962             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8963
8964             if (use_gds_registers) {
8965                ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(),
8966                                  chan_counter, i * 4, 0u, true);
8967             } else {
8968                m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
8969
8970                ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
8971                                  i * 4, 0u, true);
8972             }
8973             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8974
8975             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
8976          } else {
8977             vec->operands[i] = Operand::zero();
8978          }
8979       }
8980
8981       vec->definitions[0] = Definition(dst);
8982       ctx->block->instructions.emplace_back(std::move(vec));
8983
8984       /* Unlock a GDS mutex. */
8985       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
8986       m = bld.m0(bld.as_uniform(ordered_id));
8987       ds_instr =
8988          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8989       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8990
8991       emit_split_vector(ctx, dst, instr->num_components);
8992       break;
8993    }
8994    case nir_intrinsic_xfb_counter_sub_amd: {
8995       bool use_gds_registers = ctx->options->gfx_level >= GFX11 && ctx->options->is_opengl;
8996
8997       unsigned write_mask = nir_intrinsic_write_mask(instr);
8998       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
8999       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9000
9001       u_foreach_bit (i, write_mask) {
9002          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9003          Instruction* ds_instr;
9004
9005          if (use_gds_registers) {
9006             ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9007                               i * 4, 0u, true);
9008          } else {
9009             Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0x100u)));
9010
9011             ds_instr = bld.ds(aco_opcode::ds_sub_rtn_u32, bld.def(v1), gds_base, chan_counter, m,
9012                               i * 4, 0u, true);
9013          }
9014          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9015       }
9016       break;
9017    }
9018    case nir_intrinsic_export_amd: {
9019       unsigned flags = nir_intrinsic_flags(instr);
9020       unsigned target = nir_intrinsic_base(instr);
9021       unsigned write_mask = nir_intrinsic_write_mask(instr);
9022
9023       /* Mark vertex export block. */
9024       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9025          ctx->block->kind |= block_kind_export_end;
9026
9027       if (target < V_008DFC_SQ_EXP_MRTZ)
9028          ctx->program->has_color_exports = true;
9029
9030       aco_ptr<Export_instruction> exp{
9031          create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9032
9033       exp->dest = target;
9034       exp->enabled_mask = write_mask;
9035       exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
9036
9037       /* ACO may reorder position/mrt export instructions, then mark done for last
9038        * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
9039        * exports here and leave it to ACO.
9040        */
9041       if (target == V_008DFC_SQ_EXP_PRIM)
9042          exp->done = flags & AC_EXP_FLAG_DONE;
9043       else
9044          exp->done = false;
9045
9046       /* ACO may reorder mrt export instructions, then mark valid mask for last
9047        * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
9048        * exports here and leave it to ACO.
9049        */
9050       if (target > V_008DFC_SQ_EXP_NULL)
9051          exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9052       else
9053          exp->valid_mask = false;
9054
9055       /* Compressed export uses two bits for a channel. */
9056       uint32_t channel_mask =
9057          exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
9058
9059       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9060       for (unsigned i = 0; i < 4; i++) {
9061          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9062                                ? Operand(emit_extract_vector(ctx, value, i, v1))
9063                                : Operand(v1);
9064       }
9065
9066       ctx->block->instructions.emplace_back(std::move(exp));
9067       break;
9068    }
9069    case nir_intrinsic_export_dual_src_blend_amd: {
9070       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9071       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9072       unsigned write_mask = nir_intrinsic_write_mask(instr);
9073
9074       struct aco_export_mrt mrt0, mrt1;
9075       for (unsigned i = 0; i < 4; i++) {
9076          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9077                                                     : Operand(v1);
9078
9079          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9080                                                     : Operand(v1);
9081       }
9082       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9083
9084       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9085
9086       ctx->block->kind |= block_kind_export_end;
9087       break;
9088    }
9089    case nir_intrinsic_strict_wqm_coord_amd: {
9090       Temp dst = get_ssa_temp(ctx, &instr->def);
9091       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9092       Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
9093       unsigned begin_size = nir_intrinsic_base(instr);
9094
9095       unsigned num_src = 1;
9096       auto it = ctx->allocated_vec.find(src.id());
9097       if (it != ctx->allocated_vec.end())
9098          num_src = src.bytes() / it->second[0].bytes();
9099
9100       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9101          aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
9102
9103       if (begin_size)
9104          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9105       for (unsigned i = 0; i < num_src; i++) {
9106          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9107          vec->operands[i + !!begin_size] = Operand(comp);
9108       }
9109
9110       vec->definitions[0] = Definition(tmp);
9111       ctx->block->instructions.emplace_back(std::move(vec));
9112
9113       bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
9114       break;
9115    }
9116    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9117       Temp dst = get_ssa_temp(ctx, &instr->def);
9118       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9119                Operand::c32(aco_symbol_lds_ngg_scratch_base));
9120       break;
9121    }
9122    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9123       Temp dst = get_ssa_temp(ctx, &instr->def);
9124       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9125                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9126       break;
9127    }
9128    case nir_intrinsic_store_scalar_arg_amd: {
9129       ctx->arg_temps[nir_intrinsic_base(instr)] =
9130          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9131       break;
9132    }
9133    case nir_intrinsic_store_vector_arg_amd: {
9134       ctx->arg_temps[nir_intrinsic_base(instr)] =
9135          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9136       break;
9137    }
9138    case nir_intrinsic_begin_invocation_interlock: {
9139       pops_await_overlapped_waves(ctx);
9140       break;
9141    }
9142    case nir_intrinsic_end_invocation_interlock: {
9143       if (ctx->options->gfx_level < GFX11)
9144          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9145       break;
9146    }
9147    default:
9148       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9149       abort();
9150
9151       break;
9152    }
9153 }
9154
9155 void
9156 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9157 {
9158    if (vec->parent_instr->type != nir_instr_type_alu)
9159       return;
9160    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9161    if (vec_instr->op != nir_op_vec(vec->num_components))
9162       return;
9163
9164    for (unsigned i = 0; i < vec->num_components; i++) {
9165       cv[i] =
9166          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9167    }
9168 }
9169
9170 void
9171 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9172 {
9173    assert(instr->op != nir_texop_samples_identical);
9174
9175    Builder bld(ctx->program, ctx->block);
9176    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9177         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9178         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9179    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9180                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9181                            coord = Temp(), wqm_coord = Temp();
9182    std::vector<Temp> coords;
9183    std::vector<Temp> derivs;
9184    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9185
9186    for (unsigned i = 0; i < instr->num_srcs; i++) {
9187       switch (instr->src[i].src_type) {
9188       case nir_tex_src_texture_handle:
9189          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9190          break;
9191       case nir_tex_src_sampler_handle:
9192          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9193          break;
9194       default: break;
9195       }
9196    }
9197
9198    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9199                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9200    bool tg4_integer_cube_workaround =
9201       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9202
9203    bool a16 = false, g16 = false;
9204
9205    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9206    if (coord_idx > 0)
9207       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9208
9209    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9210    if (ddx_idx > 0)
9211       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9212
9213    for (unsigned i = 0; i < instr->num_srcs; i++) {
9214       switch (instr->src[i].src_type) {
9215       case nir_tex_src_coord: {
9216          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9217          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9218          break;
9219       }
9220       case nir_tex_src_backend1: {
9221          assert(instr->src[i].src.ssa->bit_size == 32);
9222          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9223          has_wqm_coord = true;
9224          break;
9225       }
9226       case nir_tex_src_bias:
9227          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9228          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9229          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9230          has_bias = true;
9231          break;
9232       case nir_tex_src_lod: {
9233          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9234             level_zero = true;
9235          } else {
9236             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9237             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9238             has_lod = true;
9239          }
9240          break;
9241       }
9242       case nir_tex_src_min_lod:
9243          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9244          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9245          has_clamped_lod = true;
9246          break;
9247       case nir_tex_src_comparator:
9248          if (instr->is_shadow) {
9249             assert(instr->src[i].src.ssa->bit_size == 32);
9250             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9251             has_compare = true;
9252          }
9253          break;
9254       case nir_tex_src_offset:
9255       case nir_tex_src_backend2:
9256          assert(instr->src[i].src.ssa->bit_size == 32);
9257          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9258          get_const_vec(instr->src[i].src.ssa, const_offset);
9259          has_offset = true;
9260          break;
9261       case nir_tex_src_ddx:
9262          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9263          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9264          has_ddx = true;
9265          break;
9266       case nir_tex_src_ddy:
9267          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9268          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9269          has_ddy = true;
9270          break;
9271       case nir_tex_src_ms_index:
9272          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9273          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9274          has_sample_index = true;
9275          break;
9276       case nir_tex_src_texture_offset:
9277       case nir_tex_src_sampler_offset:
9278       default: break;
9279       }
9280    }
9281
9282    if (has_wqm_coord) {
9283       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9284              instr->op == nir_texop_lod);
9285       assert(wqm_coord.regClass().is_linear_vgpr());
9286       assert(!a16 && !g16);
9287    }
9288
9289    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9290       level_zero = true;
9291
9292    if (has_offset) {
9293       assert(instr->op != nir_texop_txf);
9294
9295       aco_ptr<Instruction> tmp_instr;
9296       Temp acc, pack = Temp();
9297
9298       uint32_t pack_const = 0;
9299       for (unsigned i = 0; i < offset.size(); i++) {
9300          if (!const_offset[i])
9301             continue;
9302          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9303       }
9304
9305       if (offset.type() == RegType::sgpr) {
9306          for (unsigned i = 0; i < offset.size(); i++) {
9307             if (const_offset[i])
9308                continue;
9309
9310             acc = emit_extract_vector(ctx, offset, i, s1);
9311             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9312                            Operand::c32(0x3Fu));
9313
9314             if (i) {
9315                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9316                               Operand::c32(8u * i));
9317             }
9318
9319             if (pack == Temp()) {
9320                pack = acc;
9321             } else {
9322                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9323             }
9324          }
9325
9326          if (pack_const && pack != Temp())
9327             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9328                             Operand::c32(pack_const), pack);
9329       } else {
9330          for (unsigned i = 0; i < offset.size(); i++) {
9331             if (const_offset[i])
9332                continue;
9333
9334             acc = emit_extract_vector(ctx, offset, i, v1);
9335             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9336
9337             if (i) {
9338                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9339             }
9340
9341             if (pack == Temp()) {
9342                pack = acc;
9343             } else {
9344                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9345             }
9346          }
9347
9348          if (pack_const && pack != Temp())
9349             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9350       }
9351       if (pack_const && pack == Temp())
9352          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9353       else if (pack == Temp())
9354          has_offset = false;
9355       else
9356          offset = pack;
9357    }
9358
9359    std::vector<Temp> unpacked_coord;
9360    if (coord != Temp())
9361       unpacked_coord.push_back(coord);
9362    if (has_sample_index)
9363       unpacked_coord.push_back(sample_index);
9364    if (has_lod)
9365       unpacked_coord.push_back(lod);
9366    if (has_clamped_lod)
9367       unpacked_coord.push_back(clamped_lod);
9368
9369    coords = emit_pack_v1(ctx, unpacked_coord);
9370
9371    /* pack derivatives */
9372    if (has_ddx || has_ddy) {
9373       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9374       std::array<Temp, 2> ddxddy = {ddx, ddy};
9375       for (Temp tmp : ddxddy) {
9376          if (tmp == Temp())
9377             continue;
9378          std::vector<Temp> unpacked = {tmp};
9379          for (Temp derv : emit_pack_v1(ctx, unpacked))
9380             derivs.push_back(derv);
9381       }
9382       has_derivs = true;
9383    }
9384
9385    unsigned dim = 0;
9386    bool da = false;
9387    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9388       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9389       da = should_declare_array((ac_image_dim)dim);
9390    }
9391
9392    /* Build tex instruction */
9393    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9394    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9395       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9396    if (instr->is_sparse)
9397       dmask = MAX2(dmask, 1) | 0x10;
9398    bool d16 = instr->def.bit_size == 16;
9399    Temp dst = get_ssa_temp(ctx, &instr->def);
9400    Temp tmp_dst = dst;
9401
9402    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9403    if (instr->op == nir_texop_tg4) {
9404       assert(instr->def.num_components == (4 + instr->is_sparse));
9405       if (instr->is_shadow)
9406          dmask = 1;
9407       else
9408          dmask = 1 << instr->component;
9409       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9410          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9411    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9412       tmp_dst = bld.tmp(v1);
9413    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9414       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9415       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9416    }
9417
9418    Temp tg4_compare_cube_wa64 = Temp();
9419
9420    if (tg4_integer_workarounds) {
9421       Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9422       Temp size = bld.tmp(v2);
9423       MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9424                                         Operand(s4), std::vector<Temp>{tg4_lod});
9425       tex->dim = dim;
9426       tex->dmask = 0x3;
9427       tex->da = da;
9428       emit_split_vector(ctx, size, size.size());
9429
9430       Temp half_texel[2];
9431       for (unsigned i = 0; i < 2; i++) {
9432          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9433          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9434          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9435          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9436                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9437       }
9438
9439       if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9440          /* In vulkan, whether the sampler uses unnormalized
9441           * coordinates or not is a dynamic property of the
9442           * sampler. Hence, to figure out whether or not we
9443           * need to divide by the texture size, we need to test
9444           * the sampler at runtime. This tests the bit set by
9445           * radv_init_sampler().
9446           */
9447          unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9448          Temp not_needed =
9449             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9450
9451          not_needed = bool_to_vector_condition(ctx, not_needed);
9452          half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9453                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9454          half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9455                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9456       }
9457
9458       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9459                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9460
9461       if (tg4_integer_cube_workaround) {
9462          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9463          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9464          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9465             aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9466          split->operands[0] = Operand(resource);
9467          for (unsigned i = 0; i < resource.size(); i++) {
9468             desc[i] = bld.tmp(s1);
9469             split->definitions[i] = Definition(desc[i]);
9470          }
9471          ctx->block->instructions.emplace_back(std::move(split));
9472
9473          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9474                               Operand::c32(20u | (6u << 16)));
9475          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9476                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9477
9478          Temp nfmt;
9479          if (instr->dest_type & nir_type_uint) {
9480             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9481                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9482                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9483          } else {
9484             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9485                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9486                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9487          }
9488          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9489          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9490
9491          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9492                          Operand::c32(26u));
9493
9494          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9495                             Operand::c32(C_008F14_NUM_FORMAT));
9496          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9497
9498          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9499             aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9500          for (unsigned i = 0; i < resource.size(); i++)
9501             vec->operands[i] = Operand(desc[i]);
9502          resource = bld.tmp(resource.regClass());
9503          vec->definitions[0] = Definition(resource);
9504          ctx->block->instructions.emplace_back(std::move(vec));
9505
9506          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9507                                   tg4_compare_cube_wa64);
9508          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9509                                   tg4_compare_cube_wa64);
9510       }
9511       coords[0] = new_coords[0];
9512       coords[1] = new_coords[1];
9513    }
9514
9515    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9516       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9517       // ac_build_buffer_load_format_gfx9_safe()
9518
9519       assert(coords.size() == 1);
9520       aco_opcode op;
9521       if (d16) {
9522          switch (util_last_bit(dmask & 0xf)) {
9523          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9524          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9525          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9526          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9527          default: unreachable("Tex instruction loads more than 4 components.");
9528          }
9529       } else {
9530          switch (util_last_bit(dmask & 0xf)) {
9531          case 1: op = aco_opcode::buffer_load_format_x; break;
9532          case 2: op = aco_opcode::buffer_load_format_xy; break;
9533          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9534          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9535          default: unreachable("Tex instruction loads more than 4 components.");
9536          }
9537       }
9538
9539       aco_ptr<MUBUF_instruction> mubuf{
9540          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9541       mubuf->operands[0] = Operand(resource);
9542       mubuf->operands[1] = Operand(coords[0]);
9543       mubuf->operands[2] = Operand::c32(0);
9544       mubuf->definitions[0] = Definition(tmp_dst);
9545       mubuf->idxen = true;
9546       mubuf->tfe = instr->is_sparse;
9547       if (mubuf->tfe)
9548          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9549       ctx->block->instructions.emplace_back(std::move(mubuf));
9550
9551       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9552       return;
9553    }
9554
9555    /* gather MIMG address components */
9556    std::vector<Temp> args;
9557    if (has_wqm_coord) {
9558       args.emplace_back(wqm_coord);
9559       if (!(ctx->block->kind & block_kind_top_level))
9560          ctx->unended_linear_vgprs.push_back(wqm_coord);
9561    }
9562    if (has_offset)
9563       args.emplace_back(offset);
9564    if (has_bias)
9565       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9566    if (has_compare)
9567       args.emplace_back(compare);
9568    if (has_derivs)
9569       args.insert(args.end(), derivs.begin(), derivs.end());
9570
9571    args.insert(args.end(), coords.begin(), coords.end());
9572
9573    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9574        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9575       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9576                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9577                          ? aco_opcode::image_load
9578                          : aco_opcode::image_load_mip;
9579       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9580       MIMG_instruction* tex =
9581          emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, false, vdata);
9582       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9583          tex->dim = da ? ac_image_2darray : ac_image_2d;
9584       else
9585          tex->dim = dim;
9586       tex->dmask = dmask & 0xf;
9587       tex->unrm = true;
9588       tex->da = da;
9589       tex->tfe = instr->is_sparse;
9590       tex->d16 = d16;
9591       tex->a16 = a16;
9592
9593       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9594          /* Use 0x76543210 if the image doesn't have FMASK. */
9595          assert(dmask == 1 && dst.bytes() == 4);
9596          assert(dst.id() != tmp_dst.id());
9597
9598          if (dst.regClass() == s1) {
9599             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9600                                         emit_extract_vector(ctx, resource, 1, s1));
9601             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9602                      Operand::c32(0x76543210), bld.scc(is_not_null));
9603          } else {
9604             Temp is_not_null = bld.tmp(bld.lm);
9605             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9606                          emit_extract_vector(ctx, resource, 1, s1));
9607             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9608                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9609          }
9610       } else {
9611          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9612       }
9613       return;
9614    }
9615
9616    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9617
9618    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9619    aco_opcode opcode = aco_opcode::image_sample;
9620    if (has_offset) { /* image_sample_*_o */
9621       if (has_clamped_lod) {
9622          if (has_compare) {
9623             opcode = aco_opcode::image_sample_c_cl_o;
9624             if (separate_g16)
9625                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9626             else if (has_derivs)
9627                opcode = aco_opcode::image_sample_c_d_cl_o;
9628             if (has_bias)
9629                opcode = aco_opcode::image_sample_c_b_cl_o;
9630          } else {
9631             opcode = aco_opcode::image_sample_cl_o;
9632             if (separate_g16)
9633                opcode = aco_opcode::image_sample_d_cl_o_g16;
9634             else if (has_derivs)
9635                opcode = aco_opcode::image_sample_d_cl_o;
9636             if (has_bias)
9637                opcode = aco_opcode::image_sample_b_cl_o;
9638          }
9639       } else if (has_compare) {
9640          opcode = aco_opcode::image_sample_c_o;
9641          if (separate_g16)
9642             opcode = aco_opcode::image_sample_c_d_o_g16;
9643          else if (has_derivs)
9644             opcode = aco_opcode::image_sample_c_d_o;
9645          if (has_bias)
9646             opcode = aco_opcode::image_sample_c_b_o;
9647          if (level_zero)
9648             opcode = aco_opcode::image_sample_c_lz_o;
9649          if (has_lod)
9650             opcode = aco_opcode::image_sample_c_l_o;
9651       } else {
9652          opcode = aco_opcode::image_sample_o;
9653          if (separate_g16)
9654             opcode = aco_opcode::image_sample_d_o_g16;
9655          else if (has_derivs)
9656             opcode = aco_opcode::image_sample_d_o;
9657          if (has_bias)
9658             opcode = aco_opcode::image_sample_b_o;
9659          if (level_zero)
9660             opcode = aco_opcode::image_sample_lz_o;
9661          if (has_lod)
9662             opcode = aco_opcode::image_sample_l_o;
9663       }
9664    } else if (has_clamped_lod) { /* image_sample_*_cl */
9665       if (has_compare) {
9666          opcode = aco_opcode::image_sample_c_cl;
9667          if (separate_g16)
9668             opcode = aco_opcode::image_sample_c_d_cl_g16;
9669          else if (has_derivs)
9670             opcode = aco_opcode::image_sample_c_d_cl;
9671          if (has_bias)
9672             opcode = aco_opcode::image_sample_c_b_cl;
9673       } else {
9674          opcode = aco_opcode::image_sample_cl;
9675          if (separate_g16)
9676             opcode = aco_opcode::image_sample_d_cl_g16;
9677          else if (has_derivs)
9678             opcode = aco_opcode::image_sample_d_cl;
9679          if (has_bias)
9680             opcode = aco_opcode::image_sample_b_cl;
9681       }
9682    } else { /* no offset */
9683       if (has_compare) {
9684          opcode = aco_opcode::image_sample_c;
9685          if (separate_g16)
9686             opcode = aco_opcode::image_sample_c_d_g16;
9687          else if (has_derivs)
9688             opcode = aco_opcode::image_sample_c_d;
9689          if (has_bias)
9690             opcode = aco_opcode::image_sample_c_b;
9691          if (level_zero)
9692             opcode = aco_opcode::image_sample_c_lz;
9693          if (has_lod)
9694             opcode = aco_opcode::image_sample_c_l;
9695       } else {
9696          opcode = aco_opcode::image_sample;
9697          if (separate_g16)
9698             opcode = aco_opcode::image_sample_d_g16;
9699          else if (has_derivs)
9700             opcode = aco_opcode::image_sample_d;
9701          if (has_bias)
9702             opcode = aco_opcode::image_sample_b;
9703          if (level_zero)
9704             opcode = aco_opcode::image_sample_lz;
9705          if (has_lod)
9706             opcode = aco_opcode::image_sample_l;
9707       }
9708    }
9709
9710    if (instr->op == nir_texop_tg4) {
9711       /* GFX11 supports implicit LOD, but the extension is unsupported. */
9712       assert(level_zero || ctx->options->gfx_level < GFX11);
9713
9714       if (has_offset) { /* image_gather4_*_o */
9715          if (has_compare) {
9716             opcode = aco_opcode::image_gather4_c_o;
9717             if (level_zero)
9718                opcode = aco_opcode::image_gather4_c_lz_o;
9719             if (has_lod)
9720                opcode = aco_opcode::image_gather4_c_l_o;
9721             if (has_bias)
9722                opcode = aco_opcode::image_gather4_c_b_o;
9723          } else {
9724             opcode = aco_opcode::image_gather4_o;
9725             if (level_zero)
9726                opcode = aco_opcode::image_gather4_lz_o;
9727             if (has_lod)
9728                opcode = aco_opcode::image_gather4_l_o;
9729             if (has_bias)
9730                opcode = aco_opcode::image_gather4_b_o;
9731          }
9732       } else {
9733          if (has_compare) {
9734             opcode = aco_opcode::image_gather4_c;
9735             if (level_zero)
9736                opcode = aco_opcode::image_gather4_c_lz;
9737             if (has_lod)
9738                opcode = aco_opcode::image_gather4_c_l;
9739             if (has_bias)
9740                opcode = aco_opcode::image_gather4_c_b;
9741          } else {
9742             opcode = aco_opcode::image_gather4;
9743             if (level_zero)
9744                opcode = aco_opcode::image_gather4_lz;
9745             if (has_lod)
9746                opcode = aco_opcode::image_gather4_l;
9747             if (has_bias)
9748                opcode = aco_opcode::image_gather4_b;
9749          }
9750       }
9751    } else if (instr->op == nir_texop_lod) {
9752       opcode = aco_opcode::image_get_lod;
9753    }
9754
9755    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9756                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9757                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9758
9759    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9760    MIMG_instruction* tex =
9761       emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, implicit_derivs, vdata);
9762    tex->dim = dim;
9763    tex->dmask = dmask & 0xf;
9764    tex->da = da;
9765    tex->tfe = instr->is_sparse;
9766    tex->d16 = d16;
9767    tex->a16 = a16;
9768
9769    if (tg4_integer_cube_workaround) {
9770       assert(tmp_dst.id() != dst.id());
9771       assert(tmp_dst.size() == dst.size());
9772
9773       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9774       Temp val[4];
9775       for (unsigned i = 0; i < 4; i++) {
9776          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9777          Temp cvt_val;
9778          if (instr->dest_type & nir_type_uint)
9779             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9780          else
9781             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9782          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9783                            tg4_compare_cube_wa64);
9784       }
9785
9786       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9787       if (instr->is_sparse)
9788          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9789                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9790       else
9791          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9792                               val[3]);
9793    }
9794    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9795    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9796 }
9797
9798 Operand
9799 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
9800 {
9801    Temp tmp = get_ssa_temp(ctx, ssa);
9802    if (ssa->parent_instr->type == nir_instr_type_undef) {
9803       return Operand(rc);
9804    } else if (logical && ssa->bit_size == 1 &&
9805               ssa->parent_instr->type == nir_instr_type_load_const) {
9806       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9807       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9808    } else {
9809       return Operand(tmp);
9810    }
9811 }
9812
9813 void
9814 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9815 {
9816    aco_ptr<Pseudo_instruction> phi;
9817    Temp dst = get_ssa_temp(ctx, &instr->def);
9818    assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9819
9820    bool logical = !dst.is_linear() || instr->def.divergent;
9821    logical |= (ctx->block->kind & block_kind_merge) != 0;
9822    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9823
9824    /* we want a sorted list of sources, since the predecessor list is also sorted */
9825    std::map<unsigned, nir_def*> phi_src;
9826    nir_foreach_phi_src (src, instr)
9827       phi_src[src->pred->index] = src->src.ssa;
9828
9829    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9830    unsigned num_operands = 0;
9831    Operand* const operands = (Operand*)alloca(
9832       (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
9833    unsigned num_defined = 0;
9834    unsigned cur_pred_idx = 0;
9835    for (std::pair<unsigned, nir_def*> src : phi_src) {
9836       if (cur_pred_idx < preds.size()) {
9837          /* handle missing preds (IF merges with discard/break) and extra preds
9838           * (loop exit with discard) */
9839          unsigned block = ctx->cf_info.nir_to_aco[src.first];
9840          unsigned skipped = 0;
9841          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9842             skipped++;
9843          if (cur_pred_idx + skipped < preds.size()) {
9844             for (unsigned i = 0; i < skipped; i++)
9845                operands[num_operands++] = Operand(dst.regClass());
9846             cur_pred_idx += skipped;
9847          } else {
9848             continue;
9849          }
9850       }
9851       /* Handle missing predecessors at the end. This shouldn't happen with loop
9852        * headers and we can't ignore these sources for loop header phis. */
9853       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9854          continue;
9855       cur_pred_idx++;
9856       Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9857       operands[num_operands++] = op;
9858       num_defined += !op.isUndefined();
9859    }
9860    /* handle block_kind_continue_or_break at loop exit blocks */
9861    while (cur_pred_idx++ < preds.size())
9862       operands[num_operands++] = Operand(dst.regClass());
9863
9864    /* If the loop ends with a break, still add a linear continue edge in case
9865     * that break is divergent or continue_or_break is used. We'll either remove
9866     * this operand later in visit_loop() if it's not necessary or replace the
9867     * undef with something correct. */
9868    if (!logical && ctx->block->kind & block_kind_loop_header) {
9869       nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9870       nir_block* last = nir_loop_last_block(loop);
9871       if (last->successors[0] != instr->instr.block)
9872          operands[num_operands++] = Operand(RegClass());
9873    }
9874
9875    /* we can use a linear phi in some cases if one src is undef */
9876    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9877       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9878                                                        num_operands, 1));
9879
9880       Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9881       Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9882       assert(invert->kind & block_kind_invert);
9883
9884       unsigned then_block = invert->linear_preds[0];
9885
9886       Block* insert_block = NULL;
9887       for (unsigned i = 0; i < num_operands; i++) {
9888          Operand op = operands[i];
9889          if (op.isUndefined())
9890             continue;
9891          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9892          phi->operands[0] = op;
9893          break;
9894       }
9895       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9896       phi->operands[1] = Operand(dst.regClass());
9897       phi->definitions[0] = Definition(dst);
9898       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9899       return;
9900    }
9901
9902    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9903    for (unsigned i = 0; i < num_operands; i++)
9904       phi->operands[i] = operands[i];
9905    phi->definitions[0] = Definition(dst);
9906    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9907 }
9908
9909 void
9910 visit_undef(isel_context* ctx, nir_undef_instr* instr)
9911 {
9912    Temp dst = get_ssa_temp(ctx, &instr->def);
9913
9914    assert(dst.type() == RegType::sgpr);
9915
9916    if (dst.size() == 1) {
9917       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9918    } else {
9919       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9920          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9921       for (unsigned i = 0; i < dst.size(); i++)
9922          vec->operands[i] = Operand::zero();
9923       vec->definitions[0] = Definition(dst);
9924       ctx->block->instructions.emplace_back(std::move(vec));
9925    }
9926 }
9927
9928 void
9929 begin_loop(isel_context* ctx, loop_context* lc)
9930 {
9931    // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9932    append_logical_end(ctx->block);
9933    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9934    Builder bld(ctx->program, ctx->block);
9935    bld.branch(aco_opcode::p_branch, bld.def(s2));
9936    unsigned loop_preheader_idx = ctx->block->index;
9937
9938    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9939
9940    ctx->program->next_loop_depth++;
9941
9942    Block* loop_header = ctx->program->create_and_insert_block();
9943    loop_header->kind |= block_kind_loop_header;
9944    add_edge(loop_preheader_idx, loop_header);
9945    ctx->block = loop_header;
9946
9947    append_logical_start(ctx->block);
9948
9949    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9950    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9951    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9952    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9953    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9954 }
9955
9956 void
9957 end_loop(isel_context* ctx, loop_context* lc)
9958 {
9959    // TODO: what if a loop ends with a unconditional or uniformly branched continue
9960    //       and this branch is never taken?
9961    if (!ctx->cf_info.has_branch) {
9962       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9963       Builder bld(ctx->program, ctx->block);
9964       append_logical_end(ctx->block);
9965
9966       if (ctx->cf_info.exec_potentially_empty_discard ||
9967           ctx->cf_info.exec_potentially_empty_break) {
9968          /* Discards can result in code running with an empty exec mask.
9969           * This would result in divergent breaks not ever being taken. As a
9970           * workaround, break the loop when the loop mask is empty instead of
9971           * always continuing. */
9972          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9973          unsigned block_idx = ctx->block->index;
9974
9975          /* create helper blocks to avoid critical edges */
9976          Block* break_block = ctx->program->create_and_insert_block();
9977          break_block->kind = block_kind_uniform;
9978          bld.reset(break_block);
9979          bld.branch(aco_opcode::p_branch, bld.def(s2));
9980          add_linear_edge(block_idx, break_block);
9981          add_linear_edge(break_block->index, &lc->loop_exit);
9982
9983          Block* continue_block = ctx->program->create_and_insert_block();
9984          continue_block->kind = block_kind_uniform;
9985          bld.reset(continue_block);
9986          bld.branch(aco_opcode::p_branch, bld.def(s2));
9987          add_linear_edge(block_idx, continue_block);
9988          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9989
9990          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9991             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9992          ctx->block = &ctx->program->blocks[block_idx];
9993       } else {
9994          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9995          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9996             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9997          else
9998             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9999       }
10000
10001       bld.reset(ctx->block);
10002       bld.branch(aco_opcode::p_branch, bld.def(s2));
10003    }
10004
10005    ctx->cf_info.has_branch = false;
10006    ctx->program->next_loop_depth--;
10007
10008    // TODO: if the loop has not a single exit, we must add one °°
10009    /* emit loop successor block */
10010    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10011    append_logical_start(ctx->block);
10012
10013 #if 0
10014    // TODO: check if it is beneficial to not branch on continues
10015    /* trim linear phis in loop header */
10016    for (auto&& instr : loop_entry->instructions) {
10017       if (instr->opcode == aco_opcode::p_linear_phi) {
10018          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10019          new_phi->definitions[0] = instr->definitions[0];
10020          for (unsigned i = 0; i < new_phi->operands.size(); i++)
10021             new_phi->operands[i] = instr->operands[i];
10022          /* check that the remaining operands are all the same */
10023          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10024             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10025          instr.swap(new_phi);
10026       } else if (instr->opcode == aco_opcode::p_phi) {
10027          continue;
10028       } else {
10029          break;
10030       }
10031    }
10032 #endif
10033
10034    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10035    ctx->cf_info.parent_loop.exit = lc->exit_old;
10036    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10037    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10038    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10039    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10040       ctx->cf_info.exec_potentially_empty_discard = false;
10041 }
10042
10043 void
10044 emit_loop_jump(isel_context* ctx, bool is_break)
10045 {
10046    Builder bld(ctx->program, ctx->block);
10047    Block* logical_target;
10048    append_logical_end(ctx->block);
10049    unsigned idx = ctx->block->index;
10050
10051    if (is_break) {
10052       logical_target = ctx->cf_info.parent_loop.exit;
10053       add_logical_edge(idx, logical_target);
10054       ctx->block->kind |= block_kind_break;
10055
10056       if (!ctx->cf_info.parent_if.is_divergent &&
10057           !ctx->cf_info.parent_loop.has_divergent_continue) {
10058          /* uniform break - directly jump out of the loop */
10059          ctx->block->kind |= block_kind_uniform;
10060          ctx->cf_info.has_branch = true;
10061          bld.branch(aco_opcode::p_branch, bld.def(s2));
10062          add_linear_edge(idx, logical_target);
10063          return;
10064       }
10065       ctx->cf_info.parent_loop.has_divergent_branch = true;
10066    } else {
10067       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10068       add_logical_edge(idx, logical_target);
10069       ctx->block->kind |= block_kind_continue;
10070
10071       if (!ctx->cf_info.parent_if.is_divergent) {
10072          /* uniform continue - directly jump to the loop header */
10073          ctx->block->kind |= block_kind_uniform;
10074          ctx->cf_info.has_branch = true;
10075          bld.branch(aco_opcode::p_branch, bld.def(s2));
10076          add_linear_edge(idx, logical_target);
10077          return;
10078       }
10079
10080       /* for potential uniform breaks after this continue,
10081          we must ensure that they are handled correctly */
10082       ctx->cf_info.parent_loop.has_divergent_continue = true;
10083       ctx->cf_info.parent_loop.has_divergent_branch = true;
10084    }
10085
10086    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10087       ctx->cf_info.exec_potentially_empty_break = true;
10088       ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10089    }
10090
10091    /* remove critical edges from linear CFG */
10092    bld.branch(aco_opcode::p_branch, bld.def(s2));
10093    Block* break_block = ctx->program->create_and_insert_block();
10094    break_block->kind |= block_kind_uniform;
10095    add_linear_edge(idx, break_block);
10096    /* the loop_header pointer might be invalidated by this point */
10097    if (!is_break)
10098       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10099    add_linear_edge(break_block->index, logical_target);
10100    bld.reset(break_block);
10101    bld.branch(aco_opcode::p_branch, bld.def(s2));
10102
10103    Block* continue_block = ctx->program->create_and_insert_block();
10104    add_linear_edge(idx, continue_block);
10105    append_logical_start(continue_block);
10106    ctx->block = continue_block;
10107 }
10108
10109 void
10110 emit_loop_break(isel_context* ctx)
10111 {
10112    emit_loop_jump(ctx, true);
10113 }
10114
10115 void
10116 emit_loop_continue(isel_context* ctx)
10117 {
10118    emit_loop_jump(ctx, false);
10119 }
10120
10121 void
10122 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10123 {
10124    /* visit_block() would usually do this but divergent jumps updates ctx->block */
10125    ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10126
10127    switch (instr->type) {
10128    case nir_jump_break: emit_loop_break(ctx); break;
10129    case nir_jump_continue: emit_loop_continue(ctx); break;
10130    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10131    }
10132 }
10133
10134 void
10135 visit_block(isel_context* ctx, nir_block* block)
10136 {
10137    if (ctx->block->kind & block_kind_top_level) {
10138       Builder bld(ctx->program, ctx->block);
10139       for (Temp tmp : ctx->unended_linear_vgprs)
10140          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10141       ctx->unended_linear_vgprs.clear();
10142    }
10143
10144    ctx->block->instructions.reserve(ctx->block->instructions.size() +
10145                                     exec_list_length(&block->instr_list) * 2);
10146    nir_foreach_instr (instr, block) {
10147       switch (instr->type) {
10148       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10149       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10150       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10151       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10152       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10153       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10154       case nir_instr_type_deref: break;
10155       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10156       default: isel_err(instr, "Unknown NIR instr type");
10157       }
10158    }
10159
10160    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10161       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10162 }
10163
10164 static Operand
10165 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10166                      aco_ptr<Instruction>& header_phi, Operand* vals)
10167 {
10168    vals[0] = Operand(header_phi->definitions[0].getTemp());
10169    RegClass rc = vals[0].regClass();
10170
10171    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10172
10173    unsigned next_pred = 1;
10174
10175    for (unsigned idx = first + 1; idx <= last; idx++) {
10176       Block& block = ctx->program->blocks[idx];
10177       if (block.loop_nest_depth != loop_nest_depth) {
10178          vals[idx - first] = vals[idx - 1 - first];
10179          continue;
10180       }
10181
10182       if ((block.kind & block_kind_continue) && block.index != last) {
10183          vals[idx - first] = header_phi->operands[next_pred];
10184          next_pred++;
10185          continue;
10186       }
10187
10188       bool all_same = true;
10189       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10190          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10191
10192       Operand val;
10193       if (all_same) {
10194          val = vals[block.linear_preds[0] - first];
10195       } else {
10196          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10197             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10198          for (unsigned i = 0; i < block.linear_preds.size(); i++)
10199             phi->operands[i] = vals[block.linear_preds[i] - first];
10200          val = Operand(ctx->program->allocateTmp(rc));
10201          phi->definitions[0] = Definition(val.getTemp());
10202          block.instructions.emplace(block.instructions.begin(), std::move(phi));
10203       }
10204       vals[idx - first] = val;
10205    }
10206
10207    return vals[last - first];
10208 }
10209
10210 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10211 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10212 static void end_uniform_if(isel_context* ctx, if_context* ic);
10213
10214 static void
10215 visit_loop(isel_context* ctx, nir_loop* loop)
10216 {
10217    assert(!nir_loop_has_continue_construct(loop));
10218    loop_context lc;
10219    begin_loop(ctx, &lc);
10220
10221    bool unreachable = visit_cf_list(ctx, &loop->body);
10222
10223    unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10224
10225    /* Fixup phis in loop header from unreachable blocks.
10226     * has_branch/has_divergent_branch also indicates if the loop ends with a
10227     * break/continue instruction, but we don't emit those if unreachable=true */
10228    if (unreachable) {
10229       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10230       bool linear = ctx->cf_info.has_branch;
10231       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10232       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10233          if ((logical && instr->opcode == aco_opcode::p_phi) ||
10234              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10235             /* the last operand should be the one that needs to be removed */
10236             instr->operands.pop_back();
10237          } else if (!is_phi(instr)) {
10238             break;
10239          }
10240       }
10241    }
10242
10243    /* Fixup linear phis in loop header from expecting a continue. Both this fixup
10244     * and the previous one shouldn't both happen at once because a break in the
10245     * merge block would get CSE'd */
10246    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10247       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10248       Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10249       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10250          if (instr->opcode == aco_opcode::p_linear_phi) {
10251             if (ctx->cf_info.has_branch)
10252                instr->operands.pop_back();
10253             else
10254                instr->operands.back() =
10255                   create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10256          } else if (!is_phi(instr)) {
10257             break;
10258          }
10259       }
10260    }
10261
10262    /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10263     * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10264     */
10265    if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10266       Builder bld(ctx->program, ctx->block);
10267       Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10268       if_context ic;
10269       begin_uniform_if_then(ctx, &ic, cond);
10270       emit_loop_break(ctx);
10271       begin_uniform_if_else(ctx, &ic);
10272       end_uniform_if(ctx, &ic);
10273    }
10274
10275    end_loop(ctx, &lc);
10276 }
10277
10278 static void
10279 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10280                         nir_selection_control sel_ctrl = nir_selection_control_none)
10281 {
10282    ic->cond = cond;
10283
10284    append_logical_end(ctx->block);
10285    ctx->block->kind |= block_kind_branch;
10286
10287    /* branch to linear then block */
10288    assert(cond.regClass() == ctx->program->lane_mask);
10289    aco_ptr<Pseudo_branch_instruction> branch;
10290    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10291                                                               Format::PSEUDO_BRANCH, 1, 1));
10292    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10293    branch->operands[0] = Operand(cond);
10294    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10295                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10296    ctx->block->instructions.push_back(std::move(branch));
10297
10298    ic->BB_if_idx = ctx->block->index;
10299    ic->BB_invert = Block();
10300    /* Invert blocks are intentionally not marked as top level because they
10301     * are not part of the logical cfg. */
10302    ic->BB_invert.kind |= block_kind_invert;
10303    ic->BB_endif = Block();
10304    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10305
10306    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10307    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10308    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10309    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10310    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10311    ctx->cf_info.parent_if.is_divergent = true;
10312
10313    /* divergent branches use cbranch_execz */
10314    ctx->cf_info.exec_potentially_empty_discard = false;
10315    ctx->cf_info.exec_potentially_empty_break = false;
10316    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10317
10318    /** emit logical then block */
10319    ctx->program->next_divergent_if_logical_depth++;
10320    Block* BB_then_logical = ctx->program->create_and_insert_block();
10321    add_edge(ic->BB_if_idx, BB_then_logical);
10322    ctx->block = BB_then_logical;
10323    append_logical_start(BB_then_logical);
10324 }
10325
10326 static void
10327 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10328                         nir_selection_control sel_ctrl = nir_selection_control_none)
10329 {
10330    Block* BB_then_logical = ctx->block;
10331    append_logical_end(BB_then_logical);
10332    /* branch from logical then block to invert block */
10333    aco_ptr<Pseudo_branch_instruction> branch;
10334    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10335                                                               Format::PSEUDO_BRANCH, 0, 1));
10336    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10337    BB_then_logical->instructions.emplace_back(std::move(branch));
10338    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10339    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10340       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10341    BB_then_logical->kind |= block_kind_uniform;
10342    assert(!ctx->cf_info.has_branch);
10343    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10344    ctx->cf_info.parent_loop.has_divergent_branch = false;
10345    ctx->program->next_divergent_if_logical_depth--;
10346
10347    /** emit linear then block */
10348    Block* BB_then_linear = ctx->program->create_and_insert_block();
10349    BB_then_linear->kind |= block_kind_uniform;
10350    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10351    /* branch from linear then block to invert block */
10352    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10353                                                               Format::PSEUDO_BRANCH, 0, 1));
10354    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10355    BB_then_linear->instructions.emplace_back(std::move(branch));
10356    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10357
10358    /** emit invert merge block */
10359    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10360    ic->invert_idx = ctx->block->index;
10361
10362    /* branch to linear else block (skip else) */
10363    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10364                                                               Format::PSEUDO_BRANCH, 0, 1));
10365    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10366    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10367                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10368    ctx->block->instructions.push_back(std::move(branch));
10369
10370    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10371    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10372    ic->exec_potentially_empty_break_depth_old = std::min(
10373       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10374    /* divergent branches use cbranch_execz */
10375    ctx->cf_info.exec_potentially_empty_discard = false;
10376    ctx->cf_info.exec_potentially_empty_break = false;
10377    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10378
10379    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10380    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10381
10382    /** emit logical else block */
10383    ctx->program->next_divergent_if_logical_depth++;
10384    Block* BB_else_logical = ctx->program->create_and_insert_block();
10385    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10386    add_linear_edge(ic->invert_idx, BB_else_logical);
10387    ctx->block = BB_else_logical;
10388    append_logical_start(BB_else_logical);
10389 }
10390
10391 static void
10392 end_divergent_if(isel_context* ctx, if_context* ic)
10393 {
10394    Block* BB_else_logical = ctx->block;
10395    append_logical_end(BB_else_logical);
10396
10397    /* branch from logical else block to endif block */
10398    aco_ptr<Pseudo_branch_instruction> branch;
10399    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10400                                                               Format::PSEUDO_BRANCH, 0, 1));
10401    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10402    BB_else_logical->instructions.emplace_back(std::move(branch));
10403    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10404    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10405       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10406    BB_else_logical->kind |= block_kind_uniform;
10407    ctx->program->next_divergent_if_logical_depth--;
10408
10409    assert(!ctx->cf_info.has_branch);
10410    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10411
10412    /** emit linear else block */
10413    Block* BB_else_linear = ctx->program->create_and_insert_block();
10414    BB_else_linear->kind |= block_kind_uniform;
10415    add_linear_edge(ic->invert_idx, BB_else_linear);
10416
10417    /* branch from linear else block to endif block */
10418    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10419                                                               Format::PSEUDO_BRANCH, 0, 1));
10420    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10421    BB_else_linear->instructions.emplace_back(std::move(branch));
10422    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10423
10424    /** emit endif merge block */
10425    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10426    append_logical_start(ctx->block);
10427
10428    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10429    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10430    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10431    ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10432       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10433    if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10434        !ctx->cf_info.parent_if.is_divergent) {
10435       ctx->cf_info.exec_potentially_empty_break = false;
10436       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10437    }
10438    /* uniform control flow never has an empty exec-mask */
10439    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10440       ctx->cf_info.exec_potentially_empty_discard = false;
10441       ctx->cf_info.exec_potentially_empty_break = false;
10442       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10443    }
10444    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10445 }
10446
10447 static void
10448 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10449 {
10450    assert(cond.regClass() == s1);
10451
10452    append_logical_end(ctx->block);
10453    ctx->block->kind |= block_kind_uniform;
10454
10455    aco_ptr<Pseudo_branch_instruction> branch;
10456    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10457    branch.reset(
10458       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10459    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10460    branch->operands[0] = Operand(cond);
10461    branch->operands[0].setFixed(scc);
10462    ctx->block->instructions.emplace_back(std::move(branch));
10463
10464    ic->BB_if_idx = ctx->block->index;
10465    ic->BB_endif = Block();
10466    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10467
10468    ctx->cf_info.has_branch = false;
10469    ctx->cf_info.parent_loop.has_divergent_branch = false;
10470
10471    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10472
10473    /** emit then block */
10474    ctx->program->next_uniform_if_depth++;
10475    Block* BB_then = ctx->program->create_and_insert_block();
10476    add_edge(ic->BB_if_idx, BB_then);
10477    append_logical_start(BB_then);
10478    ctx->block = BB_then;
10479 }
10480
10481 static void
10482 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10483 {
10484    Block* BB_then = ctx->block;
10485
10486    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10487    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10488
10489    if (!ic->uniform_has_then_branch) {
10490       append_logical_end(BB_then);
10491       /* branch from then block to endif block */
10492       aco_ptr<Pseudo_branch_instruction> branch;
10493       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10494                                                                  Format::PSEUDO_BRANCH, 0, 1));
10495       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10496       BB_then->instructions.emplace_back(std::move(branch));
10497       add_linear_edge(BB_then->index, &ic->BB_endif);
10498       if (!ic->then_branch_divergent)
10499          add_logical_edge(BB_then->index, &ic->BB_endif);
10500       BB_then->kind |= block_kind_uniform;
10501    }
10502
10503    ctx->cf_info.has_branch = false;
10504    ctx->cf_info.parent_loop.has_divergent_branch = false;
10505
10506    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10507    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10508
10509    /** emit else block */
10510    Block* BB_else = ctx->program->create_and_insert_block();
10511    add_edge(ic->BB_if_idx, BB_else);
10512    append_logical_start(BB_else);
10513    ctx->block = BB_else;
10514 }
10515
10516 static void
10517 end_uniform_if(isel_context* ctx, if_context* ic)
10518 {
10519    Block* BB_else = ctx->block;
10520
10521    if (!ctx->cf_info.has_branch) {
10522       append_logical_end(BB_else);
10523       /* branch from then block to endif block */
10524       aco_ptr<Pseudo_branch_instruction> branch;
10525       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10526                                                                  Format::PSEUDO_BRANCH, 0, 1));
10527       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10528       BB_else->instructions.emplace_back(std::move(branch));
10529       add_linear_edge(BB_else->index, &ic->BB_endif);
10530       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10531          add_logical_edge(BB_else->index, &ic->BB_endif);
10532       BB_else->kind |= block_kind_uniform;
10533    }
10534
10535    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10536    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10537    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10538
10539    /** emit endif merge block */
10540    ctx->program->next_uniform_if_depth--;
10541    if (!ctx->cf_info.has_branch) {
10542       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10543       append_logical_start(ctx->block);
10544    }
10545 }
10546
10547 static bool
10548 visit_if(isel_context* ctx, nir_if* if_stmt)
10549 {
10550    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10551    Builder bld(ctx->program, ctx->block);
10552    aco_ptr<Pseudo_branch_instruction> branch;
10553    if_context ic;
10554
10555    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10556       /**
10557        * Uniform conditionals are represented in the following way*) :
10558        *
10559        * The linear and logical CFG:
10560        *                        BB_IF
10561        *                        /    \
10562        *       BB_THEN (logical)      BB_ELSE (logical)
10563        *                        \    /
10564        *                        BB_ENDIF
10565        *
10566        * *) Exceptions may be due to break and continue statements within loops
10567        *    If a break/continue happens within uniform control flow, it branches
10568        *    to the loop exit/entry block. Otherwise, it branches to the next
10569        *    merge block.
10570        **/
10571
10572       assert(cond.regClass() == ctx->program->lane_mask);
10573       cond = bool_to_scalar_condition(ctx, cond);
10574
10575       begin_uniform_if_then(ctx, &ic, cond);
10576       visit_cf_list(ctx, &if_stmt->then_list);
10577
10578       begin_uniform_if_else(ctx, &ic);
10579       visit_cf_list(ctx, &if_stmt->else_list);
10580
10581       end_uniform_if(ctx, &ic);
10582    } else { /* non-uniform condition */
10583       /**
10584        * To maintain a logical and linear CFG without critical edges,
10585        * non-uniform conditionals are represented in the following way*) :
10586        *
10587        * The linear CFG:
10588        *                        BB_IF
10589        *                        /    \
10590        *       BB_THEN (logical)      BB_THEN (linear)
10591        *                        \    /
10592        *                        BB_INVERT (linear)
10593        *                        /    \
10594        *       BB_ELSE (logical)      BB_ELSE (linear)
10595        *                        \    /
10596        *                        BB_ENDIF
10597        *
10598        * The logical CFG:
10599        *                        BB_IF
10600        *                        /    \
10601        *       BB_THEN (logical)      BB_ELSE (logical)
10602        *                        \    /
10603        *                        BB_ENDIF
10604        *
10605        * *) Exceptions may be due to break and continue statements within loops
10606        **/
10607
10608       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10609       visit_cf_list(ctx, &if_stmt->then_list);
10610
10611       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10612       visit_cf_list(ctx, &if_stmt->else_list);
10613
10614       end_divergent_if(ctx, &ic);
10615    }
10616
10617    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10618 }
10619
10620 static bool
10621 visit_cf_list(isel_context* ctx, struct exec_list* list)
10622 {
10623    foreach_list_typed (nir_cf_node, node, node, list) {
10624       switch (node->type) {
10625       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10626       case nir_cf_node_if:
10627          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10628             return true;
10629          break;
10630       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10631       default: unreachable("unimplemented cf list type");
10632       }
10633    }
10634    return false;
10635 }
10636
/* Describes one fragment shader color output; export_fs_mrt_color() turns it
 * into the operands and flags of an MRT export.
 */
struct mrt_color_export {
   /* MRT index; added to V_008DFC_SQ_EXP_MRT to form the export target. */
   int slot;
   /* Bit i is set if channel i (values[i]) is written. */
   unsigned write_mask;
   /* One operand per color channel. */
   Operand values[4];
   /* Export format, one of the V_028714_SPI_SHADER_* values. */
   uint8_t col_format;

   /* Fields below are only used for PS epilogs. */
   /* For the 16-bit integer formats: clamp channels to the 8-bit range
    * (0..255 unsigned, -128..127 signed) before packing. */
   bool is_int8;
   /* For the 16-bit integer formats: clamp to the 10-bit range, with the
    * narrower limits used for channel 3 (0..3 unsigned, -2..1 signed). */
   bool is_int10;
   /* Replace NaN by zero for non-16-bit values in the 32-bit and FP16
    * formats. */
   bool enable_mrt_output_nan_fixup;
};
10648
10649 static void
10650 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10651 {
10652    Builder bld(ctx->program, ctx->block);
10653
10654    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10655            mrt->enabled_channels, mrt->target, mrt->compr);
10656
10657    ctx->program->has_color_exports = true;
10658 }
10659
10660 static bool
10661 export_fs_mrt_color(isel_context* ctx, const struct mrt_color_export* out,
10662                     struct aco_export_mrt* mrt)
10663 {
10664    Builder bld(ctx->program, ctx->block);
10665    Operand values[4];
10666
10667    for (unsigned i = 0; i < 4; ++i) {
10668       values[i] = out->values[i];
10669    }
10670
10671    unsigned target;
10672    unsigned enabled_channels = 0;
10673    aco_opcode compr_op = aco_opcode::num_opcodes;
10674    bool compr = false;
10675    bool is_16bit = values[0].regClass() == v2b;
10676
10677    target = V_008DFC_SQ_EXP_MRT + out->slot;
10678
10679    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10680    if (out->enable_mrt_output_nan_fixup && !is_16bit &&
10681        (out->col_format == V_028714_SPI_SHADER_32_R ||
10682         out->col_format == V_028714_SPI_SHADER_32_GR ||
10683         out->col_format == V_028714_SPI_SHADER_32_AR ||
10684         out->col_format == V_028714_SPI_SHADER_32_ABGR ||
10685         out->col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10686       u_foreach_bit (i, out->write_mask) {
10687          Temp is_not_nan =
10688             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10689          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10690                               is_not_nan);
10691       }
10692    }
10693
10694    switch (out->col_format) {
10695    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10696
10697    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10698
10699    case V_028714_SPI_SHADER_32_AR:
10700       if (ctx->options->gfx_level >= GFX10) {
10701          /* Special case: on GFX10, the outputs are different for 32_AR */
10702          enabled_channels = 0x3;
10703          values[1] = values[3];
10704          values[3] = Operand(v1);
10705       } else {
10706          enabled_channels = 0x9;
10707       }
10708       break;
10709
10710    case V_028714_SPI_SHADER_FP16_ABGR:
10711       for (int i = 0; i < 2; i++) {
10712          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10713          if (enabled) {
10714             enabled_channels |= 0x3 << (i * 2);
10715             if (is_16bit) {
10716                values[i] =
10717                   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10718                              values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10719                              values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10720             } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10721                values[i] =
10722                   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10723                            values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10724                            values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10725             } else {
10726                values[i] =
10727                   bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10728                            values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10729                            values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10730             }
10731          } else {
10732             values[i] = Operand(v1);
10733          }
10734       }
10735       values[2] = Operand(v1);
10736       values[3] = Operand(v1);
10737       compr = true;
10738       break;
10739
10740    case V_028714_SPI_SHADER_UNORM16_ABGR:
10741       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10742          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10743       } else {
10744          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10745       }
10746       break;
10747
10748    case V_028714_SPI_SHADER_SNORM16_ABGR:
10749       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10750          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10751       } else {
10752          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10753       }
10754       break;
10755
10756    case V_028714_SPI_SHADER_UINT16_ABGR:
10757       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10758       if (out->is_int8 || out->is_int10) {
10759          /* clamp */
10760          uint32_t max_rgb = out->is_int8 ? 255 : out->is_int10 ? 1023 : 0;
10761
10762          u_foreach_bit (i, out->write_mask) {
10763             uint32_t max = i == 3 && out->is_int10 ? 3 : max_rgb;
10764
10765             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10766          }
10767       } else if (is_16bit) {
10768          u_foreach_bit (i, out->write_mask) {
10769             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10770             values[i] = Operand(tmp);
10771          }
10772       }
10773       break;
10774
10775    case V_028714_SPI_SHADER_SINT16_ABGR:
10776       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10777       if (out->is_int8 || out->is_int10) {
10778          /* clamp */
10779          uint32_t max_rgb = out->is_int8 ? 127 : out->is_int10 ? 511 : 0;
10780          uint32_t min_rgb = out->is_int8 ? -128 : out->is_int10 ? -512 : 0;
10781
10782          u_foreach_bit (i, out->write_mask) {
10783             uint32_t max = i == 3 && out->is_int10 ? 1 : max_rgb;
10784             uint32_t min = i == 3 && out->is_int10 ? -2u : min_rgb;
10785
10786             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10787             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10788          }
10789       } else if (is_16bit) {
10790          u_foreach_bit (i, out->write_mask) {
10791             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10792             values[i] = Operand(tmp);
10793          }
10794       }
10795       break;
10796
10797    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10798
10799    case V_028714_SPI_SHADER_ZERO:
10800    default: return false;
10801    }
10802
10803    if (compr_op != aco_opcode::num_opcodes) {
10804       for (int i = 0; i < 2; i++) {
10805          /* check if at least one of the values to be compressed is enabled */
10806          bool enabled = (out->write_mask >> (i * 2)) & 0x3;
10807          if (enabled) {
10808             enabled_channels |= 0x3 << (i * 2);
10809             values[i] = bld.vop3(
10810                compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10811                values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10812          } else {
10813             values[i] = Operand(v1);
10814          }
10815       }
10816       values[2] = Operand(v1);
10817       values[3] = Operand(v1);
10818       compr = true;
10819    } else if (!compr) {
10820       for (int i = 0; i < 4; i++)
10821          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10822    }
10823
10824    if (ctx->program->gfx_level >= GFX11) {
10825       /* GFX11 doesn't use COMPR for exports, but the channel mask should be
10826        * 0x3 instead.
10827        */
10828       enabled_channels = compr ? 0x3 : enabled_channels;
10829       compr = false;
10830    }
10831
10832    for (unsigned i = 0; i < 4; i++)
10833       mrt->out[i] = values[i];
10834    mrt->target = target;
10835    mrt->enabled_channels = enabled_channels;
10836    mrt->compr = compr;
10837
10838    return true;
10839 }
10840
10841 static void
10842 create_fs_null_export(isel_context* ctx)
10843 {
10844    /* FS must always have exports.
10845     * So when there are none, we need to add a null export.
10846     */
10847
10848    Builder bld(ctx->program, ctx->block);
10849    /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
10850    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
10851    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
10852            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
10853
10854    ctx->program->has_color_exports = true;
10855 }
10856
10857 static void
10858 create_fs_jump_to_epilog(isel_context* ctx)
10859 {
10860    Builder bld(ctx->program, ctx->block);
10861    std::vector<Operand> color_exports;
10862    PhysReg exports_start(256); /* VGPR 0 */
10863
10864    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
10865       unsigned color_index = slot - FRAG_RESULT_DATA0;
10866       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
10867       unsigned write_mask = ctx->outputs.mask[slot];
10868
10869       if (!write_mask)
10870          continue;
10871
10872       PhysReg color_start(exports_start.reg() + color_index * 4);
10873
10874       for (unsigned i = 0; i < 4; i++) {
10875          if (!(write_mask & BITFIELD_BIT(i))) {
10876             color_exports.emplace_back(Operand(v1));
10877             continue;
10878          }
10879
10880          PhysReg chan_reg = color_start.advance(i * 4u);
10881          Operand chan(ctx->outputs.temps[slot * 4u + i]);
10882
10883          if (color_type == ACO_TYPE_FLOAT16) {
10884             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
10885          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
10886             bool sign_ext = color_type == ACO_TYPE_INT16;
10887             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
10888             chan = Operand(tmp);
10889          }
10890
10891          chan.setFixed(chan_reg);
10892          color_exports.emplace_back(chan);
10893       }
10894    }
10895
10896    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc));
10897
10898    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
10899       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + color_exports.size(), 0)};
10900    jump->operands[0] = Operand(continue_pc);
10901    for (unsigned i = 0; i < color_exports.size(); i++) {
10902       jump->operands[i + 1] = color_exports[i];
10903    }
10904    ctx->block->instructions.emplace_back(std::move(jump));
10905 }
10906
10907 PhysReg
10908 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
10909 {
10910    assert(arg.used);
10911    enum ac_arg_regfile file = args->args[arg.arg_index].file;
10912    unsigned reg = args->args[arg.arg_index].offset;
10913    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
10914 }
10915
10916 static Operand
10917 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
10918 {
10919    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
10920 }
10921
10922 static Temp
10923 get_tcs_out_current_patch_data_offset(isel_context* ctx)
10924 {
10925    Builder bld(ctx->program, ctx->block);
10926
10927    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 4u;
10928    const unsigned pervertex_output_patch_size =
10929       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10930    const unsigned output_patch_stride =
10931       pervertex_output_patch_size + ctx->program->info.tcs.num_linked_patch_outputs * 4u;
10932
10933    Temp tcs_rel_ids = get_arg(ctx, ctx->args->tcs_rel_ids);
10934    Temp rel_patch_id =
10935       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand::c32(0u), Operand::c32(8u));
10936    Temp patch_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, output_patch_stride, false);
10937
10938    Temp tcs_offchip_layout = get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout);
10939
10940    Temp patch_control_points = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
10941                                         tcs_offchip_layout, Operand::c32(0x3f));
10942
10943    Temp num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10944                                tcs_offchip_layout, Operand::c32(0x60006));
10945
10946    Temp lshs_vertex_stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10947                                       tcs_offchip_layout, Operand::c32(0x8000c));
10948
10949    Temp input_patch_size =
10950       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), patch_control_points, lshs_vertex_stride);
10951
10952    Temp output_patch0_offset =
10953       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, input_patch_size);
10954
10955    Temp output_patch_offset =
10956       bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10957                      Operand::c32(pervertex_output_patch_size), output_patch0_offset);
10958
10959    return bld.nuw().vadd32(bld.def(v1), patch_offset, output_patch_offset);
10960 }
10961
10962 static Temp
10963 get_patch_base(isel_context* ctx)
10964 {
10965    Builder bld(ctx->program, ctx->block);
10966
10967    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 16u;
10968    const unsigned pervertex_output_patch_size =
10969       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
10970
10971    Temp num_patches =
10972       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10973                get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout), Operand::c32(0x60006));
10974
10975    return bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches,
10976                    Operand::c32(pervertex_output_patch_size));
10977 }
10978
10979 static void
10980 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
10981 {
10982    struct ac_arg arg;
10983    arg.used = true;
10984
10985    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
10986       regs.emplace_back(get_arg_for_end(ctx, arg));
10987 }
10988
10989 static void
10990 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
10991 {
10992    aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
10993       aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
10994
10995    for (unsigned i = 0; i < regs.size(); i++)
10996       end->operands[i] = regs[i];
10997
10998    ctx->block->instructions.emplace_back(std::move(end));
10999 }
11000
11001 static void
11002 create_tcs_jump_to_epilog(isel_context* ctx)
11003 {
11004    Builder bld(ctx->program, ctx->block);
11005
11006    PhysReg vgpr_start(256); /* VGPR 0 */
11007    PhysReg sgpr_start(0);   /* SGPR 0 */
11008
11009    /* SGPRs */
11010    Operand ring_offsets = Operand(get_arg(ctx, ctx->args->ring_offsets));
11011    ring_offsets.setFixed(sgpr_start);
11012
11013    Operand tess_offchip_offset = Operand(get_arg(ctx, ctx->args->tess_offchip_offset));
11014    tess_offchip_offset.setFixed(sgpr_start.advance(8u));
11015
11016    Operand tcs_factor_offset = Operand(get_arg(ctx, ctx->args->tcs_factor_offset));
11017    tcs_factor_offset.setFixed(sgpr_start.advance(12u));
11018
11019    Operand tcs_offchip_layout = Operand(get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11020    tcs_offchip_layout.setFixed(sgpr_start.advance(16u));
11021
11022    Operand patch_base = Operand(get_patch_base(ctx));
11023    patch_base.setFixed(sgpr_start.advance(20u));
11024
11025    /* VGPRs */
11026    Operand tcs_out_current_patch_data_offset = Operand(get_tcs_out_current_patch_data_offset(ctx));
11027    tcs_out_current_patch_data_offset.setFixed(vgpr_start);
11028
11029    Operand invocation_id =
11030       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11031                Operand::c32(8u), Operand::c32(5u));
11032    invocation_id.setFixed(vgpr_start.advance(4u));
11033
11034    Operand rel_patch_id =
11035       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11036                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11037    rel_patch_id.setFixed(vgpr_start.advance(8u));
11038
11039    Temp continue_pc =
11040       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.tcs.epilog_pc));
11041
11042    aco_ptr<Pseudo_instruction> jump{
11043       create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 9, 0)};
11044    jump->operands[0] = Operand(continue_pc);
11045    jump->operands[1] = ring_offsets;
11046    jump->operands[2] = tess_offchip_offset;
11047    jump->operands[3] = tcs_factor_offset;
11048    jump->operands[4] = tcs_offchip_layout;
11049    jump->operands[5] = patch_base;
11050    jump->operands[6] = tcs_out_current_patch_data_offset;
11051    jump->operands[7] = invocation_id;
11052    jump->operands[8] = rel_patch_id;
11053    ctx->block->instructions.emplace_back(std::move(jump));
11054 }
11055
11056 static void
11057 create_tcs_end_for_epilog(isel_context* ctx)
11058 {
11059    std::vector<Operand> regs;
11060
11061    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11062    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tes_offchip_addr));
11063    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tess_offchip_offset));
11064    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tcs_factor_offset));
11065
11066    Builder bld(ctx->program, ctx->block);
11067
11068    /* Leave a hole corresponding to the two input VGPRs. This ensures that
11069     * the invocation_id output does not alias the tcs_rel_ids input,
11070     * which saves a V_MOV on gfx9.
11071     */
11072    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11073
11074    Temp rel_patch_id =
11075       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11076                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11077    regs.emplace_back(Operand(rel_patch_id, PhysReg{vgpr++}));
11078
11079    Temp invocation_id =
11080       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11081                Operand::c32(8u), Operand::c32(5u));
11082    regs.emplace_back(Operand(invocation_id, PhysReg{vgpr++}));
11083
11084    if (ctx->program->info.tcs.pass_tessfactors_by_reg) {
11085       vgpr++; /* skip the tess factor LDS offset */
11086
11087       unsigned slot = VARYING_SLOT_TESS_LEVEL_OUTER;
11088       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11089          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11090       }
11091       vgpr += 4;
11092
11093       slot = VARYING_SLOT_TESS_LEVEL_INNER;
11094       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11095          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11096       }
11097    } else {
11098       Temp patch0_patch_data_offset =
11099          bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11100                   get_arg(ctx, ctx->program->info.tcs.vs_state_bits), Operand::c32(0xe000a));
11101
11102       Temp tf_lds_offset =
11103          bld.v_mul24_imm(bld.def(v1), rel_patch_id, ctx->program->info.tcs.patch_stride);
11104       tf_lds_offset = bld.nuw().vadd32(bld.def(v1), tf_lds_offset, patch0_patch_data_offset);
11105
11106       regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr}));
11107    }
11108
11109    build_end_with_regs(ctx, regs);
11110 }
11111
11112 Pseudo_instruction*
11113 add_startpgm(struct isel_context* ctx)
11114 {
11115    unsigned def_count = 0;
11116    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11117       if (ctx->args->args[i].skip)
11118          continue;
11119       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11120       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11121          def_count += ctx->args->args[i].size;
11122       else
11123          def_count++;
11124    }
11125
11126    Pseudo_instruction* startpgm =
11127       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11128    ctx->block->instructions.emplace_back(startpgm);
11129    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11130       if (ctx->args->args[i].skip)
11131          continue;
11132
11133       enum ac_arg_regfile file = ctx->args->args[i].file;
11134       unsigned size = ctx->args->args[i].size;
11135       unsigned reg = ctx->args->args[i].offset;
11136       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11137
11138       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11139          Temp elems[16];
11140          for (unsigned j = 0; j < size; j++) {
11141             elems[j] = ctx->program->allocateTmp(s1);
11142             startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11143          }
11144          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11145       } else {
11146          Temp dst = ctx->program->allocateTmp(type);
11147          Definition def(dst);
11148          def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11149          ctx->arg_temps[i] = dst;
11150          startpgm->definitions[arg++] = def;
11151
11152          if (ctx->args->args[i].pending_vmem) {
11153             assert(file == AC_ARG_VGPR);
11154             ctx->program->args_pending_vmem.push_back(def);
11155          }
11156       }
11157    }
11158
11159    /* epilog has no scratch */
11160    if (ctx->args->scratch_offset.used) {
11161       if (ctx->program->gfx_level < GFX9) {
11162          /* Stash these in the program so that they can be accessed later when
11163           * handling spilling.
11164           */
11165          if (ctx->args->ring_offsets.used)
11166             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11167
11168          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11169       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11170          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11171           */
11172          Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
11173          scratch_offset.setLateKill(true);
11174
11175          Operand scratch_addr = ctx->args->ring_offsets.used
11176                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11177                                    : Operand(s2);
11178
11179          Builder bld(ctx->program, ctx->block);
11180          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11181                     scratch_offset);
11182       }
11183    }
11184
11185    return startpgm;
11186 }
11187
11188 void
11189 fix_ls_vgpr_init_bug(isel_context* ctx)
11190 {
11191    Builder bld(ctx->program, ctx->block);
11192    constexpr unsigned hs_idx = 1u;
11193    Builder::Result hs_thread_count =
11194       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11195                get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11196    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11197
11198    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11199
11200    Temp instance_id =
11201       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11202                get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11203    Temp vs_rel_patch_id =
11204       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11205                get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11206    Temp vertex_id =
11207       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11208                get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11209
11210    ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11211    ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11212    ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11213 }
11214
11215 void
11216 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11217 {
11218    /* Split all arguments except for the first (ring_offsets) and the last
11219     * (exec) so that the dead channels don't stay live throughout the program.
11220     */
11221    for (int i = 1; i < startpgm->definitions.size(); i++) {
11222       if (startpgm->definitions[i].regClass().size() > 1) {
11223          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11224                            startpgm->definitions[i].regClass().size());
11225       }
11226    }
11227 }
11228
11229 void
11230 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11231 {
11232    Program* program = ctx->program;
11233
11234    unsigned float_controls = shader->info.float_controls_execution_mode;
11235
11236    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11237       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11238    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11239       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11240                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11241
11242    program->next_fp_mode.must_flush_denorms32 =
11243       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11244    program->next_fp_mode.must_flush_denorms16_64 =
11245       float_controls &
11246       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11247
11248    program->next_fp_mode.care_about_round32 =
11249       float_controls &
11250       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11251
11252    program->next_fp_mode.care_about_round16_64 =
11253       float_controls &
11254       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11255        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11256
11257    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11258     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11259    if (program->next_fp_mode.must_flush_denorms16_64)
11260       program->next_fp_mode.denorm16_64 = 0;
11261    else
11262       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11263
11264    /* preserving fp32 denorms is expensive, so only do it if asked */
11265    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11266       program->next_fp_mode.denorm32 = fp_denorm_keep;
11267    else
11268       program->next_fp_mode.denorm32 = 0;
11269
11270    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11271       program->next_fp_mode.round32 = fp_round_tz;
11272    else
11273       program->next_fp_mode.round32 = fp_round_ne;
11274
11275    if (float_controls &
11276        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11277       program->next_fp_mode.round16_64 = fp_round_tz;
11278    else
11279       program->next_fp_mode.round16_64 = fp_round_ne;
11280
11281    ctx->block->fp_mode = program->next_fp_mode;
11282 }
11283
11284 void
11285 cleanup_cfg(Program* program)
11286 {
11287    /* create linear_succs/logical_succs */
11288    for (Block& BB : program->blocks) {
11289       for (unsigned idx : BB.linear_preds)
11290          program->blocks[idx].linear_succs.emplace_back(BB.index);
11291       for (unsigned idx : BB.logical_preds)
11292          program->blocks[idx].logical_succs.emplace_back(BB.index);
11293    }
11294 }
11295
11296 Temp
11297 lanecount_to_mask(isel_context* ctx, Temp count)
11298 {
11299    assert(count.regClass() == s1);
11300
11301    Builder bld(ctx->program, ctx->block);
11302    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11303    Temp cond;
11304
11305    if (ctx->program->wave_size == 64) {
11306       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11307       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11308                                 Operand::c32(6u /* log2(64) */));
11309       cond =
11310          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11311    } else {
11312       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11313        * the register */
11314       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11315    }
11316
11317    return cond;
11318 }
11319
11320 Temp
11321 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11322 {
11323    Builder bld(ctx->program, ctx->block);
11324
11325    /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
11326    Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11327                        : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11328                                   get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11329
11330    return lanecount_to_mask(ctx, count);
11331 }
11332
11333 static void
11334 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11335 {
11336    append_logical_end(ctx.block);
11337    ctx.block->kind |= block_kind_uniform;
11338
11339    unsigned src_count = ctx.args->arg_count;
11340    Pseudo_instruction* ret =
11341       create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11342    ctx.block->instructions.emplace_back(ret);
11343
11344    for (unsigned i = 0; i < src_count; i++) {
11345       enum ac_arg_regfile file = ctx.args->args[i].file;
11346       unsigned size = ctx.args->args[i].size;
11347       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11348       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11349       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11350                                          : Operand(PhysReg{reg}, type);
11351       ret->operands[i] = op;
11352    }
11353
11354    Builder bld(ctx.program, ctx.block);
11355    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11356 }
11357
11358 void
11359 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11360                   const struct ac_shader_args* args)
11361 {
11362    for (unsigned i = 0; i < shader_count; i++) {
11363       if (i) {
11364          ctx.block = ctx.program->create_and_insert_block();
11365          ctx.block->kind = block_kind_top_level | block_kind_resume;
11366       }
11367
11368       nir_shader* nir = shaders[i];
11369       init_context(&ctx, nir);
11370       setup_fp_mode(&ctx, nir);
11371
11372       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11373       append_logical_start(ctx.block);
11374       split_arguments(&ctx, startpgm);
11375       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11376
11377       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11378        * shader without shader calls.
11379        */
11380       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11381          insert_rt_jump_next(ctx, args);
11382
11383       cleanup_context(&ctx);
11384    }
11385
11386    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11387    cleanup_cfg(ctx.program);
11388 }
11389
11390 void
11391 pops_await_overlapped_waves(isel_context* ctx)
11392 {
11393    ctx->program->has_pops_overlapped_waves_wait = true;
11394
11395    Builder bld(ctx->program, ctx->block);
11396
11397    if (ctx->program->gfx_level >= GFX11) {
11398       /* GFX11+ - waiting for the export from the overlapped waves.
11399        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11400        */
11401       bld.sopp(aco_opcode::s_wait_event, -1, 0);
11402       return;
11403    }
11404
11405    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11406
11407    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11408
11409    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11410    const Temp did_overlap =
11411       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11412    if_context did_overlap_if_context;
11413    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11414    bld.reset(ctx->block);
11415
11416    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11417    if (ctx->program->gfx_level >= GFX10) {
11418       /* 2 packer ID bits on GFX10-10.3. */
11419       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11420                                       collision, Operand::c32(0x2001c));
11421       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11422       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11423                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
11424       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11425    } else {
11426       /* 1 packer ID bit on GFX9. */
11427       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11428                                       collision, Operand::c32(0x1001c));
11429       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11430        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11431        */
11432       const Temp packer_id_hwreg_bits =
11433          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11434       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11435    }
11436
11437    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11438                                              collision, Operand::c32(0xa0010));
11439    if (ctx->program->gfx_level < GFX10) {
11440       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11441        * actual wave ID by 1 in case of wraparound.
11442        */
11443       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11444                                             collision, Operand::c32(0x3ff));
11445       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11446          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11447       newest_overlapped_wave_id =
11448          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11449                   newest_overlapped_wave_id_wrapped);
11450    }
11451
11452    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11453     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11454     * no more than 1023 values behind the current wave ID.
11455     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11456     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11457     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11458     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11459     * `a - (b + 1)` is `a + ~b`.
11460     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11461     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11462     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11463     * expected.
11464     */
11465    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11466                                         collision, Operand::c32(0x3ff));
11467    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11468                                         newest_overlapped_wave_id, wave_id_offset);
11469
11470    /* Await the overlapped waves. */
11471
11472    loop_context wait_loop_context;
11473    begin_loop(ctx, &wait_loop_context);
11474    bld.reset(ctx->block);
11475
11476    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11477                                            bld.def(s1, scc), wave_id_offset);
11478    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11479     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11480     * exited the ordered section.
11481     */
11482    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11483                                                        newest_overlapped_wave_id, exiting_wave_id);
11484    if_context newest_overlapped_wave_exited_if_context;
11485    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11486                          newest_overlapped_wave_exited);
11487    emit_loop_break(ctx);
11488    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11489    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11490    bld.reset(ctx->block);
11491
11492    /* Sleep before rechecking to let overlapped waves run for some time. */
11493    bld.sopp(aco_opcode::s_sleep, -1, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11494
11495    end_loop(ctx, &wait_loop_context);
11496    bld.reset(ctx->block);
11497
11498    /* Indicate the wait has been done to subsequent compilation stages. */
11499    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11500
11501    begin_uniform_if_else(ctx, &did_overlap_if_context);
11502    end_uniform_if(ctx, &did_overlap_if_context);
11503    bld.reset(ctx->block);
11504 }
11505
11506 static void
11507 create_merged_jump_to_epilog(isel_context* ctx)
11508 {
11509    Builder bld(ctx->program, ctx->block);
11510    std::vector<Operand> regs;
11511
11512    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11513       if (!ctx->args->args[i].preserved)
11514          continue;
11515
11516       const enum ac_arg_regfile file = ctx->args->args[i].file;
11517       const unsigned reg = ctx->args->args[i].offset;
11518
11519       Operand op(ctx->arg_temps[i]);
11520       op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11521       regs.emplace_back(op);
11522    }
11523
11524    Temp continue_pc =
11525       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11526
11527    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11528       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11529    jump->operands[0] = Operand(continue_pc);
11530    for (unsigned i = 0; i < regs.size(); i++) {
11531       jump->operands[i + 1] = regs[i];
11532    }
11533    ctx->block->instructions.emplace_back(std::move(jump));
11534 }
11535
11536 void
11537 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_barrier,
11538               if_context* ic_merged_wave_info, const bool check_merged_wave_info,
11539               const bool endif_merged_wave_info)
11540 {
11541    init_context(&ctx, nir);
11542    setup_fp_mode(&ctx, nir);
11543
11544    Program* program = ctx.program;
11545
11546    if (need_startpgm) {
11547       /* Needs to be after init_context() for FS. */
11548       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11549       append_logical_start(ctx.block);
11550
11551       if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11552          fix_ls_vgpr_init_bug(&ctx);
11553
11554       split_arguments(&ctx, startpgm);
11555
11556       if (!program->info.vs.has_prolog &&
11557           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11558          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11559       }
11560    }
11561
11562    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11563        !program->stage.has(SWStage::GS)) {
11564       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11565        * s_sendmsg(GS_ALLOC_REQ).
11566        */
11567       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11568    }
11569
11570    if (check_merged_wave_info) {
11571       const unsigned i =
11572          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11573       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11574       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11575    }
11576
11577    if (need_barrier) {
11578       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11579                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11580                                   ? scope_subgroup
11581                                   : scope_workgroup;
11582
11583       Builder(ctx.program, ctx.block)
11584          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11585                   scope);
11586    }
11587
11588    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11589    visit_cf_list(&ctx, &func->body);
11590
11591    if (ctx.program->info.has_epilog) {
11592       if (ctx.stage == fragment_fs) {
11593          create_fs_jump_to_epilog(&ctx);
11594
11595          /* FS epilogs always have at least one color/null export. */
11596          ctx.program->has_color_exports = true;
11597          ctx.block->kind |= block_kind_export_end;
11598       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
11599          assert(ctx.stage == tess_control_hs || ctx.stage == vertex_tess_control_hs);
11600          if (ctx.options->is_opengl)
11601             create_tcs_end_for_epilog(&ctx);
11602          else
11603             create_tcs_jump_to_epilog(&ctx);
11604       }
11605    }
11606
11607    if (endif_merged_wave_info) {
11608       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11609       end_divergent_if(&ctx, ic_merged_wave_info);
11610    }
11611
11612    if (!ctx.program->info.is_monolithic &&
11613        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11614       assert(program->gfx_level >= GFX9);
11615       create_merged_jump_to_epilog(&ctx);
11616       ctx.block->kind |= block_kind_export_end;
11617    }
11618
11619    cleanup_context(&ctx);
11620 }
11621
11622 void
11623 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
11624 {
11625    if_context ic_merged_wave_info;
11626    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
11627
11628    for (unsigned i = 0; i < shader_count; i++) {
11629       nir_shader* nir = shaders[i];
11630
11631       /* We always need to insert p_startpgm at the beginning of the first shader.  */
11632       const bool need_startpgm = i == 0;
11633
11634       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11635       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11636       const bool empty_shader =
11637          nir_cf_list_is_empty_block(&func->body) &&
11638          ((nir->info.stage == MESA_SHADER_VERTEX &&
11639            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11640           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11641
11642       /* See if we need to emit a check of the merged wave info SGPR. */
11643       const bool check_merged_wave_info =
11644          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11645       const bool endif_merged_wave_info =
11646          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11647
11648       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11649       const bool tcs_skip_barrier =
11650          ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11651
11652       /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
11653       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
11654
11655       select_shader(ctx, nir, need_startpgm, need_barrier, &ic_merged_wave_info,
11656                     check_merged_wave_info, endif_merged_wave_info);
11657
11658       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11659          /* Special handling when TCS input and output patch size is the same.
11660           * Outputs of the previous stage are inputs to the next stage.
11661           */
11662          ctx.inputs = ctx.outputs;
11663          ctx.outputs = shader_io_state();
11664       }
11665    }
11666 }
11667
11668 Temp
11669 get_tess_ring_descriptor(isel_context* ctx, const struct aco_tcs_epilog_info* einfo,
11670                          bool is_tcs_factor_ring)
11671 {
11672    Builder bld(ctx->program, ctx->block);
11673
11674    if (!ctx->options->is_opengl) {
11675       Temp ring_offsets = get_arg(ctx, ctx->args->ring_offsets);
11676       uint32_t tess_ring_offset =
11677          is_tcs_factor_ring ? 5 /* RING_HS_TESS_FACTOR */ : 6 /* RING_HS_TESS_OFFCHIP */;
11678       return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ring_offsets,
11679                       Operand::c32(tess_ring_offset * 16u));
11680    }
11681
11682    Temp addr = get_arg(ctx, einfo->tcs_out_lds_layout);
11683    /* TCS only receives high 13 bits of the address. */
11684    addr = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), addr,
11685                    Operand::c32(0xfff80000));
11686
11687    if (is_tcs_factor_ring) {
11688       addr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr,
11689                       Operand::c32(einfo->tess_offchip_ring_size));
11690    }
11691
11692    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
11693                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
11694
11695    if (ctx->options->gfx_level >= GFX11) {
11696       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
11697                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
11698    } else if (ctx->options->gfx_level >= GFX10) {
11699       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
11700                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
11701    } else {
11702       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
11703                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
11704    }
11705
11706    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr,
11707                      Operand::c32(ctx->options->address32_hi), Operand::c32(0xffffffff),
11708                      Operand::c32(rsrc3));
11709 }
11710
11711 void
11712 store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp factors[],
11713                                unsigned factor_comps, Temp sbase, Temp voffset, Temp num_patches,
11714                                unsigned patch_offset)
11715 {
11716    Builder bld(ctx->program, ctx->block);
11717
11718    Temp soffset = sbase;
11719    if (patch_offset) {
11720       Temp offset =
11721          bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, Operand::c32(patch_offset));
11722       soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), soffset, offset);
11723    }
11724
11725    Temp data = factor_comps == 1
11726                   ? factors[0]
11727                   : create_vec_from_array(ctx, factors, factor_comps, RegType::vgpr, 4);
11728
11729    emit_single_mubuf_store(ctx, tess_ring_desc, voffset, soffset, Temp(), data, 0,
11730                            memory_sync_info(storage_vmem_output), true, false, false);
11731 }
11732
11733 Temp
11734 build_fast_udiv_nuw(isel_context* ctx, Temp num, Temp multiplier, Temp pre_shift, Temp post_shift,
11735                     Temp increment)
11736 {
11737    Builder bld(ctx->program, ctx->block);
11738
11739    num = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), pre_shift, num);
11740    num = bld.nuw().vadd32(bld.def(v1), num, increment);
11741    num = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), num, multiplier);
11742    return bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), post_shift, num);
11743 }
11744
11745 Temp
11746 get_gl_vs_prolog_vertex_index(isel_context* ctx, const struct aco_gl_vs_prolog_info* vinfo,
11747                               unsigned input_index, Temp instance_divisor_constbuf)
11748 {
11749    bool divisor_is_one = vinfo->instance_divisor_is_one & (1u << input_index);
11750    bool divisor_is_fetched = vinfo->instance_divisor_is_fetched & (1u << input_index);
11751
11752    Builder bld(ctx->program, ctx->block);
11753
11754    Temp index;
11755    if (divisor_is_one) {
11756       index = get_arg(ctx, ctx->args->instance_id);
11757    } else if (divisor_is_fetched) {
11758       Temp instance_id = get_arg(ctx, ctx->args->instance_id);
11759
11760       Temp udiv_factors = bld.smem(aco_opcode::s_buffer_load_dwordx4, bld.def(s4),
11761                                    instance_divisor_constbuf, Operand::c32(input_index * 16));
11762       emit_split_vector(ctx, udiv_factors, 4);
11763
11764       index = build_fast_udiv_nuw(ctx, instance_id, emit_extract_vector(ctx, udiv_factors, 0, s1),
11765                                   emit_extract_vector(ctx, udiv_factors, 1, s1),
11766                                   emit_extract_vector(ctx, udiv_factors, 2, s1),
11767                                   emit_extract_vector(ctx, udiv_factors, 3, s1));
11768    }
11769
11770    if (divisor_is_one || divisor_is_fetched) {
11771       Temp start_instance = get_arg(ctx, ctx->args->start_instance);
11772       index = bld.vadd32(bld.def(v1), index, start_instance);
11773    } else {
11774       Temp base_vertex = get_arg(ctx, ctx->args->base_vertex);
11775       Temp vertex_id = get_arg(ctx, ctx->args->vertex_id);
11776       index = bld.vadd32(bld.def(v1), base_vertex, vertex_id);
11777    }
11778
11779    return index;
11780 }
11781
11782 } /* end namespace */
11783
11784 void
11785 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11786                ac_shader_config* config, const struct aco_compiler_options* options,
11787                const struct aco_shader_info* info, const struct ac_shader_args* args)
11788 {
11789    isel_context ctx =
11790       setup_isel_context(program, shader_count, shaders, config, options, info, args);
11791
11792    if (ctx.stage == raytracing_cs)
11793       return select_program_rt(ctx, shader_count, shaders, args);
11794
11795    if (shader_count >= 2) {
11796       select_program_merged(ctx, shader_count, shaders);
11797    } else {
11798       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
11799       if_context ic_merged_wave_info;
11800
11801       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
11802       if (!ctx.program->info.is_monolithic) {
11803          assert(ctx.program->gfx_level >= GFX9);
11804          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
11805             check_merged_wave_info = endif_merged_wave_info = true;
11806          } else {
11807             const bool ngg_gs =
11808                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
11809             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
11810             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
11811             need_barrier = !ngg_gs;
11812          }
11813       }
11814
11815       select_shader(ctx, shaders[0], true, need_barrier, &ic_merged_wave_info,
11816                     check_merged_wave_info, endif_merged_wave_info);
11817    }
11818
11819    program->config->float_mode = program->blocks[0].fp_mode.val;
11820
11821    append_logical_end(ctx.block);
11822    ctx.block->kind |= block_kind_uniform;
11823
11824    if (!ctx.program->info.has_epilog ||
11825        (shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL &&
11826         options->gfx_level >= GFX9)) {
11827       Builder bld(ctx.program, ctx.block);
11828       bld.sopp(aco_opcode::s_endpgm);
11829    }
11830
11831    cleanup_cfg(program);
11832 }
11833
11834 void
11835 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11836                            const struct aco_compiler_options* options,
11837                            const struct aco_shader_info* info, const struct ac_shader_args* args)
11838 {
11839    assert(options->gfx_level == GFX8);
11840
11841    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
11842                 config);
11843
11844    isel_context ctx = {};
11845    ctx.program = program;
11846    ctx.args = args;
11847    ctx.options = options;
11848    ctx.stage = program->stage;
11849
11850    ctx.block = ctx.program->create_and_insert_block();
11851    ctx.block->kind = block_kind_top_level;
11852
11853    program->workgroup_size = 1; /* XXX */
11854
11855    add_startpgm(&ctx);
11856    append_logical_start(ctx.block);
11857
11858    Builder bld(ctx.program, ctx.block);
11859
11860    /* Load the buffer descriptor from TMA. */
11861    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11862             Operand::zero());
11863
11864    /* Store TTMP0-TTMP1. */
11865    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11866             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
11867
11868    uint32_t hw_regs_idx[] = {
11869       2, /* HW_REG_STATUS */
11870       3, /* HW_REG_TRAP_STS */
11871       4, /* HW_REG_HW_ID */
11872       7, /* HW_REG_IB_STS */
11873    };
11874
11875    /* Store some hardware registers. */
11876    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11877       /* "((size - 1) << 11) | register" */
11878       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11879                ((20 - 1) << 11) | hw_regs_idx[i]);
11880
11881       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11882                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11883    }
11884
11885    program->config->float_mode = program->blocks[0].fp_mode.val;
11886
11887    append_logical_end(ctx.block);
11888    ctx.block->kind |= block_kind_uniform;
11889    bld.sopp(aco_opcode::s_endpgm);
11890
11891    cleanup_cfg(program);
11892 }
11893
11894 Operand
11895 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
11896 {
11897    enum ac_arg_regfile file = args->args[arg.arg_index].file;
11898    unsigned size = args->args[arg.arg_index].size;
11899    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11900    return Operand(get_arg_reg(args, arg), rc);
11901 }
11902
11903 unsigned
11904 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11905 {
11906    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11907
11908    unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
11909    if (bld.program->gfx_level >= GFX10 && num_loads > 1)
11910       bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11911
11912    for (unsigned i = 0; i < count;) {
11913       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11914
11915       if (size == 4)
11916          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11917                   Operand::c32((start + i) * 16u));
11918       else if (size == 2)
11919          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11920                   Operand::c32((start + i) * 16u));
11921       else
11922          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11923                   Operand::c32((start + i) * 16u));
11924
11925       dest = dest.advance(size * 16u);
11926       i += size;
11927    }
11928
11929    return count;
11930 }
11931
11932 Operand
11933 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
11934                             const struct aco_vs_prolog_info* pinfo, unsigned index,
11935                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11936                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11937 {
11938    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11939             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
11940
11941    wait_imm lgkm_imm;
11942    lgkm_imm.lgkm = 0;
11943    bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level));
11944
11945    Definition fetch_index_def(tmp_vgpr0, v1);
11946    Operand fetch_index(tmp_vgpr0, v1);
11947
11948    Operand div_info(tmp_sgpr, s1);
11949    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
11950       /* use SDWA */
11951       if (bld.program->gfx_level < GFX9) {
11952          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
11953          div_info = Operand(tmp_vgpr1, v1);
11954       }
11955
11956       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11957
11958       Instruction* instr;
11959       if (bld.program->gfx_level >= GFX9)
11960          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
11961       else
11962          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
11963                                div_info, fetch_index)
11964                     .instr;
11965       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
11966
11967       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
11968                fetch_index);
11969
11970       instr =
11971          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
11972       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
11973    } else {
11974       Operand tmp_op(tmp_vgpr1, v1);
11975       Definition tmp_def(tmp_vgpr1, v1);
11976
11977       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11978
11979       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
11980       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
11981
11982       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
11983                Operand(tmp_sgpr.advance(4), s1));
11984
11985       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
11986       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
11987    }
11988
11989    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
11990
11991    return fetch_index;
11992 }
11993
11994 void
11995 select_rt_prolog(Program* program, ac_shader_config* config,
11996                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
11997                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
11998 {
11999    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12000                 config);
12001    Block* block = program->create_and_insert_block();
12002    block->kind = block_kind_top_level;
12003    program->workgroup_size = info->workgroup_size;
12004    program->wave_size = info->workgroup_size;
12005    calc_min_waves(program);
12006    Builder bld(program, block);
12007    block->instructions.reserve(32);
12008    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12009    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12010
12011    /* Inputs:
12012     * Ring offsets:                s[0-1]
12013     * Indirect descriptor sets:    s[2]
12014     * Push constants pointer:      s[3]
12015     * SBT descriptors:             s[4-5]
12016     * Traversal shader address:    s[6-7]
12017     * Ray launch size address:     s[8-9]
12018     * Dynamic callable stack base: s[10]
12019     * Workgroup IDs (xyz):         s[11], s[12], s[13]
12020     * Scratch offset:              s[14]
12021     * Local invocation IDs:        v[0-2]
12022     */
12023    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12024    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12025    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12026    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12027    PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12028    PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12029    PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12030    PhysReg in_scratch_offset;
12031    if (options->gfx_level < GFX11)
12032       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12033    PhysReg in_local_ids[2] = {
12034       get_arg_reg(in_args, in_args->local_invocation_ids),
12035       get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12036    };
12037
12038    /* Outputs:
12039     * Callee shader PC:            s[0-1]
12040     * Indirect descriptor sets:    s[2]
12041     * Push constants pointer:      s[3]
12042     * SBT descriptors:             s[4-5]
12043     * Traversal shader address:    s[6-7]
12044     * Ray launch sizes (xyz):      s[8], s[9], s[10]
12045     * Scratch offset (<GFX9 only): s[11]
12046     * Ring offsets (<GFX9 only):   s[12-13]
12047     * Ray launch IDs:              v[0-2]
12048     * Stack pointer:               v[3]
12049     * Shader VA:                   v[4-5]
12050     * Shader Record Ptr:           v[6-7]
12051     */
12052    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12053    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
12054    PhysReg out_launch_size_z = out_launch_size_x.advance(8);
12055    PhysReg out_launch_ids[3];
12056    for (unsigned i = 0; i < 3; i++)
12057       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
12058    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12059    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12060
12061    /* Temporaries: */
12062    num_sgprs = align(num_sgprs, 2) + 4;
12063    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs - 4};
12064    PhysReg tmp_ring_offsets = PhysReg{num_sgprs - 2};
12065
12066    /* Confirm some assumptions about register aliasing */
12067    assert(in_ring_offsets == out_uniform_shader_addr);
12068    assert(get_arg_reg(in_args, in_args->push_constants) ==
12069           get_arg_reg(out_args, out_args->push_constants));
12070    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12071           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12072    assert(in_launch_size_addr == out_launch_size_x);
12073    assert(in_stack_base == out_launch_size_z);
12074    assert(in_local_ids[0] == out_launch_ids[0]);
12075
12076    /* load raygen sbt */
12077    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12078             Operand::c32(0u));
12079
12080    /* init scratch */
12081    if (options->gfx_level < GFX9) {
12082       /* copy ring offsets to temporary location*/
12083       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12084                Operand(in_ring_offsets, s2));
12085    } else if (options->gfx_level < GFX11) {
12086       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12087                       Operand(in_scratch_offset, s1));
12088    }
12089
12090    /* set stack ptr */
12091    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12092
12093    /* load raygen address */
12094    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12095             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12096
12097    /* load ray launch sizes */
12098    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12099             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12100    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12101             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12102
12103    /* calculate ray launch ids */
12104    if (options->gfx_level >= GFX11) {
12105       /* Thread IDs are packed in VGPR0, 10 bits per component. */
12106       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12107                Operand::c32(10u), Operand::c32(3u));
12108       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12109                Operand(in_local_ids[0], v1));
12110    }
12111    /* Do this backwards to reduce some RAW hazards on GFX11+ */
12112    bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12113    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1),
12114             Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1));
12115    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12116             Operand::c32(8), Operand(in_local_ids[0], v1));
12117
12118    if (options->gfx_level < GFX9) {
12119       /* write scratch/ring offsets to outputs, if needed */
12120       bld.sop1(aco_opcode::s_mov_b32,
12121                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12122                Operand(in_scratch_offset, s1));
12123       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12124                Operand(tmp_ring_offsets, s2));
12125    }
12126
12127    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12128    if (options->gfx_level < GFX9) {
12129       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12130                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12131    } else {
12132       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12133                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12134    }
12135    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12136             Operand(tmp_raygen_sbt.advance(4), s1));
12137
12138    /* jump to raygen */
12139    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12140
12141    program->config->float_mode = program->blocks[0].fp_mode.val;
12142    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12143    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12144 }
12145
12146 void
12147 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12148                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12149                  const struct ac_shader_args* args)
12150 {
12151    assert(pinfo->num_attributes > 0);
12152
12153    /* This should be enough for any shader/stage. */
12154    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12155
12156    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12157                 config);
12158    program->dev.vgpr_limit = 256;
12159
12160    Block* block = program->create_and_insert_block();
12161    block->kind = block_kind_top_level;
12162
12163    program->workgroup_size = 64;
12164    calc_min_waves(program);
12165
12166    Builder bld(program, block);
12167
12168    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12169
12170    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12171
12172    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12173    bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask;
12174
12175    wait_imm lgkm_imm;
12176    lgkm_imm.lgkm = 0;
12177
12178    /* choose sgprs */
12179    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12180    PhysReg prolog_input = vertex_buffers.advance(8);
12181    PhysReg desc(
12182       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12183
12184    Operand start_instance = get_arg_fixed(args, args->start_instance);
12185    Operand instance_id = get_arg_fixed(args, args->instance_id);
12186
12187    PhysReg attributes_start(256 + args->num_vgprs_used);
12188    /* choose vgprs that won't be used for anything else until the last attribute load */
12189    PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
12190    PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
12191    PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
12192    PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
12193    PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
12194
12195    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12196             get_arg_fixed(args, args->vertex_buffers));
12197    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
12198       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12199                options->address32_hi & 0xFFFF);
12200    } else {
12201       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12202                Operand::c32((unsigned)options->address32_hi));
12203    }
12204
12205    /* calculate vgpr requirements */
12206    unsigned num_vgprs = attributes_start.reg() - 256;
12207    num_vgprs += pinfo->num_attributes * 4;
12208    if (has_nontrivial_divisors && program->gfx_level <= GFX8)
12209       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
12210    unsigned num_sgprs = 0;
12211
12212    const struct ac_vtx_format_info* vtx_info_table =
12213       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12214
12215    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12216       unsigned num_descs =
12217          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12218       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12219
12220       if (loc == 0) {
12221          /* perform setup while we load the descriptors */
12222          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12223             Operand count = get_arg_fixed(args, args->merged_wave_info);
12224             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
12225             if (program->wave_size == 64) {
12226                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12227                         Operand::c32(6u /* log2(64) */));
12228                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12229                         Operand(exec, s2), Operand(scc, s1));
12230             }
12231          }
12232
12233          bool needs_instance_index = false;
12234          bool needs_start_instance = false;
12235          u_foreach_bit (i, pinfo->state.instance_rate_inputs & attrib_mask) {
12236             needs_instance_index |= pinfo->state.divisors[i] == 1;
12237             needs_start_instance |= pinfo->state.divisors[i] == 0;
12238          }
12239          bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask;
12240          if (needs_vertex_index)
12241             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12242                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12243          if (needs_instance_index)
12244             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12245                        Operand(s2), true);
12246          if (needs_start_instance)
12247             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12248       }
12249
12250       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12251
12252       for (unsigned i = 0; i < num_descs;) {
12253          PhysReg dest(attributes_start.reg() + loc * 4u);
12254
12255          /* calculate index */
12256          Operand fetch_index = Operand(vertex_index, v1);
12257          if (pinfo->state.instance_rate_inputs & (1u << loc)) {
12258             uint32_t divisor = pinfo->state.divisors[loc];
12259             if (divisor) {
12260                fetch_index = instance_id;
12261                if (pinfo->state.nontrivial_divisors & (1u << loc)) {
12262                   unsigned index =
12263                      util_bitcount(pinfo->state.nontrivial_divisors & BITFIELD_MASK(loc));
12264                   fetch_index = calc_nontrivial_instance_id(
12265                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12266                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12267                } else {
12268                   fetch_index = Operand(instance_index, v1);
12269                }
12270             } else {
12271                fetch_index = Operand(start_instance_vgpr, v1);
12272             }
12273          }
12274
12275          /* perform load */
12276          PhysReg cur_desc = desc.advance(i * 16);
12277          if ((pinfo->misaligned_mask & (1u << loc))) {
12278             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->state.formats[loc]];
12279
12280             assert(vtx_info->has_hw_format & 0x1);
12281             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12282             unsigned nfmt = vtx_info->hw_format[0] >> 4;
12283
12284             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12285                bool post_shuffle = pinfo->state.post_shuffle & (1u << loc);
12286                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12287
12288                /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec
12289                 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12290                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12291                 * care).
12292                 */
12293                if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
12294                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12295                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12296                             false, true);
12297                else if (vtx_info->chan_byte_size == 8)
12298                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12299                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12300                             fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
12301                else
12302                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12303                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
12304                             offset, false, true);
12305             }
12306             uint32_t one =
12307                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12308                   ? 1u
12309                   : 0x3f800000u;
12310             /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
12311              * For 64-bit data types, no default attribute values are provided. Input variables must
12312              * not use more components than provided by the attribute.
12313              */
12314             for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
12315                bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12316                         Operand::c32(j == 3 ? one : 0u));
12317             }
12318
12319             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12320             loc += slots;
12321             i += slots;
12322          } else {
12323             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12324                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12325             loc++;
12326             i++;
12327          }
12328       }
12329    }
12330
12331    if (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi) {
12332       wait_imm vm_imm;
12333       vm_imm.vm = 0;
12334       bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level));
12335    }
12336
12337    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
12338     * so we may need to fix it up. */
12339    u_foreach_bit (loc, (pinfo->state.alpha_adjust_lo | pinfo->state.alpha_adjust_hi)) {
12340       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12341
12342       unsigned alpha_adjust = (pinfo->state.alpha_adjust_lo >> loc) & 0x1;
12343       alpha_adjust |= ((pinfo->state.alpha_adjust_hi >> loc) & 0x1) << 1;
12344
12345       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
12346          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12347
12348       /* For the integer-like cases, do a natural sign extension.
12349        *
12350        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12351        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12352        * exponent.
12353        */
12354       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
12355       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12356                Operand::c32(offset), Operand::c32(2u));
12357
12358       /* Convert back to the right type. */
12359       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
12360          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12361          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12362                   Operand(alpha, v1));
12363       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
12364          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12365       }
12366    }
12367
12368    block->kind |= block_kind_uniform;
12369
12370    /* continue on to the main shader */
12371    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
12372    if (has_nontrivial_divisors) {
12373       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12374                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
12375       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12376       continue_pc = Operand(prolog_input, s2);
12377    }
12378
12379    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12380
12381    program->config->float_mode = program->blocks[0].fp_mode.val;
12382    /* addition on GFX6-8 requires a carry-out (we use VCC) */
12383    program->needs_vcc = program->gfx_level <= GFX8;
12384    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
12385    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12386 }
12387
12388 void
12389 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
12390                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12391                  const struct ac_shader_args* args)
12392 {
12393    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
12394    isel_context ctx =
12395       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
12396
12397    ctx.block->fp_mode = program->next_fp_mode;
12398
12399    add_startpgm(&ctx);
12400    append_logical_start(ctx.block);
12401
12402    Builder bld(ctx.program, ctx.block);
12403
12404    /* Export all color render targets */
12405    struct aco_export_mrt mrts[8];
12406    uint8_t exported_mrts = 0;
12407
12408    for (unsigned i = 0; i < 8; i++) {
12409       unsigned col_format = (einfo->spi_shader_col_format >> (i * 4)) & 0xf;
12410
12411       if (col_format == V_028714_SPI_SHADER_ZERO)
12412          continue;
12413
12414       struct mrt_color_export out;
12415
12416       out.slot = i;
12417       out.write_mask = 0xf;
12418       out.col_format = col_format;
12419       out.is_int8 = (einfo->color_is_int8 >> i) & 1;
12420       out.is_int10 = (einfo->color_is_int10 >> i) & 1;
12421       out.enable_mrt_output_nan_fixup = (options->enable_mrt_output_nan_fixup >> i) & 1;
12422
12423       Temp inputs = get_arg(&ctx, einfo->inputs[i]);
12424       emit_split_vector(&ctx, inputs, 4);
12425       for (unsigned c = 0; c < 4; ++c) {
12426          out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1));
12427       }
12428
12429       if (export_fs_mrt_color(&ctx, &out, &mrts[i])) {
12430          exported_mrts |= 1 << i;
12431       }
12432    }
12433
12434    if (exported_mrts) {
12435       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
12436          struct aco_export_mrt* mrt0 = (exported_mrts & BITFIELD_BIT(0)) ? &mrts[0] : NULL;
12437          struct aco_export_mrt* mrt1 = (exported_mrts & BITFIELD_BIT(1)) ? &mrts[1] : NULL;
12438          create_fs_dual_src_export_gfx11(&ctx, mrt0, mrt1);
12439       } else {
12440          u_foreach_bit (i, exported_mrts) {
12441             export_mrt(&ctx, &mrts[i]);
12442          }
12443       }
12444    } else {
12445       create_fs_null_export(&ctx);
12446    }
12447
12448    program->config->float_mode = program->blocks[0].fp_mode.val;
12449
12450    append_logical_end(ctx.block);
12451    ctx.block->kind |= block_kind_export_end;
12452    bld.reset(ctx.block);
12453    bld.sopp(aco_opcode::s_endpgm);
12454
12455    cleanup_cfg(program);
12456 }
12457
12458 void
12459 select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
12460                   const struct aco_compiler_options* options, const struct aco_shader_info* info,
12461                   const struct ac_shader_args* args)
12462 {
12463    const struct aco_tcs_epilog_info* einfo = (const struct aco_tcs_epilog_info*)pinfo;
12464    isel_context ctx =
12465       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::TCS);
12466
12467    ctx.block->fp_mode = program->next_fp_mode;
12468
12469    add_startpgm(&ctx);
12470    append_logical_start(ctx.block);
12471
12472    Builder bld(ctx.program, ctx.block);
12473
12474    /* Add a barrier before loading tess factors from LDS. */
12475    if (!einfo->pass_tessfactors_by_reg) {
12476       /* To generate s_waitcnt lgkmcnt(0) when waitcnt insertion. */
12477       program->pending_lds_access = true;
12478
12479       sync_scope scope = einfo->tcs_out_patch_fits_subgroup ? scope_subgroup : scope_workgroup;
12480       bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
12481                   scope);
12482    }
12483
12484    Temp invocation_id = get_arg(&ctx, einfo->invocation_id);
12485
12486    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), invocation_id);
12487
12488    if_context ic_invoc_0;
12489    begin_divergent_if_then(&ctx, &ic_invoc_0, cond);
12490
12491    int outer_comps, inner_comps;
12492    switch (einfo->primitive_mode) {
12493    case TESS_PRIMITIVE_ISOLINES:
12494       outer_comps = 2;
12495       inner_comps = 0;
12496       break;
12497    case TESS_PRIMITIVE_TRIANGLES:
12498       outer_comps = 3;
12499       inner_comps = 1;
12500       break;
12501    case TESS_PRIMITIVE_QUADS:
12502       outer_comps = 4;
12503       inner_comps = 2;
12504       break;
12505    default: unreachable("invalid primitive mode"); return;
12506    }
12507
12508    bld.reset(ctx.block);
12509
12510    unsigned tess_lvl_out_loc =
12511       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER) * 16;
12512    unsigned tess_lvl_in_loc =
12513       ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER) * 16;
12514
12515    Temp outer[4];
12516    Temp inner[2];
12517    if (einfo->pass_tessfactors_by_reg) {
12518       for (int i = 0; i < outer_comps; i++)
12519          outer[i] = get_arg(&ctx, einfo->tess_lvl_out[i]);
12520
12521       for (int i = 0; i < inner_comps; i++)
12522          inner[i] = get_arg(&ctx, einfo->tess_lvl_in[i]);
12523    } else {
12524       Temp addr = get_arg(&ctx, einfo->tcs_out_current_patch_data_offset);
12525       addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr);
12526
12527       Temp data = program->allocateTmp(RegClass(RegType::vgpr, outer_comps));
12528       load_lds(&ctx, 4, outer_comps, data, addr, tess_lvl_out_loc, 4);
12529       for (int i = 0; i < outer_comps; i++)
12530          outer[i] = emit_extract_vector(&ctx, data, i, v1);
12531
12532       if (inner_comps) {
12533          data = program->allocateTmp(RegClass(RegType::vgpr, inner_comps));
12534          load_lds(&ctx, 4, inner_comps, data, addr, tess_lvl_in_loc, 4);
12535          for (int i = 0; i < inner_comps; i++)
12536             inner[i] = emit_extract_vector(&ctx, data, i, v1);
12537       }
12538    }
12539
12540    Temp tess_factor_ring_desc = get_tess_ring_descriptor(&ctx, einfo, true);
12541    Temp tess_factor_ring_base = get_arg(&ctx, args->tcs_factor_offset);
12542    Temp rel_patch_id = get_arg(&ctx, einfo->rel_patch_id);
12543    unsigned tess_factor_ring_const_offset = 0;
12544
12545    if (program->gfx_level <= GFX8) {
12546       /* Store the dynamic HS control word. */
12547       cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), rel_patch_id);
12548
12549       if_context ic_patch_0;
12550       begin_divergent_if_then(&ctx, &ic_patch_0, cond);
12551
12552       bld.reset(ctx.block);
12553
12554       Temp data = bld.copy(bld.def(v1), Operand::c32(0x80000000u));
12555
12556       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, Temp(0, v1), tess_factor_ring_base,
12557                               Temp(), data, 0, memory_sync_info(), true, false, false);
12558
12559       tess_factor_ring_const_offset += 4;
12560
12561       begin_divergent_if_else(&ctx, &ic_patch_0);
12562       end_divergent_if(&ctx, &ic_patch_0);
12563    }
12564
12565    bld.reset(ctx.block);
12566
12567    Temp tess_factor_ring_offset =
12568       bld.v_mul_imm(bld.def(v1), rel_patch_id, (inner_comps + outer_comps) * 4, false);
12569
12570    switch (einfo->primitive_mode) {
12571    case TESS_PRIMITIVE_ISOLINES: {
12572       /* For isolines, the hardware expects tess factors in the reverse order. */
12573       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), outer[1], outer[0]);
12574       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12575                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12576                               memory_sync_info(), true, false, false);
12577       break;
12578    }
12579    case TESS_PRIMITIVE_TRIANGLES: {
12580       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12581                              inner[0]);
12582       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12583                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12584                               memory_sync_info(), true, false, false);
12585       break;
12586    }
12587    case TESS_PRIMITIVE_QUADS: {
12588       Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
12589                              outer[3]);
12590       emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
12591                               tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
12592                               memory_sync_info(), true, false, false);
12593
12594       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inner[0], inner[1]);
12595       emit_single_mubuf_store(
12596          &ctx, tess_factor_ring_desc, tess_factor_ring_offset, tess_factor_ring_base, Temp(), data,
12597          tess_factor_ring_const_offset + 16, memory_sync_info(), true, false, false);
12598       break;
12599    }
12600    default: unreachable("invalid primitive mode"); break;
12601    }
12602
12603    if (einfo->tes_reads_tessfactors) {
12604       Temp layout = get_arg(&ctx, einfo->tcs_offchip_layout);
12605       Temp num_patches, patch_base;
12606
12607       if (ctx.options->is_opengl) {
12608          num_patches = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), layout,
12609                                 Operand::c32(0x3f));
12610          num_patches = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), num_patches,
12611                                 Operand::c32(1));
12612
12613          patch_base = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), layout,
12614                                Operand::c32(16));
12615       } else {
12616          num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), layout,
12617                                 Operand::c32(0x60006));
12618
12619          patch_base = get_arg(&ctx, einfo->patch_base);
12620       }
12621
12622       Temp tess_ring_desc = get_tess_ring_descriptor(&ctx, einfo, false);
12623       Temp tess_ring_base = get_arg(&ctx, args->tess_offchip_offset);
12624
12625       Temp sbase =
12626          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), tess_ring_base, patch_base);
12627
12628       Temp voffset =
12629          bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4), rel_patch_id);
12630
12631       store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, outer, outer_comps, sbase, voffset,
12632                                      num_patches, tess_lvl_out_loc);
12633
12634       if (inner_comps) {
12635          store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, inner, inner_comps, sbase, voffset,
12636                                         num_patches, tess_lvl_in_loc);
12637       }
12638    }
12639
12640    begin_divergent_if_else(&ctx, &ic_invoc_0);
12641    end_divergent_if(&ctx, &ic_invoc_0);
12642
12643    program->config->float_mode = program->blocks[0].fp_mode.val;
12644
12645    append_logical_end(ctx.block);
12646
12647    bld.reset(ctx.block);
12648    bld.sopp(aco_opcode::s_endpgm);
12649
12650    cleanup_cfg(program);
12651 }
12652
12653 void
12654 select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config,
12655                     const struct aco_compiler_options* options, const struct aco_shader_info* info,
12656                     const struct ac_shader_args* args)
12657 {
12658    const struct aco_gl_vs_prolog_info* vinfo = (const struct aco_gl_vs_prolog_info*)pinfo;
12659    isel_context ctx =
12660       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::VS);
12661
12662    ctx.block->fp_mode = program->next_fp_mode;
12663
12664    add_startpgm(&ctx);
12665    append_logical_start(ctx.block);
12666
12667    Builder bld(ctx.program, ctx.block);
12668
12669    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12670
12671    if (vinfo->as_ls && options->has_ls_vgpr_init_bug)
12672       fix_ls_vgpr_init_bug(&ctx);
12673
12674    std::vector<Operand> regs;
12675    passthrough_all_args(&ctx, regs);
12676
12677    Temp instance_divisor_constbuf;
12678
12679    if (vinfo->instance_divisor_is_fetched) {
12680       Temp list = get_arg(&ctx, vinfo->internal_bindings);
12681       list = convert_pointer_to_64_bit(&ctx, list);
12682
12683       instance_divisor_constbuf = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12684                                            Operand::c32(vinfo->instance_diviser_buf_offset));
12685    }
12686
12687    unsigned vgpr = 256 + ctx.args->num_vgprs_used;
12688
12689    for (unsigned i = 0; i < vinfo->num_inputs; i++) {
12690       Temp index = get_gl_vs_prolog_vertex_index(&ctx, vinfo, i, instance_divisor_constbuf);
12691       regs.emplace_back(Operand(index, PhysReg{vgpr + i}));
12692    }
12693
12694    program->config->float_mode = program->blocks[0].fp_mode.val;
12695
12696    append_logical_end(ctx.block);
12697
12698    build_end_with_regs(&ctx, regs);
12699
12700    cleanup_cfg(program);
12701 }
12702
12703 } // namespace aco