/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include <vector>

namespace aco {
struct lower_context {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> instructions;
};
/* Class for obtaining where s_sendmsg(MSG_ORDERED_PS_DONE) must be done in a Primitive Ordered
 * Pixel Shader on GFX9-10.3.
 *
 * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution paths
 * from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay to send
 * it if the packer ID is not going to be set at all by the wave, so some conservativeness is fine.
 *
 * For simplicity, the message is sent from top-level blocks, since dominance and post-dominance
 * checking for any location in the shader is trivial in them. Also for simplicity, it is sent
 * regardless of whether the POPS packer ID hardware register may already have been set up.
 *
 * Note that there can be multiple interlock end instructions in the shader.
 * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
 * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
 * one will be executed may depend on divergent control flow (some lanes may execute one ordered
 * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
 * instruction, so it must be ensured that the message is sent after the last ordered section in the
 * wave.
 */
class gfx9_pops_done_msg_bounds {
public:
   explicit gfx9_pops_done_msg_bounds() = default;

   explicit gfx9_pops_done_msg_bounds(const Program* const program)
   {
      /* Find the top-level location after the last ordered section end pseudo-instruction in the
       * program.
       * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - make sure the message
       * isn't sent if any wait hasn't been fully completed yet (if a begin-end-begin situation
       * occurs somehow, as the location of `p_pops_gfx9_ordered_section_done` is controlled by the
       * application) for safety, assuming that waits are the only thing that need the packer
       * hardware register to be set at some point during or before them, and it won't be set
       * anymore after the last wait.
       */
      int last_top_level_block_idx = -1;
      for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
         const Block& block = program->blocks[block_idx];
         if (block.kind & block_kind_top_level) {
            last_top_level_block_idx = block_idx;
         }
         for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
              instr_idx--) {
            const aco_opcode opcode = block.instructions[instr_idx]->opcode;
            if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
                opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
               end_block_idx_ = last_top_level_block_idx;
               /* The same block if it's already a top-level block, or the beginning of the next
                * top-level block.
                */
               instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
               break;
            }
         }
         if (end_block_idx_ != -1) {
            break;
         }
      }
   }

   /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
    * must be sent in this block.
    */
   int end_block_idx() const { return end_block_idx_; }

   /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
    * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
    * If this is out of the bounds of the instructions in the end block, it must be sent at the end
    * of it.
    */
   size_t instr_after_end_idx() const { return instr_after_end_idx_; }

   /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
    * before actually ending the program.
    */
   bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
   {
      return block_idx <= end_block_idx_ &&
             (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
   }

private:
   /* Initialize to an empty range for which "is inside" comparisons will be failing for any
    * block and instruction index.
    */
   int end_block_idx_ = -1;
   size_t instr_after_end_idx_ = 0;
};
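/* A sketch of the intended query pattern for the class above (the actual lowering call sites live
 * later in this file; the helper names used here are assumptions for illustration):
 *
 *    gfx9_pops_done_msg_bounds pops_done(program);
 *    if (pops_done.early_exit_needs_done_msg(block_idx, instr_idx))
 *       bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
 */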
/* used by handle_operands() indirectly through Builder::copy */
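/* Each consecutive pair (int8_mul_table[2*v], int8_mul_table[2*v + 1]) holds two values whose
 * product, after the sign extension applied in copy_constant() below and truncation to 8 bits,
 * equals v. This lets an arbitrary 8-bit constant be materialized with a single v_mul_u32_u24 of
 * two inline constants instead of a 32-bit literal. */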
uint8_t int8_mul_table[512] = {
   0, 20,  1, 1,  1, 2,  1, 3,  1, 4,  1, 5,  1, 6,  1, 7,  1, 8,  1, 9,
   1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
   1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
   1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
   1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
   1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
   1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
   2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
   2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
   2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
   2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
   2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
   2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
   5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
   4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
   3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
   4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
   5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
   3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
   5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
   4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
   5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
   3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
   2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
   1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
   1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};
aco_opcode
get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
{
   /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
    * 32-bit opcodes (VOP2) instead, which allows us to remove the temporary
    * VGPR and to use DPP with the arithmetic instructions. This requires
    * sign-extension.
    */
   switch (op) {
   case iadd8:
   case iadd16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_add_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_add_u16;
      } else {
         return aco_opcode::v_add_co_u32;
      }
      break;
   case imul8:
   case imul16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_mul_lo_u16_e64;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_mul_lo_u16;
      } else {
         return aco_opcode::v_mul_u32_u24;
      }
      break;
   case fadd16: return aco_opcode::v_add_f16;
   case fmul16: return aco_opcode::v_mul_f16;
   case imax8:
   case imax16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_max_i32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_max_i16;
      } else {
         return aco_opcode::v_max_i32;
      }
      break;
   case imin8:
   case imin16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_min_i32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_min_i16;
      } else {
         return aco_opcode::v_min_i32;
      }
      break;
   case umin8:
   case umin16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_min_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_min_u16;
      } else {
         return aco_opcode::v_min_u32;
      }
      break;
   case umax8:
   case umax16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_max_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_max_u16;
      } else {
         return aco_opcode::v_max_u32;
      }
      break;
   case fmin16: return aco_opcode::v_min_f16;
   case fmax16: return aco_opcode::v_max_f16;
   case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
   case imul32: return aco_opcode::v_mul_lo_u32;
   case fadd32: return aco_opcode::v_add_f32;
   case fmul32: return aco_opcode::v_mul_f32;
   case imax32: return aco_opcode::v_max_i32;
   case imin32: return aco_opcode::v_min_i32;
   case umin32: return aco_opcode::v_min_u32;
   case umax32: return aco_opcode::v_max_u32;
   case fmin32: return aco_opcode::v_min_f32;
   case fmax32: return aco_opcode::v_max_f32;
   case iand8:
   case iand16:
   case iand32: return aco_opcode::v_and_b32;
   case ixor8:
   case ixor16:
   case ixor32: return aco_opcode::v_xor_b32;
   case ior8:
   case ior16:
   case ior32: return aco_opcode::v_or_b32;
   case iadd64: return aco_opcode::num_opcodes;
   case imul64: return aco_opcode::num_opcodes;
   case fadd64: return aco_opcode::v_add_f64;
   case fmul64: return aco_opcode::v_mul_f64;
   case imin64: return aco_opcode::num_opcodes;
   case imax64: return aco_opcode::num_opcodes;
   case umin64: return aco_opcode::num_opcodes;
   case umax64: return aco_opcode::num_opcodes;
   case fmin64: return aco_opcode::v_min_f64;
   case fmax64: return aco_opcode::v_max_f64;
   case iand64: return aco_opcode::num_opcodes;
   case ior64: return aco_opcode::num_opcodes;
   case ixor64: return aco_opcode::num_opcodes;
   default: return aco_opcode::num_opcodes;
   }
}
bool
is_vop3_reduce_opcode(aco_opcode opcode)
{
   /* 64-bit reductions are VOP3. */
   if (opcode == aco_opcode::num_opcodes)
      return true;

   return instr_info.format[(int)opcode] == Format::VOP3;
}
void
emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
   Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
   if (instr->definitions.size() >= 2) {
      assert(instr->definitions[1].regClass() == bld.lm);
      instr->definitions[1].setFixed(vcc);
   }
}
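/* The 64-bit reduction ops below have no single hardware instruction, so they are lowered to
 * pairs of 32-bit operations: add uses a carry chain, min/max compare the full 64-bit values and
 * select both halves, and mul expands into the usual lo/hi partial-product sum. */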
void
emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                  PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
                  unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
   Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src1_64 = Operand(src1_reg, v2);
   Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
   Operand vtmp_op64 = Operand(vtmp_reg, v2);
   if (op == iadd64) {
      if (ctx->program->gfx_level >= GFX10) {
         if (identity)
            bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
      } else {
         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      }
      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
                   Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == iand64) {
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ior64) {
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ixor64) {
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      if (identity) {
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
      }
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);

      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      /* t4 = dpp(x_hi)
       * t1 = umul_lo(t4, y_lo)
       * t3 = dpp(x_lo)
       * t0 = umul_lo(t3, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(t3, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(t3, y_lo)
       * Requires that res_hi != src0[0] and res_hi != src1[0]
       * and that vtmp[0] != res_hi.
       */
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
      emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
      emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
   }
}
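/* Non-DPP variant of the 64-bit lowering, used on the ds_swizzle/permlane paths. Unlike the DPP
 * version, src0 may be an SGPR pair here, which is first copied to vtmp where the sequence
 * requires VGPR operands. */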
void
emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
              ReduceOp op)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
   Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
   Operand src1_64 = Operand(src1_reg, v2);

   if (src0_rc == s1 &&
       (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0_rc = v1;
      src0[0] = Operand(vtmp, v1);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
      src0_64 = Operand(vtmp, v2);
   } else if (src0_rc == s1 && op == iadd64) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
   }

   if (op == iadd64) {
      if (ctx->program->gfx_level >= GFX10) {
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      } else {
         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      }
      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
               Operand(vcc, bld.lm));
   } else if (op == iand64) {
      bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
   } else if (op == ior64) {
      bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
   } else if (op == ixor64) {
      bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      if (src1_reg == dst_reg) {
         /* it's fine if src0==dst but not if src1==dst */
         std::swap(src0_reg, src1_reg);
         std::swap(src0[0], src1[0]);
         std::swap(src0[1], src1[1]);
         std::swap(src0_64, src1_64);
      }
      assert(!(src0_reg == src1_reg));
      /* t1 = umul_lo(x_hi, y_lo)
       * t0 = umul_lo(x_lo, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(x_lo, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(x_lo, y_lo)
       * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
       */
      Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
      Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
      Operand tmp0_op = src0[1];
      Operand tmp1_op = src1[1];
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
      emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
      bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
      emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
   }
}
void
emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
            ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
            bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, rc);
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (!vop3) {
      if (opcode == aco_opcode::v_add_co_u32)
         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
      else
         bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return;
   }

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
                        bound_ctrl, identity);
      return;
   }

   if (identity)
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
   if (identity && size >= 2)
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);

   for (unsigned i = 0; i < size; i++)
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                   Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);

   bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
}
void
emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
        ReduceOp op, unsigned size)
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
      return;
   }

   if (vop3) {
      bld.vop3(opcode, dst, src0, src1);
   } else if (opcode == aco_opcode::v_add_co_u32) {
      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
   } else {
      bld.vop2(opcode, dst, src0, src1);
   }
}
void
emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
             unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
{
   Builder bld(ctx->program, &ctx->instructions);
   for (unsigned i = 0; i < size; i++) {
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
                   Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
}

void
emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
{
   for (unsigned i = 0; i < size; i++) {
      bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
             Operand(PhysReg{src + i}, v1), ds_pattern);
   }
}
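/* Lowers p_reduce/p_inclusive_scan/p_exclusive_scan. The general shape: save EXEC and enable all
 * lanes, fill inactive lanes of tmp with the identity value, combine lanes in log2(cluster_size)
 * butterfly steps (DPP where available, ds_swizzle on GFX6-7, permlane for the cross-16-lane
 * steps on GFX10+), then broadcast or copy the result to dst. */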
void
emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
               PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
   assert(cluster_size <= ctx->program->wave_size);

   Builder bld(ctx->program, &ctx->instructions);

   Operand identity[2];
   identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
            Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));

   /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
   if (ctx->program->gfx_level < GFX10) {
      for (unsigned i = 0; i < src.size(); i++) {
         /* p_exclusive_scan uses identity for v_writelane_b32 */
         if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
            identity[i] = Operand(PhysReg{sitmp + i}, s1);

            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
            vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
         } else if (identity[i].isLiteral()) {
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
            vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
         }
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, bld.lm));
   }

   if (src.regClass() == v1b) {
      if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin8 || reduce_op == imax8;
         sdwa->sel[0] = SubdwordSel(1, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;

         if (reduce_op == imin8 || reduce_op == imax8)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(8u));
      }
   } else if (src.regClass() == v2b) {
      bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
                        reduce_op == umin16 || reduce_op == umax16;
      if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
         sdwa->sel[0] = SubdwordSel(2, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->gfx_level <= GFX7 ||
                 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
         aco_opcode opcode;

         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(16u));
      }
   }

   bool reduction_needs_last_op = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1)
         break;

      if (ctx->program->gfx_level <= GFX7) {
         reduction_needs_last_op = true;
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
         if (cluster_size == 2)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
         if (cluster_size == 4)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
         if (cluster_size == 8)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
         if (cluster_size == 16)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         if (cluster_size == 32)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         reduction_needs_last_op = false;
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
                  0xf, false);
      if (cluster_size == 2)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
                  0xf, false);
      if (cluster_size == 4)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
                  false);
      if (cluster_size == 8)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
      if (cluster_size == 16)
         break;

      if (ctx->program->gfx_level >= GFX10) {
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());

         if (cluster_size == 32) {
            reduction_needs_last_op = true;
            break;
         }

         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         break;
      }

      if (cluster_size == 32) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         reduction_needs_last_op = true;
         break;
      }
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                  false);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                  false);
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
         if (ctx->program->wave_size == 64)
            bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->valu().opsel = 1; /* FI (Fetch Inactive) */
         }
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));

         if (ctx->program->wave_size == 64) {
            /* fill in the gap in row 2 */
            for (unsigned i = 0; i < src.size(); i++) {
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
               bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                             Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            }
         }
         std::swap(tmp, vtmp);
      } else if (ctx->program->gfx_level >= GFX8) {
         emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
      } else {
         // TODO: use LDS on CS with a single write and shifted read
         /* wavefront shift_right by 1 on SI/CI */
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         for (unsigned i = 0; i < src.size(); i++) {
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{vtmp + i}, v1));
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                          Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            identity[i] = Operand::zero(); /* prevent further uses of identity */
         }
         std::swap(tmp, vtmp);
      }

      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() ||
             identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->gfx_level < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
                      identity[i].physReg() == PhysReg{sitmp + i});
            bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{tmp + i}, v1));
         }
      }
      FALLTHROUGH;
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == ctx->program->wave_size);
      if (ctx->program->gfx_level <= GFX7) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::c32(31u));
         bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                  Operand::c32(32u));
         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
                  identity);
      if (ctx->program->gfx_level >= GFX10) {
         if (ctx->program->wave_size == 64) {
            bld.sop1(aco_opcode::s_bitreplicate_b64_b32, Definition(exec, s2),
                     Operand::c32(0xff00ff00u));
         } else {
            bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                     Operand::c32(16u));
         }
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->valu().opsel = 1; /* FI (Fetch Inactive) */
         }
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         if (ctx->program->wave_size == 64) {
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                     Operand::c32(32u));
            for (unsigned i = 0; i < src.size(); i++)
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         }
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                     false, identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                     false, identity);
      }
      break;
   default: unreachable("Invalid reduction mode");
   }

   if (op == aco_opcode::p_reduce) {
      if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         return;
      }

      if (reduction_needs_last_op)
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
   }

   bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

   if (dst.regClass().type() == RegType::sgpr) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
                      Operand::c32(ctx->program->wave_size - 1));
      }
   } else if (dst.physReg() != tmp) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}
void
adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
{
   /* RA assumes that the result is always in the low part of the register, so we have to shift
    * if it's not there already.
    */
   if (input_data.physReg().byte()) {
      unsigned right_shift = input_data.physReg().byte() * 8;
      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
               Operand(dst.physReg(), dst.regClass()));
   }
}
void
emit_bpermute_permlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX11 in wave64 mode.
    *
    * Similar to emit_gfx10_wave64_bpermute, but uses the new
    * v_permlane64_b32 instruction to swap data between lo and hi halves.
    */

   assert(program->gfx_level >= GFX11);
   assert(program->wave_size == 64);

   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand tmp_op = instr->operands[0];
   Operand index_x4 = instr->operands[1];
   Operand input_data = instr->operands[2];
   Operand same_half = instr->operands[3];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(tmp_op.regClass() == v1.as_linear());
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);

   Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());

   /* Permute the input within the same half-wave. */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* Save EXEC and enable all lanes. */
   bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
            Operand::c32(-1u), Operand(exec, s2));

   /* Copy input data from other half to current half's linear VGPR. */
   bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);

   /* Permute the input from the other half-wave, write to linear VGPR. */
   bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);

   /* Restore saved EXEC. */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   /* Select correct permute result. */
   bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
                same_half);

   adjust_bpermute_dst(bld, dst, input_data);
}
void
emit_bpermute_shared_vgpr(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX10 in wave64 mode.
    *
    * This is necessary because on GFX10 the bpermute instruction only works
    * on half waves (you can think of it as having a cluster size of 32), so we
    * manually swap the data between the two halves using two shared VGPRs.
    */

   assert(program->gfx_level >= GFX10 && program->gfx_level <= GFX10_3);
   assert(program->wave_size == 64);

   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand index_x4 = instr->operands[0];
   Operand input_data = instr->operands[1];
   Operand same_half = instr->operands[2];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);
   assert(dst.physReg() != index_x4.physReg());
   assert(dst.physReg() != input_data.physReg());
   assert(tmp_exec.physReg() != same_half.physReg());

   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);

   /* Permute the input within the same half-wave */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* HI: Copy data from high lanes 32-63 to shared vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
                dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
   /* Save EXEC */
   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
   /* Set EXEC to enable LO lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
   /* LO: Copy data from low lanes 0-31 to shared vgpr */
   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
   /* LO: bpermute shared vgpr (high lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
          Operand(shared_vgpr_hi, v1));
   /* Set EXEC to enable HI lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
   /* HI: bpermute shared vgpr (low lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
          Operand(shared_vgpr_lo, v1));

   /* Only enable lanes which use the other half's data */
   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
            Operand(tmp_exec.physReg(), s2), same_half);
   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
                0x3, 0xf, false);
   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
                0xc, 0xf, false);

   /* Restore saved EXEC */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   adjust_bpermute_dst(bld, dst, input_data);
}
void
emit_bpermute_readlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates bpermute using readlane instructions */

   Operand index = instr->operands[0];
   Operand input = instr->operands[1];
   Definition dst = instr->definitions[0];
   Definition temp_exec = instr->definitions[1];
   Definition clobber_vcc = instr->definitions[2];

   assert(dst.regClass() == v1);
   assert(temp_exec.regClass() == bld.lm);
   assert(clobber_vcc.regClass() == bld.lm);
   assert(clobber_vcc.physReg() == vcc);
   assert(index.regClass() == v1);
   assert(index.physReg() != dst.physReg());
   assert(input.regClass().type() == RegType::vgpr);
   assert(input.bytes() <= 4);
   assert(input.physReg() != dst.physReg());

   /* Save original EXEC */
   bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));

   /* An "unrolled loop" that is executed once for each lane.
    * This takes only a few instructions per lane, as opposed to a "real" loop
    * with branching, where the branch instruction alone would take 16+ cycles.
    */
   for (unsigned n = 0; n < program->wave_size; ++n) {
      /* Activate the lane which has N for its source index */
      bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand::c32(n),
               index);
      /* Read the data from lane N */
      bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
      /* On the active lane, move the data we read from lane N to the destination VGPR */
      bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
      /* Restore original EXEC */
      bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
   }

   adjust_bpermute_dst(bld, dst, input);
}
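/* Tracks one pending parallel-copy. uses[i] counts how many other copies still read byte i of
 * this copy's definition; is_used aliases the whole array so "no byte is still needed" can be
 * tested with a single 64-bit compare. */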
struct copy_operation {
   Operand op;
   Definition def;
   unsigned bytes;
   union {
      uint8_t uses[8];
      uint64_t is_used = 0;
   };
};
void
split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
           const copy_operation& src, bool ignore_uses, unsigned max_size)
{
   PhysReg def_reg = src.def.physReg();
   PhysReg op_reg = src.op.physReg();
   def_reg.reg_b += offset;
   op_reg.reg_b += offset;

   /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
    * v_lshrrev_b64 doesn't get dual issued. */
   if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
       src.def.regClass().type() == RegType::vgpr)
      max_size = MIN2(max_size, 4);
   unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;

   /* make sure the size is a power of two and reg % bytes == 0 */
   unsigned bytes = 1;
   for (; bytes <= max_size; bytes *= 2) {
      unsigned next = bytes * 2u;
      bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
                          offset + next <= src.bytes && next <= max_size;
      if (!src.op.isConstant() && can_increase)
         can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
      if (!can_increase)
         break;
   }

   *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
   if (src.op.isConstant()) {
      assert(bytes >= 1 && bytes <= 8);
      uint64_t val = src.op.constantValue64() >> (offset * 8u);
      *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
   } else {
      RegClass op_cls = src.op.regClass().resize(bytes);
      *op = Operand(op_reg, op_cls);
      op->setTemp(Temp(src.op.tempId(), op_cls));
   }
}
uint32_t
get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
{
   int intersection_start = MAX2(b_start - a_start, 0);
   int intersection_end = MAX2(b_start + b_size - a_start, 0);
   if (intersection_start >= a_size || intersection_end == 0)
      return 0;

   uint32_t mask = u_bit_consecutive(0, a_size);
   return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}
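/* For example, with a_start=0, a_size=4, b_start=2, b_size=4: the intersection covers bytes 2-3
 * of a, so the result is u_bit_consecutive(2, 4) & 0xf = 0b1100. */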
/* src1 are bytes 0-3. dst/src0 are bytes 4-7. */
void
create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
             Operand src0 = Operand(v1))
{
   uint32_t swiz_packed =
      swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);

   dst = Definition(PhysReg(dst.physReg().reg()), v1);
   if (!src1.isConstant())
      src1 = Operand(PhysReg(src1.physReg().reg()), v1);
   if (src0.isUndefined())
      src0 = Operand(dst.physReg(), v1);
   else if (!src0.isConstant())
      src0 = Operand(PhysReg(src0.physReg().reg()), v1);
   bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
}
void
emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
{
   /* v_mov_b16 uses 32bit inline constants. */
   if (op.isConstant()) {
      if (!op.isLiteral() && op.physReg() >= 240) {
         /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
         Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
         instr->valu().opsel[3] = dst.physReg().byte() == 2;
         return;
      }
      op = Operand::c32((int32_t)(int16_t)op.constantValue());
   }

   Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
   instr->valu().opsel[0] = op.physReg().byte() == 2;
   instr->valu().opsel[3] = dst.physReg().byte() == 2;
}
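/* Materializes a constant into dst, preferring encodings that avoid a 32-bit literal:
 * s_movk_i32 for 16-bit signed immediates, s_brev_b32/v_bfrev_b32 for bit-reversed inline
 * constants, s_bfm_b32/b64 for contiguous bit ranges, s_pack_ll_b32_b16 on GFX9+; subdword
 * destinations additionally go through SDWA, v_perm_b32 or read-modify-write sequences. */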
void
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
{
   assert(op.bytes() == dst.bytes());

   if (dst.bytes() == 4 && op.isLiteral()) {
      uint32_t imm = op.constantValue();
      if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
         bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
         return;
      } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
         uint32_t rev = util_bitreverse(imm);
         if (dst.regClass() == s1)
            bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
         else
            bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
         return;
      } else if (dst.regClass() == s1) {
         unsigned start = (ffs(imm) - 1) & 0x1f;
         unsigned size = util_bitcount(imm) & 0x1f;
         if (BITFIELD_RANGE(start, size) == imm) {
            bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
            return;
         }
         if (ctx->program->gfx_level >= GFX9) {
            Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
            Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
            if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
               bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
               return;
            }
         }
      }
   }

   if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
      op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */

   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_mov_b32, dst, op);
   } else if (dst.regClass() == s2) {
      /* s_ashr_i64 writes SCC, so we can't use it */
      assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
      uint64_t imm = op.constantValue64();
      if (op.isLiteral()) {
         unsigned start = (ffsll(imm) - 1) & 0x3f;
         unsigned size = util_bitcount64(imm) & 0x3f;
         if (BITFIELD64_RANGE(start, size) == imm) {
            bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
            return;
         }
      }
      bld.sop1(aco_opcode::s_mov_b64, dst, op);
   } else if (dst.regClass() == v2) {
      if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
         bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
      } else {
         assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
         bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
      }
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_mov_b32, dst, op);
   } else {
      assert(dst.regClass() == v1b || dst.regClass() == v2b);

      bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
      /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */
      bool can_use_perm = ctx->program->gfx_level >= GFX10 &&
                          (op.constantEquals(0) || op.constantEquals(0xff) ||
                           op.constantEquals(0xffff) || op.constantEquals(0xff00));
      if (dst.regClass() == v1b && use_sdwa) {
         uint8_t val = op.constantValue();
         Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
         if (op32.isLiteral()) {
            uint32_t a = (uint32_t)int8_mul_table[val * 2];
            uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
            bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
                          Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
                          Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
         } else {
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
         }
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
         emit_v_mov_b16(bld, dst, op);
      } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
         if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
            /* use v_mov_b32 to avoid possible issues with denormal flushing or
             * NaN. v_add_f16 is still needed for float constants. */
            uint32_t val32 = (int32_t)(int16_t)op.constantValue();
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
         } else {
            bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
         }
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10 &&
                 (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
         if (dst.physReg().byte() == 2) {
            Operand def_lo(dst.physReg().advance(-2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
            instr->valu().opsel = 0;
         } else {
            assert(dst.physReg().byte() == 0);
            Operand def_hi(dst.physReg().advance(2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
            instr->valu().opsel = 2;
         }
      } else if (can_use_perm) {
         uint8_t swiz[] = {4, 5, 6, 7};
         swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0;
         if (dst.bytes() == 2)
            swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0;
         create_bperm(bld, swiz, dst, Operand::zero());
      } else {
         uint32_t offset = dst.physReg().byte() * 8u;
         uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
         uint32_t val = (op.constantValue() << offset) & mask;
         dst = Definition(PhysReg(dst.physReg().reg()), v1);
         Operand def_op(dst.physReg(), v1);
         if (val != mask)
            bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
         if (val != 0)
            bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
      }
   }
}
void
copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   for (unsigned i = 0; i < 2; i++) {
      if (def.size() == 2)
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      else
         bld.vop1(aco_opcode::v_mov_b32, def, op);

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}
void
swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   Operand def_as_op = Operand(def.physReg(), def.regClass());
   Definition op_as_def = Definition(op.physReg(), op.regClass());

   for (unsigned i = 0; i < 2; i++) {
      if (bld.program->gfx_level >= GFX9) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else {
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      }

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}
void
addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
{
   Instruction* instr =
      bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
   if (src0.physReg().byte() == 2)
      instr->valu().opsel |= 0x1;
   if (src1.physReg().byte() == 2)
      instr->valu().opsel |= 0x2;
   if (dst.physReg().byte() == 2)
      instr->valu().opsel |= 0x8;
}
bool
do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
        PhysReg scratch_sgpr)
{
   bool did_copy = false;
   for (unsigned offset = 0; offset < copy.bytes;) {
      if (copy.uses[offset]) {
         offset++;
         continue;
      }

      Definition def;
      Operand op;
      split_copy(ctx, offset, &def, &op, copy, false, 8);

      if (def.physReg() == scc) {
         bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
         *preserve_scc = true;
      } else if (op.isConstant()) {
         copy_constant(ctx, bld, def, op);
      } else if (def.regClass().is_linear_vgpr()) {
         copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
      } else if (def.regClass() == v1) {
         bld.vop1(aco_opcode::v_mov_b32, def, op);
      } else if (def.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      } else if (def.regClass() == s1) {
         bld.sop1(aco_opcode::s_mov_b32, def, op);
      } else if (def.regClass() == s2) {
         bld.sop1(aco_opcode::s_mov_b64, def, op);
      } else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
         if (op.physReg().byte()) {
            assert(def.physReg().byte() == 0);
            bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
         } else if (def.physReg().byte()) {
            assert(op.physReg().byte() == 0);
            /* preserve the target's lower half */
            uint32_t bits = def.physReg().byte() * 8;
            PhysReg lo_reg = PhysReg(def.physReg().reg());
            Definition lo_half =
               Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
            Definition dst =
               Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));

            if (def.physReg().reg() == op.physReg().reg()) {
               bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
                        Operand(lo_reg, lo_half.regClass()));
               if (def.physReg().byte() == 1) {
                  bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
               } else if (def.physReg().byte() == 2) {
                  bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
               } else if (def.physReg().byte() == 3) {
                  bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
                           Operand::c32((1 << bits) + 1u));
                  bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
               }
            } else {
               lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
               bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
                        Operand(lo_reg, lo_half.regClass()));
               bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
                        Operand(lo_half.physReg(), lo_half.regClass()),
                        Operand::c32(4 - def.physReg().byte()));
            }
         } else {
            bld.vop1(aco_opcode::v_mov_b32, def, op);
         }
      } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
         uint8_t swiz[] = {4, 5, 6, 7};
         swiz[def.physReg().byte()] = op.physReg().byte();
         create_bperm(bld, swiz, def, op);
      } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
         emit_v_mov_b16(bld, def, op);
      } else if (def.regClass().is_subdword()) {
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
      } else {
         unreachable("unsupported copy");
      }

      did_copy = true;
      offset += def.bytes();
   }
   return did_copy;
}
void
swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
{
   if (def.physReg().reg() == op.physReg().reg()) {
      assert(def.bytes() != 2); /* handled by caller */
      uint8_t swiz[] = {4, 5, 6, 7};
      std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
      create_bperm(bld, swiz, def, Operand::zero());
      return;
   }

   if (def.bytes() == 2) {
      Operand def_as_op = Operand(def.physReg(), def.regClass());
      Definition op_as_def = Definition(op.physReg(), op.regClass());
      addsub_subdword_gfx11(bld, def, def_as_op, op, false);
      addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
      addsub_subdword_gfx11(bld, def, def_as_op, op, true);
   } else {
      PhysReg op_half = op.physReg();
      op_half.reg_b &= ~1;

      PhysReg def_other_half = def.physReg();
      def_other_half.reg_b &= ~1;
      def_other_half.reg_b ^= 2;

      /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
       * into the same VGPR.
       */
      swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
      swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
      swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
   }
}
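
/* Sketch of the 2-byte case above (assuming wrap-around 16-bit arithmetic): with a = def and
 * b = op, the three add/sub steps compute
 *    a' = a + b;   b' = a' - b = a;   a'' = a' - b' = b;
 * which swaps the two halves without a temporary register, the same identity as the classic
 * XOR swap but expressed with v_add_u16/v_sub_u16.
 */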
void
do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
        Pseudo_instruction* pi)
{
   unsigned offset = 0;

   if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
       (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
      /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
       * swap */
      PhysReg op = copy.op.physReg();
      PhysReg def = copy.def.physReg();
      op.reg_b &= ~0x3;
      def.reg_b &= ~0x3;

      copy_operation tmp;
      tmp.op = Operand(op, v1);
      tmp.def = Definition(def, v1);
      tmp.bytes = 4;
      memset(tmp.uses, 1, 4);
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      tmp.op = Operand(op, v1b);
      tmp.def = Definition(def, v1b);
      tmp.bytes = 1;
      memset(tmp.uses, 1, 1);
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      offset = copy.bytes;
   }

   for (; offset < copy.bytes;) {
      Definition def;
      Operand op;
      unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
      split_copy(ctx, offset, &def, &op, copy, true, max_size);

      assert(op.regClass() == def.regClass());
      Operand def_as_op = Operand(def.physReg(), def.regClass());
      Definition op_as_def = Definition(op.physReg(), op.regClass());
      if (def.regClass().is_linear_vgpr()) {
         swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
      } else if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else if (def.regClass() == v1) {
         assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      } else if (op.physReg() == scc || def.physReg() == scc) {
         /* we need to swap scc and another sgpr */
         assert(!preserve_scc);

         PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();

         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
                  Operand::zero());
         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
      } else if (def.regClass() == s1) {
         if (preserve_scc) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
            bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
         } else {
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
         }
      } else if (def.regClass() == s2) {
         if (preserve_scc)
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         if (preserve_scc)
            bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
                     Operand::zero());
      } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
         bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
                  Operand::c32(2u));
      } else {
         assert(def.regClass().is_subdword());
         if (ctx->program->gfx_level >= GFX11) {
            swap_subdword_gfx11(bld, def, op);
         } else {
            bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
            bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
            bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         }
      }

      offset += def.bytes();
   }

   if (ctx->program->gfx_level <= GFX7)
      return;

   /* fixup in case we swapped bytes we shouldn't have */
   copy_operation tmp_copy = copy;
   tmp_copy.op.setFixed(copy.def.physReg());
   tmp_copy.def.setFixed(copy.op.physReg());
   do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
}
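
/* A sketch of the fixup above (illustrative, not authoritative): the preceding loop may swap
 * more bytes than the copy actually requires, for example when a 3-byte swap is widened to a
 * full v_swap_b32, or when chunk sizes are chosen ignoring the per-byte use counts. Re-running
 * do_copy with operand and definition exchanged copies the bytes that nothing else needed
 * (uses[i] == 0) back from the definition to the operand register, undoing the unwanted part of
 * the swap. GFX6/7 return early since they lack the sub-dword writes this fixup relies on.
 */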
void
do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
   if (lo.isConstant() && hi.isConstant()) {
      copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
      return;
   }

   bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
                       (ctx->program->gfx_level >= GFX10 ||
                        (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));

   if (can_use_pack) {
      Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
      /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
      instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
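      /* For example (illustrative): if lo sits in the high half of its register (byte 2) and hi
       * in the low half (byte 0), this sets opsel = 0x1, i.e. "read src0's high 16 bits", so the
       * pack reads both f16 values from where they actually live.
       */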
      return;
   }

   /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
   if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
       (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
                                                    ctx->program->gfx_level >= GFX10)))) {
      if (hi.isConstant())
         bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
                  Operand::c32(2u));
      else
         bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
      return;
   }

   Definition def_lo = Definition(def.physReg(), v2b);
   Definition def_hi = Definition(def.physReg().advance(2), v2b);

   if (lo.isConstant()) {
      /* move hi and zero low bits */
      if (hi.physReg().byte() == 0)
         bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
      else
         bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
      if (lo.constantValue())
         bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
                  Operand(def.physReg(), v1));
      return;
   }
   if (hi.isConstant()) {
      /* move lo and zero high bits */
      if (lo.physReg().byte() == 2)
         bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
      else if (ctx->program->gfx_level >= GFX11)
         bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
      else
         bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
      if (hi.constantValue())
         bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
                  Operand(def.physReg(), v1));
      return;
   }

   if (lo.physReg().reg() == def.physReg().reg()) {
      /* lo is in the high bits of def */
      assert(lo.physReg().byte() == 2);
      bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
      lo.setFixed(def.physReg());
   } else if (hi.physReg() == def.physReg()) {
      /* hi is in the low bits of def */
      assert(hi.physReg().byte() == 0);
      bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
      hi.setFixed(def.physReg().advance(2));
   } else if (ctx->program->gfx_level >= GFX8) {
      /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
       * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
       */
      assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
      Operand& op = lo.physReg().byte() == 0 ? lo : hi;
      PhysReg reg = def.physReg().advance(op.physReg().byte());
      bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
      op.setFixed(reg);
   }

   /* either hi or lo are already placed correctly */
   if (ctx->program->gfx_level >= GFX11) {
      if (lo.physReg().reg() == def.physReg().reg())
         emit_v_mov_b16(bld, def_hi, hi);
      else
         emit_v_mov_b16(bld, def_lo, lo);
      return;
   } else if (ctx->program->gfx_level >= GFX8) {
      if (lo.physReg().reg() == def.physReg().reg())
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
      else
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
      return;
   }

   /* alignbyte needs the operands in the following way:
    * | xx hi | lo xx | >> 2 byte */
   if (lo.physReg().byte() != hi.physReg().byte()) {
      /* | xx lo | hi xx | => | lo hi | lo hi | */
      assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
      bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
      lo = Operand(def_hi.physReg(), v2b);
      hi = Operand(def_lo.physReg(), v2b);
   } else if (lo.physReg().byte() == 0) {
      /* | xx hi | xx lo | => | xx hi | lo 00 | */
      bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
      lo = Operand(def_hi.physReg(), v2b);
   } else {
      /* | hi xx | lo xx | => | 00 hi | lo xx | */
      assert(hi.physReg().byte() == 2);
      bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
      hi = Operand(def_lo.physReg(), v2b);
   }

   /* perform the alignbyte */
   bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
}
void
try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
                    copy_operation& copy)
{
   // TODO try more relaxed alignment for subdword copies
   unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
   unsigned next_op_align = next_def_align;
   if (copy.def.regClass().type() == RegType::vgpr)
      next_def_align = MIN2(next_def_align, 4);
   if (copy.op.regClass().type() == RegType::vgpr)
      next_op_align = MIN2(next_op_align, 4);

   if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
       (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
      return;

   auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
   if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
       copy.op.isConstant() != other->second.op.isConstant())
      return;

   /* don't create 64-bit copies before GFX10 */
   if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
       ctx->program->gfx_level < GFX10)
      return;

   unsigned new_size = copy.bytes + other->second.bytes;
   if (copy.op.isConstant()) {
      uint64_t val =
         copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
      if (!util_is_power_of_two_or_zero(new_size))
         return;
      if (!Operand::is_constant_representable(val, new_size, true,
                                              copy.def.regClass().type() == RegType::vgpr))
         return;
      copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
   } else {
      if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
         return;
      copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
   }

   copy.bytes = new_size;
   copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
   copy_map.erase(other);
}
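
/* Illustrative example (a sketch under the rules above): the entries v0 = v2 and v1 = v3 can
 * merge into a single v[0:1] = v[2:3] copy on GFX10+, because the destinations and the sources
 * are both contiguous and aligned; v0 = v2 and v1 = v5 cannot, since the sources are not
 * adjacent. Constant copies merge too, provided the combined value is still representable at
 * the wider size.
 */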
void
handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
                amd_gfx_level gfx_level, Pseudo_instruction* pi)
{
   Builder bld(ctx->program, &ctx->instructions);
   unsigned num_instructions_before = ctx->instructions.size();
   aco_ptr<Instruction> mov;
   bool writes_scc = false;

   /* count the number of uses for each dst reg */
   for (auto it = copy_map.begin(); it != copy_map.end();) {

      if (it->second.def.physReg() == scc)
         writes_scc = true;

      assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));

      /* if src and dst reg are the same, remove operation */
      if (it->first == it->second.op.physReg()) {
         it = copy_map.erase(it);
         continue;
      }

      /* split large copies */
      if (it->second.bytes > 8) {
         assert(!it->second.op.isConstant());
         assert(!it->second.def.regClass().is_subdword());
         RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
         Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
         rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
         Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
         copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
         copy_map[hi_def.physReg()] = copy;
         assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
         it->second.op = Operand(it->second.op.physReg(),
                                 it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
         it->second.def = Definition(it->second.def.physReg(),
                                     it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
         it->second.bytes = 8;
      }

      try_coalesce_copies(ctx, copy_map, it->second);

      /* check if the definition reg is used by another copy operation */
      for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
         if (copy.second.op.isConstant())
            continue;
         for (uint16_t i = 0; i < it->second.bytes; i++) {
            /* distance might underflow */
            unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
            if (distance < copy.second.bytes)
               it->second.uses[i] += 1;
         }
      }

      ++it;
   }
   /* first, handle paths in the location transfer graph */
   bool preserve_scc = pi->tmp_in_scc && !writes_scc;
   bool skip_partial_copies = true;
   for (auto it = copy_map.begin();;) {
      if (copy_map.empty()) {
         ctx->program->statistics[aco_statistic_copies] +=
            ctx->instructions.size() - num_instructions_before;
         return;
      }
      if (it == copy_map.end()) {
         if (!skip_partial_copies)
            break;
         skip_partial_copies = false;
         it = copy_map.begin();
      }

      /* check if we can pack one register at once */
      if (it->first.byte() == 0 && it->second.bytes == 2) {
         PhysReg reg_hi = it->first.advance(2);
         std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
         if (other != copy_map.end() && other->second.bytes == 2) {
            /* check if the target register is otherwise unused */
            bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
                                                     other->second.op.physReg() == it->first);
            bool unused_hi = !other->second.is_used ||
                             (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
            if (unused_lo && unused_hi) {
               Operand lo = it->second.op;
               Operand hi = other->second.op;
               do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
               copy_map.erase(it);
               copy_map.erase(other);

               for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
                  for (uint16_t i = 0; i < other2.second.bytes; i++) {
                     /* distance might underflow */
                     unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
                     unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
                     if (distance_lo < 2 || distance_hi < 2)
                        other2.second.uses[i] -= 1;
                  }
               }

               it = copy_map.begin();
               continue;
            }
         }
      }

      /* on GFX6/7, we need some small workarounds as there is no
       * SDWA instruction to do partial register writes */
      if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
         if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
             pi->opcode == aco_opcode::p_split_vector) {
            /* Other operations might overwrite the high bits, so change all users
             * of the high bits to the new target where they are still available.
             * This mechanism depends on also emitting dead definitions. */
            PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
            while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
               std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
               for (other = copy_map.begin(); other != copy_map.end(); other++) {
                  /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
                  if (other->second.op.physReg() == reg_hi) {
                     other->second.op.setFixed(it->first.advance(reg_hi.byte()));
                     break; /* break because an operand can only be used once */
                  }
               }
               reg_hi = reg_hi.advance(it->second.bytes);
            }
         } else if (it->first.byte()) {
            assert(pi->opcode == aco_opcode::p_create_vector);
            /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
             * move to the target operand's high bits. This is safe to do as it cannot be an operand
             */
            PhysReg lo = PhysReg(it->first.reg());
            std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
            if (other != copy_map.end()) {
               assert(other->second.bytes == it->first.byte());
               PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
               it->second.def = Definition(new_reg_hi, it->second.def.regClass());
               it->second.is_used = 0;
               other->second.bytes += it->second.bytes;
               other->second.def.setTemp(Temp(other->second.def.tempId(),
                                              RegClass::get(RegType::vgpr, other->second.bytes)));
               other->second.op.setTemp(Temp(other->second.op.tempId(),
                                             RegClass::get(RegType::vgpr, other->second.bytes)));
               /* if the new target's high bits are also a target, change uses */
               std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
               if (target != copy_map.end()) {
                  for (unsigned i = 0; i < it->second.bytes; i++)
                     target->second.uses[i]++;
               }
            }
         }
      }

      /* find portions where the target reg is not used as operand for any other copy */
      if (it->second.is_used) {
         if (it->second.op.isConstant() || skip_partial_copies) {
            /* we have to skip constants until is_used=0.
             * we also skip partial copies at the beginning to help coalescing */
            ++it;
            continue;
         }

         unsigned has_zero_use_bytes = 0;
         for (unsigned i = 0; i < it->second.bytes; i++)
            has_zero_use_bytes |= (it->second.uses[i] == 0) << i;

         if (has_zero_use_bytes) {
            /* Skipping partial copying and doing a v_swap_b32 and then fixup
             * copies is usually beneficial for sub-dword copies, but if doing
             * a partial copy allows further copies, it should be done instead. */
            bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
            for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
               /* on GFX6/7, we can only do copies with full registers */
               if (partial_copy || ctx->program->gfx_level <= GFX7)
                  break;
               for (uint16_t i = 0; i < copy.second.bytes; i++) {
                  /* distance might underflow */
                  unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
                  if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
                      !it->second.uses[distance])
                     partial_copy = true;
               }
            }

            if (!partial_copy) {
               ++it;
               continue;
            }
         } else {
            /* full target reg is used: register swapping needed */
            ++it;
            continue;
         }
      }

      bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
      skip_partial_copies = did_copy;
      std::pair<PhysReg, copy_operation> copy = *it;

      if (it->second.is_used == 0) {
         /* the target reg is not used as operand for any other copy, so we
          * copied to all of it */
         copy_map.erase(it);
         it = copy_map.begin();
      } else {
         /* we only performed some portions of this copy, so split it to only
          * leave the portions that still need to be done */
         copy_operation original = it->second; /* the map insertion below can overwrite this */
         copy_map.erase(it);
         for (unsigned offset = 0; offset < original.bytes;) {
            if (original.uses[offset] == 0) {
               offset++;
               continue;
            }
            Definition def;
            Operand op;
            split_copy(ctx, offset, &def, &op, original, false, 8);

            copy_operation new_copy = {op, def, def.bytes()};
            for (unsigned i = 0; i < new_copy.bytes; i++)
               new_copy.uses[i] = original.uses[i + offset];
            copy_map[def.physReg()] = new_copy;

            offset += def.bytes();
         }

         it = copy_map.begin();
      }

      /* Reduce the number of uses of the operand reg by one. Do this after
       * splitting the copy or removing it in case the copy writes to its own
       * operand (for example, v[7:8] = v[8:9]) */
      if (did_copy && !copy.second.op.isConstant()) {
         for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
            for (uint16_t i = 0; i < other.second.bytes; i++) {
               /* distance might underflow */
               unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
               if (distance < copy.second.bytes && !copy.second.uses[distance])
                  other.second.uses[i] -= 1;
            }
         }
      }
   }
   /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */
   unsigned largest = 0;
   for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
      largest = MAX2(largest, op.second.bytes);

   while (!copy_map.empty()) {

      /* Perform larger swaps first, because larger swaps can make other
       * swaps unnecessary. */
      auto it = copy_map.begin();
      for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
         if (it2->second.bytes > it->second.bytes) {
            it = it2;
            if (it->second.bytes == largest)
               break;
         }
      }

      /* should already be done */
      assert(!it->second.op.isConstant());

      assert(it->second.op.isFixed());
      assert(it->second.def.regClass() == it->second.op.regClass());

      if (it->first == it->second.op.physReg()) {
         copy_map.erase(it);
         continue;
      }

      if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
         assert(!(it->second.def.physReg() == pi->scratch_sgpr));

      /* to resolve the cycle, we have to swap the src reg with the dst reg */
      copy_operation swap = it->second;

      /* if this is self-intersecting, we have to split it because
       * self-intersecting swaps don't make sense */
      PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
      if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
         unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);

         copy_operation remaining;
         src.reg_b += offset;
         dst.reg_b += offset;
         remaining.bytes = swap.bytes - offset;
         memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
         remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
         remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
         copy_map[dst] = remaining;

         memset(swap.uses + offset, 0, swap.bytes - offset);
         swap.bytes = offset;
      }

      /* GFX6-7 can only swap full registers */
      if (ctx->program->gfx_level <= GFX7)
         swap.bytes = align(swap.bytes, 4);

      do_swap(ctx, bld, swap, preserve_scc, pi);

      /* remove from map */
      copy_map.erase(it);

      /* change the operand reg of the target's uses and split uses if needed */
      uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
      for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
         if (target->second.op.physReg() == swap.def.physReg() &&
             swap.bytes == target->second.bytes) {
            target->second.op.setFixed(swap.op.physReg());
            break;
         }

         uint32_t imask =
            get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
                                  target->second.op.physReg().reg_b, target->second.bytes);

         if (!imask)
            continue;

         int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;

         /* split and update the middle (the portion that reads the swap's
          * definition) to read the swap's operand instead */
         int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
         int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
         int before_bytes = MAX2(-offset, 0);
         int after_bytes = MAX2(target_op_end - swap_def_end, 0);
         int middle_bytes = target->second.bytes - before_bytes - after_bytes;

         if (after_bytes) {
            unsigned after_offset = before_bytes + middle_bytes;
            assert(after_offset > 0);
            copy_operation copy;
            copy.bytes = after_bytes;
            memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
            RegClass rc = target->second.op.regClass().resize(after_bytes);
            copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
            copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
            copy_map[copy.def.physReg()] = copy;
         }

         if (middle_bytes) {
            copy_operation copy;
            copy.bytes = middle_bytes;
            memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
            RegClass rc = target->second.op.regClass().resize(middle_bytes);
            copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
            copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
            copy_map[copy.def.physReg()] = copy;
         }

         if (before_bytes) {
            copy_operation copy;
            target->second.bytes = before_bytes;
            RegClass rc = target->second.op.regClass().resize(before_bytes);
            target->second.op = Operand(target->second.op.physReg(), rc);
            target->second.def = Definition(target->second.def.physReg(), rc);
            memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
         }

         /* break early since we know each byte of the swap's definition is used
          * at most once */
         bytes_left &= ~imask;
         if (!bytes_left)
            break;
      }
   }
   ctx->program->statistics[aco_statistic_copies] +=
      ctx->instructions.size() - num_instructions_before;
}
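
/* Worked example of the cycle handling above (illustrative only): with copy_map containing
 * v0 = v1 and v1 = v0, neither entry can be emitted as a plain copy because each destination is
 * still read by the other entry. Picking one entry and swapping (v_swap_b32 on GFX9+, the XOR
 * sequence otherwise) satisfies both at once; the surviving entry's operand is then redirected
 * to the swap's source, turning it into a self-copy that is dropped on the next iteration.
 */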
void
emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
{
   if (bld.program->gfx_level >= GFX10) {
      if (set_round)
         bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
      if (set_denorm)
         bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
   } else if (set_round || set_denorm) {
      /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
      bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
   }
}
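
/* A sketch of the s_setreg encoding used above (assuming the usual SIMM16 hwreg() layout of
 * id[5:0] | offset[10:6] | (size-1)[15:11]): (7 << 11) | 1 addresses hardware register 1 (MODE)
 * at bit offset 0 with size 8, i.e. the eight mode bits holding both the FP rounding and the FP
 * denormal fields in a single write.
 */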
void
emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
{
   float_mode config_mode;
   config_mode.val = program.config->float_mode;

   bool set_round = always_set && block->fp_mode.round != config_mode.round;
   bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
   if (block->kind & block_kind_top_level) {
      for (unsigned pred : block->linear_preds) {
         if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
            set_round = true;
         if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
            set_denorm = true;
      }
   }
   /* only allow changing modes at top-level blocks so this doesn't break
    * the "jump over empty blocks" optimization */
   assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
   emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
}
void
hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
{
   /* Since we know what the high 16 bits of scratch_hi are, we can set all the high 16
    * bits in the same instruction that adds the carry.
    */
   Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
   Operand scratch_addr_lo(scratch_addr.physReg(), s1);
   Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);

   if (bld.program->gfx_level >= GFX10) {
      PhysReg scratch_lo = def.physReg();
      PhysReg scratch_hi = def.physReg().advance(4);

      bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
               scratch_addr_lo, scratch_offset);
      bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
               scratch_addr_hi, hi_add, Operand(scc, s1));

      /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
       * 20/21) */
      bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
      bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
   } else {
      bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
               scratch_addr_lo, scratch_offset);
      bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
               scratch_addr_hi, hi_add, Operand(scc, s1));
   }
}
void
lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
{
   Operand linear_vgpr = instr->operands[3];

   unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
   unsigned vaddr_size = linear_vgpr.size();
   unsigned num_copied_vgprs = instr->operands.size() - 4;
   nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
                 ? nsa_size
                 : 0;

   Operand vaddr[16];
   unsigned num_vaddr = 0;

   if (nsa_size) {
      assert(num_copied_vgprs <= nsa_size);
      for (unsigned i = 0; i < num_copied_vgprs; i++)
         vaddr[num_vaddr++] = instr->operands[4 + i];
      for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
         vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
      if (vaddr_size > nsa_size) {
         RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
         vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
      }
   } else {
      PhysReg reg = linear_vgpr.physReg();
      std::map<PhysReg, copy_operation> copy_operations;
      for (unsigned i = 4; i < instr->operands.size(); i++) {
         Operand arg = instr->operands[i];
         Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
         copy_operations[def.physReg()] = {arg, def, def.bytes()};
         reg = reg.advance(arg.bytes());
      }
      vaddr[num_vaddr++] = linear_vgpr;

      Pseudo_instruction pi = {};
      handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
   }

   instr->mimg().strict_wqm = false;

   if ((3 + num_vaddr) > instr->operands.size()) {
      MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
         instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
      std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
                new_instr->definitions.begin());
      new_instr->operands[0] = instr->operands[0];
      new_instr->operands[1] = instr->operands[1];
      new_instr->operands[2] = instr->operands[2];
      memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
             sizeof(MIMG_instruction) - sizeof(Instruction));
      instr.reset(new_instr);
   } else {
      while (instr->operands.size() > (3 + num_vaddr))
         instr->operands.pop_back();
   }
   std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
}
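
/* A note on the two paths above (a sketch of the intent, not authoritative): when the
 * non-sequential-address (NSA) encoding has room, the extra address operands can be passed to
 * the MIMG instruction as separate VGPR operands; otherwise they are first gathered into the
 * contiguous linear VGPR with a parallel copy (handle_operands), so a single sequential vaddr
 * operand can be used instead.
 */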
void
lower_to_hw_instr(Program* program)
{
   gfx9_pops_done_msg_bounds pops_done_msg_bounds;
   if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
      pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
   }

   Block* discard_exit_block = NULL;
   Block* discard_pops_done_and_exit_block = NULL;

   bool should_dealloc_vgprs = dealloc_vgprs(program);

   for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
      Block* block = &program->blocks[block_idx];
      lower_context ctx;
      ctx.program = program;
      ctx.block = block;
      ctx.instructions.reserve(block->instructions.size());
      Builder bld(program, &ctx.instructions);

      emit_set_mode_from_block(bld, *program, block, (block_idx == 0));

      for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
         aco_ptr<Instruction>& instr = block->instructions[instr_idx];

         /* Send the ordered section done message from the middle of the block if needed (if the
          * ordered section is ended by an instruction inside this block).
          * Also make sure the done message is sent if it's needed in case early exit happens for
          * any reason.
          */
         if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
              instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
             (instr->opcode == aco_opcode::s_endpgm &&
              pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
            bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
         }

         aco_ptr<Instruction> mov;
         if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
            Pseudo_instruction* pi = &instr->pseudo();

            switch (instr->opcode) {
            case aco_opcode::p_extract_vector: {
               PhysReg reg = instr->operands[0].physReg();
               Definition& def = instr->definitions[0];
               reg.reg_b += instr->operands[1].constantValue() * def.bytes();

               if (reg == def.physReg())
                  break;

               RegClass op_rc = def.regClass().is_subdword()
                                   ? def.regClass()
                                   : RegClass(instr->operands[0].getTemp().type(), def.size());
               std::map<PhysReg, copy_operation> copy_operations;
               copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_create_vector: {
               std::map<PhysReg, copy_operation> copy_operations;
               PhysReg reg = instr->definitions[0].physReg();

               for (const Operand& op : instr->operands) {
                  if (op.isConstant()) {
                     const Definition def = Definition(
                        reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
                     copy_operations[reg] = {op, def, op.bytes()};
                     reg.reg_b += op.bytes();
                     continue;
                  }
                  if (op.isUndefined()) {
                     // TODO: coalesce subdword copies if dst byte is 0
                     reg.reg_b += op.bytes();
                     continue;
                  }

                  RegClass rc_def =
                     op.regClass().is_subdword()
                        ? op.regClass()
                        : instr->definitions[0].getTemp().regClass().resize(op.bytes());
                  const Definition def = Definition(reg, rc_def);
                  copy_operations[def.physReg()] = {op, def, op.bytes()};
                  reg.reg_b += op.bytes();
               }
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_split_vector: {
               std::map<PhysReg, copy_operation> copy_operations;
               PhysReg reg = instr->operands[0].physReg();

               for (const Definition& def : instr->definitions) {
                  RegClass rc_op = def.regClass().is_subdword()
                                      ? def.regClass()
                                      : instr->operands[0].getTemp().regClass().resize(def.bytes());
                  const Operand op = Operand(reg, rc_op);
                  copy_operations[def.physReg()] = {op, def, def.bytes()};
                  reg.reg_b += def.bytes();
               }
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_parallelcopy: {
               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned j = 0; j < instr->operands.size(); j++) {
                  assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
                  copy_operations[instr->definitions[j].physReg()] = {
                     instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
               }
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_start_linear_vgpr: {
               if (instr->operands.empty())
                  break;

               Definition def(instr->definitions[0].physReg(),
                              RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));

               std::map<PhysReg, copy_operation> copy_operations;
               copy_operations[def.physReg()] = {instr->operands[0], def,
                                                 instr->operands[0].bytes()};
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_exit_early_if: {
               /* don't bother with an early exit near the end of the program */
               if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
                   block->instructions.back()->opcode == aco_opcode::s_endpgm) {
                  unsigned null_exp_dest =
                     program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
                  bool ignore_early_exit = true;

                  for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
                     const aco_ptr<Instruction>& instr2 = block->instructions[k];
                     if (instr2->opcode == aco_opcode::s_endpgm ||
                         instr2->opcode == aco_opcode::p_logical_end)
                        continue;
                     else if (instr2->opcode == aco_opcode::exp &&
                              instr2->exp().dest == null_exp_dest &&
                              instr2->exp().enabled_mask == 0)
                        continue;
                     else if (instr2->opcode == aco_opcode::p_parallelcopy &&
                              instr2->definitions[0].isFixed() &&
                              instr2->definitions[0].physReg() == exec)
                        continue;

                     ignore_early_exit = false;
                  }

                  if (ignore_early_exit)
                     break;
               }

               const bool discard_sends_pops_done =
                  pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);

               Block* discard_block =
                  discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
               if (!discard_block) {
                  discard_block = program->create_and_insert_block();
                  discard_block->kind = block_kind_discard_early_exit;
                  if (discard_sends_pops_done) {
                     discard_pops_done_and_exit_block = discard_block;
                  } else {
                     discard_exit_block = discard_block;
                  }
                  block = &program->blocks[block_idx];

                  bld.reset(discard_block);
                  if (program->has_pops_overlapped_waves_wait &&
                      (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
                     /* If this discard early exit potentially exits the POPS ordered section, do
                      * the waitcnt necessary before resuming overlapping waves as the normal
                      * waitcnt insertion doesn't work in a discard early exit block.
                      */
                     if (program->gfx_level >= GFX10)
                        bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
                     wait_imm pops_exit_wait_imm;
                     pops_exit_wait_imm.vm = 0;
                     if (program->has_smem_buffer_or_global_loads)
                        pops_exit_wait_imm.lgkm = 0;
                     bld.sopp(aco_opcode::s_waitcnt, -1,
                              pops_exit_wait_imm.pack(program->gfx_level));
                  }
                  if (discard_sends_pops_done)
                     bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
                  unsigned target = V_008DFC_SQ_EXP_NULL;
                  if (program->gfx_level >= GFX11)
                     target =
                        program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
                  if (program->stage == fragment_fs)
                     bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                             target, false, true, true);
                  if (should_dealloc_vgprs)
                     bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);
               }

               assert(instr->operands[0].physReg() == scc);
               bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);

               discard_block->linear_preds.push_back(block->index);
               block->linear_succs.push_back(discard_block->index);
               break;
            }
            case aco_opcode::p_spill: {
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->operands[2].size(); i++) {
                  Operand src =
                     instr->operands[2].isConstant()
                        ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
                        : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
                  bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
                                Operand::c32(instr->operands[1].constantValue() + i),
                                instr->operands[0]);
               }
               break;
            }
            case aco_opcode::p_reload: {
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->definitions[0].size(); i++)
                  bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                               instr->operands[0],
                               Operand::c32(instr->operands[1].constantValue() + i));
               break;
            }
            case aco_opcode::p_as_uniform: {
               if (instr->operands[0].isConstant() ||
                   instr->operands[0].regClass().type() == RegType::sgpr) {
                  std::map<PhysReg, copy_operation> copy_operations;
                  copy_operations[instr->definitions[0].physReg()] = {
                     instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
                  handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               } else {
                  assert(instr->operands[0].regClass().type() == RegType::vgpr);
                  assert(instr->definitions[0].regClass().type() == RegType::sgpr);
                  assert(instr->operands[0].size() == instr->definitions[0].size());
                  for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                     bld.vop1(aco_opcode::v_readfirstlane_b32,
                              bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                              Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
                  }
               }
               break;
            }
            case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
               bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
                        Operand(pops_exiting_wave_id, s1), instr->operands[0]);
               break;
            }
            case aco_opcode::p_bpermute_readlane: {
               emit_bpermute_readlane(program, instr, bld);
               break;
            }
            case aco_opcode::p_bpermute_shared_vgpr: {
               emit_bpermute_shared_vgpr(program, instr, bld);
               break;
            }
            case aco_opcode::p_bpermute_permlane: {
               emit_bpermute_permlane(program, instr, bld);
               break;
            }
            case aco_opcode::p_constaddr: {
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
               bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
                        Operand(reg, s1), instr->operands[0], Operand::c32(id));
               /* s_addc_u32 not needed because the program is in a 32-bit VA range */
               break;
            }
            case aco_opcode::p_resume_shader_address: {
               /* Find index of resume block. */
               unsigned resume_idx = instr->operands[0].constantValue();
               unsigned resume_block_idx = 0;
               for (Block& resume_block : program->blocks) {
                  if (resume_block.kind & block_kind_resume) {
                     if (resume_idx == 0) {
                        resume_block_idx = resume_block.index;
                        break;
                     }
                     resume_idx--;
                  }
               }
               assert(resume_block_idx != 0);
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
               bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
                        Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
               /* s_addc_u32 not needed because the program is in a 32-bit VA range */
               break;
            }
            case aco_opcode::p_extract: {
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands[3].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;
               bool signext = !instr->operands[3].constantEquals(0);

               if (dst.regClass() == s1) {
                  if (offset == (32 - bits)) {
                     bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
                              bld.def(s1, scc), op, Operand::c32(offset));
                  } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
                     bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
                              dst, op);
                  } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
                     bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
                  } else {
                     bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
                              bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
                  }
               } else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
                          ctx.program->gfx_level <= GFX7) {
                  assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
                  if (offset == (32 - bits) && op.regClass() != s1) {
                     bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
                              Operand::c32(offset), op);
                  } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
                     bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
                              op);
                  } else {
                     bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
                              Operand::c32(offset), Operand::c32(bits));
                  }
               } else {
                  assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
                         op.regClass() == v1b);
                  if (ctx.program->gfx_level >= GFX11) {
                     unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
                     unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;

                     uint8_t swiz[4] = {4, 5, 6, 7};
                     swiz[dst.physReg().byte()] = op_vgpr_byte;
                     if (bits == 16)
                        swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
                     for (unsigned i = bits / 8; i < dst.bytes(); i++) {
                        uint8_t ext = bperm_0;
                        if (signext) {
                           if (sign_byte == 1)
                              ext = bperm_b1_sign;
                           else if (sign_byte == 3)
                              ext = bperm_b3_sign;
                           else /* replicate so sign-extension can be done later */
                              ext = sign_byte;
                        }
                        swiz[dst.physReg().byte() + i] = ext;
                     }
                     create_bperm(bld, swiz, dst, op);

                     if (signext && sign_byte != 3 && sign_byte != 1) {
                        assert(sign_byte == 0 || sign_byte == 2);
                        assert(dst.regClass() == v2b || dst.regClass() == v1);
                        uint8_t ext_swiz[4] = {4, 5, 6, 7};
                        uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
                        memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
                        create_bperm(bld, ext_swiz, dst, Operand::zero());
                     }
                  } else {
                     SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
                     sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
                  }
               }
               break;
            }
            case aco_opcode::p_insert: {
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;

               bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
               if (dst.regClass() == s1) {
                  if (offset == (32 - bits)) {
                     bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
                              Operand::c32(offset));
                  } else if (offset == 0) {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
                              Operand::c32(bits << 16));
                  } else {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
                              Operand::c32(bits << 16));
                     bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
                              Operand(dst.physReg(), s1), Operand::c32(offset));
                  }
               } else if (dst.regClass() == v1 || !has_sdwa) {
                  if (offset == (dst.bytes() * 8u - bits) &&
                      (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
                  } else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                  } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
                        SubdwordSel(bits / 8, offset / 8, false);
                  } else if (program->gfx_level >= GFX11) {
                     uint8_t swiz[] = {4, 5, 6, 7};
                     for (unsigned i = 0; i < dst.bytes(); i++)
                        swiz[dst.physReg().byte() + i] = bperm_0;
                     for (unsigned i = 0; i < bits / 8; i++)
                        swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
                     create_bperm(bld, swiz, dst, op);
                  } else {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
                              Operand(dst.physReg(), v1));
                  }
               } else {
                  assert(dst.regClass() == v2b);
                  bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
                     ->sdwa()
                     .sel[1] = SubdwordSel::ubyte;
               }
               break;
            }
            case aco_opcode::p_init_scratch: {
               assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
               if (!program->config->scratch_bytes_per_wave)
                  break;

               Operand scratch_addr = instr->operands[0];
               if (scratch_addr.isUndefined()) {
                  PhysReg reg = instr->definitions[0].physReg();
                  bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
                           Operand::c32(aco_symbol_scratch_addr_lo));
                  bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
                           Operand::c32(aco_symbol_scratch_addr_hi));
                  scratch_addr.setFixed(reg);
               } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
                  bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
                           Operand::zero());
                  scratch_addr.setFixed(instr->definitions[0].physReg());
               }

               hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
               break;
            }
            case aco_opcode::p_jump_to_epilog: {
               if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
                  bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
               }
               bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
               break;
            }
            case aco_opcode::p_interp_gfx11: {
               assert(instr->definitions[0].regClass() == v1 ||
                      instr->definitions[0].regClass() == v2b);
               assert(instr->operands[0].regClass() == v1.as_linear());
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands.back().physReg() == m0);
               Definition dst = instr->definitions[0];
               PhysReg lin_vgpr = instr->operands[0].physReg();
               unsigned attribute = instr->operands[1].constantValue();
               unsigned component = instr->operands[2].constantValue();
               uint16_t dpp_ctrl = 0;
               Operand coord1, coord2;
               if (instr->operands.size() == 6) {
                  assert(instr->operands[3].regClass() == v1);
                  assert(instr->operands[4].regClass() == v1);
                  coord1 = instr->operands[3];
                  coord2 = instr->operands[4];
               } else {
                  assert(instr->operands[3].isConstant());
                  dpp_ctrl = instr->operands[3].constantValue();
               }

               bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
                          attribute, component);

               Operand p(lin_vgpr, v1);
               Operand dst_op(dst.physReg(), v1);
               if (instr->operands.size() == 5) {
                  bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
               } else if (dst.regClass() == v2b) {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
                                    coord1, p);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
                                    coord2, dst_op);
               } else {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
                                    p);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
                                    dst_op);
               }
               break;
            }
            case aco_opcode::p_dual_src_export_gfx11: {
               PhysReg dst0 = instr->definitions[0].physReg();
               PhysReg dst1 = instr->definitions[1].physReg();
               Definition exec_tmp = instr->definitions[2];
               Definition not_vcc_tmp = instr->definitions[3];
               Definition clobber_vcc = instr->definitions[4];
               Definition clobber_scc = instr->definitions[5];

               assert(exec_tmp.regClass() == bld.lm);
               assert(not_vcc_tmp.regClass() == bld.lm);
               assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
               assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);

               bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
                        Operand(exec, bld.lm));
               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
                        Operand(exec, bld.lm));

               uint8_t enabled_channels = 0;
               Operand mrt0[4], mrt1[4];

               bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
                        Operand::c32(0x55555555));
               if (ctx.program->wave_size == 64)
                  bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
                           Operand::c32(0x55555555));

               Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);

               bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);

               Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);

               for (unsigned i = 0; i < 4; i++) {
                  if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
                     mrt0[i] = instr->operands[i];
                     mrt1[i] = instr->operands[i + 4];
                     continue;
                  }

                  Operand src0 = instr->operands[i];
                  Operand src1 = instr->operands[i + 4];

                  /*      | even lanes | odd lanes
                   * mrt0 | src0 even  | src1 even
                   * mrt1 | src0 odd   | src1 odd
                   */
                  bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
                               src_even, dpp_row_xmask(1));
                  bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
                                   src_odd, dpp_row_xmask(1));

                  mrt0[i] = Operand(dst0, v1);
                  mrt1[i] = Operand(dst1, v1);

                  enabled_channels |= 1 << i;

                  dst0 = dst0.advance(4);
                  dst1 = dst1.advance(4);
               }

               bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
                        Operand(exec_tmp.physReg(), bld.lm));

               /* Force export all channels when everything is undefined. */
               if (!enabled_channels)
                  enabled_channels = 0xf;

               bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 21, false);
               bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 22, false);
               break;
            }
            default: break;
            }
         } else if (instr->isBranch()) {
            Pseudo_branch_instruction* branch = &instr->branch();
            const uint32_t target = branch->target[0];
            const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
                                           branch->opcode == aco_opcode::p_cbranch_nz) &&
                                          branch->operands[0].physReg() == exec);

            /* Check if the branch instruction can be removed.
             * This is beneficial when executing the next block with an empty exec mask
             * is faster than the branch instruction itself.
             *
             * Override this judgement when:
             * - The application prefers to remove control flow
             * - The compiler stack knows that it's a divergent branch always taken
             */
            const bool prefer_remove =
               branch->selection_control_remove && ctx.program->gfx_level >= GFX10;
            bool can_remove = block->index < target;
            unsigned num_scalar = 0;
            unsigned num_vector = 0;

            /* Check the instructions between branch and target */
            for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
               /* Uniform conditional branches must not be ignored if they
                * are about to jump over actual instructions */
               if (uniform_branch && !program->blocks[i].instructions.empty())
                  can_remove = false;

               if (!can_remove)
                  break;

               for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
                  if (inst->isSOPP()) {
                     /* Discard early exits and loop breaks and continues should work fine with an
                      * empty exec mask.
                      */
                     bool is_break_continue =
                        program->blocks[i].kind & (block_kind_break | block_kind_continue);
                     bool discard_early_exit =
                        inst->sopp().block != -1 &&
                        (program->blocks[inst->sopp().block].kind & block_kind_discard_early_exit);
                     if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
                          inst->opcode != aco_opcode::s_cbranch_scc1) ||
                         (!discard_early_exit && !is_break_continue))
                        can_remove = false;
                  } else if (inst->isSALU()) {
                     num_scalar++;
                  } else if (inst->isVALU() || inst->isVINTRP()) {
                     num_vector++;
                     /* VALU which writes SGPRs are always executed on GFX10+ */
                     if (ctx.program->gfx_level >= GFX10) {
                        for (Definition& def : inst->definitions) {
                           if (def.regClass().type() == RegType::sgpr)
                              num_scalar++;
                        }
                     }
                  } else if (inst->isEXP()) {
                     /* Export instructions with exec=0 can hang some GFX10+ (unclear on old GPUs). */
                     can_remove = false;
                  } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
                             inst->isLDSDIR()) {
                     // TODO: GFX6-9 can use vskip
                     can_remove = prefer_remove;
                  } else if (inst->isSMEM()) {
                     /* SMEM are at least as expensive as branches */
                     can_remove = prefer_remove;
                  } else if (inst->isBarrier()) {
                     can_remove = prefer_remove;
                  } else {
                     can_remove = false;
                     assert(false && "Pseudo instructions should be lowered by this point.");
                  }

                  if (!prefer_remove) {
                     /* Under these conditions, we shouldn't remove the branch.
                      * Don't care about the estimated cycles when the shader prefers flattening.
                      */
                     unsigned est_cycles;
                     if (ctx.program->gfx_level >= GFX10)
                        est_cycles = num_scalar * 2 + num_vector;
                     else
                        est_cycles = num_scalar * 4 + num_vector * 4;

                     if (est_cycles > 16)
                        can_remove = false;
                  }

                  if (!can_remove)
                     break;
               }
            }

            if (can_remove)
               continue;

            /* emit branch instruction */
            switch (instr->opcode) {
            case aco_opcode::p_branch:
               assert(block->linear_succs[0] == target);
               bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
               break;
            case aco_opcode::p_cbranch_nz:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
               }
               break;
            case aco_opcode::p_cbranch_z:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
               }
               break;
            default: unreachable("Unknown Pseudo branch instruction!");
            }
         } else if (instr->isReduction()) {
            Pseudo_reduction_instruction& reduce = instr->reduction();
            emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
                           reduce.operands[1].physReg(),    // tmp
                           reduce.definitions[1].physReg(), // stmp
                           reduce.operands[2].physReg(),    // vtmp
                           reduce.definitions[2].physReg(), // sitmp
                           reduce.operands[0], reduce.definitions[0]);
         } else if (instr->isBarrier()) {
            Pseudo_barrier_instruction& barrier = instr->barrier();

            /* Anything larger than a workgroup isn't possible. Anything
             * smaller requires no instructions and this pseudo instruction
             * would only be included to control optimizations. */
            bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
                                  program->workgroup_size > program->wave_size;

            bld.insert(std::move(instr));
            if (emit_s_barrier)
               bld.sopp(aco_opcode::s_barrier);
         } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
            float_mode new_mode = block->fp_mode;
            new_mode.round16_64 = fp_round_ne;
            bool set_round = new_mode.round != block->fp_mode.round;

            emit_set_mode(bld, new_mode, set_round, false);

            instr->opcode = aco_opcode::v_cvt_f16_f32;
            ctx.instructions.emplace_back(std::move(instr));

            emit_set_mode(bld, block->fp_mode, set_round, false);
         } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
            lower_image_sample(&ctx, instr);
            ctx.instructions.emplace_back(std::move(instr));
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }
      }

      /* Send the ordered section done message from this block if it's needed in this block, but
       * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
       * if the common post-dominator of multiple end locations turns out to be an empty block.
       */
      if (block_idx == pops_done_msg_bounds.end_block_idx() &&
          pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
         bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
      }

      block->instructions = std::move(ctx.instructions);
   }
}