2 * Copyright © 2018 Valve Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 #include "aco_opcodes.h"
29 #include "aco_shader_info.h"
32 #include "util/compiler.h"
34 #include "ac_binary.h"
35 #include "amd_family.h"
41 typedef struct nir_shader nir_shader;
45 extern uint64_t debug_flags;
48 DEBUG_VALIDATE_IR = 0x1,
49 DEBUG_VALIDATE_RA = 0x2,
51 DEBUG_FORCE_WAITCNT = 0x8,
54 DEBUG_NO_SCHED = 0x40,
55 DEBUG_PERF_INFO = 0x80,
56 DEBUG_LIVE_INFO = 0x100,
57 DEBUG_FORCE_WAITDEPS = 0x200,
58 DEBUG_NO_VALIDATE_IR = 0x400,
62 * Representation of the instruction's microcode encoding format
63 * Note: Some Vector ALU Formats can be combined, such that:
64 * - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
65 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
66 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
68 * (*) The same is applicable for VOP1 and VOPC instructions.
70 enum class Format : std::uint16_t {
71 /* Pseudo Instruction Format */
73 /* Scalar ALU & Control Formats */
79 /* Scalar Memory Format */
84 /* Vector Memory Buffer Formats */
87 /* Vector Memory Image Format */
98 PSEUDO_REDUCTION = 19,
100 /* Vector ALU Formats */
107 /* Vector Parameter Interpolation Format */
114 enum class instr_class : uint8_t {
118 valu_quarter_rate32 = 3,
120 valu_transcendental32 = 5,
123 valu_double_convert = 8,
124 valu_double_transcendental = 9,
138 enum storage_class : uint8_t {
139 storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
140 storage_buffer = 0x1, /* SSBOs and global memory */
143 storage_shared = 0x8, /* or TCS output */
144 storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
145 storage_task_payload = 0x20,/* Task-Mesh payload */
146 storage_scratch = 0x40,
147 storage_vgpr_spill = 0x80,
148 storage_count = 8, /* not counting storage_none */
151 enum memory_semantics : uint8_t {
153 /* for loads: don't move any access after this load to before this load (even other loads)
154 * for barriers: don't move any access after the barrier to before any
155 * atomics/control_barriers/sendmsg_gs_done before the barrier */
156 semantic_acquire = 0x1,
157 /* for stores: don't move any access before this store to after this store
158 * for barriers: don't move any access before the barrier to after any
159 * atomics/control_barriers/sendmsg_gs_done after the barrier */
160 semantic_release = 0x2,
162 /* the rest are for load/stores/atomics only */
163 /* cannot be DCE'd or CSE'd */
164 semantic_volatile = 0x4,
165 /* does not interact with barriers and assumes this lane is the only lane
166 * accessing this memory */
167 semantic_private = 0x8,
168 /* this operation can be reordered around operations of the same storage.
169 * says nothing about barriers */
170 semantic_can_reorder = 0x10,
171 /* this is a atomic instruction (may only read or write memory) */
172 semantic_atomic = 0x20,
173 /* this is instruction both reads and writes memory */
176 semantic_acqrel = semantic_acquire | semantic_release,
177 semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
180 enum sync_scope : uint8_t {
181 scope_invocation = 0,
184 scope_queuefamily = 3,
188 struct memory_sync_info {
189 memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
190 memory_sync_info(int storage_, int semantics_ = 0, sync_scope scope_ = scope_invocation)
191 : storage((storage_class)storage_), semantics((memory_semantics)semantics_), scope(scope_)
194 storage_class storage : 8;
195 memory_semantics semantics : 8;
196 sync_scope scope : 8;
198 bool operator==(const memory_sync_info& rhs) const
200 return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
203 bool can_reorder() const
205 if (semantics & semantic_acqrel)
207 /* Also check storage so that zero-initialized memory_sync_info can be
209 return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
212 static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
222 /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
223 * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
224 fp_denorm_flush = 0x0,
225 fp_denorm_keep_in = 0x1,
226 fp_denorm_keep_out = 0x2,
227 fp_denorm_keep = 0x3,
231 /* matches encoding of the MODE register */
234 fp_round round32 : 2;
235 fp_round round16_64 : 2;
236 unsigned denorm32 : 2;
237 unsigned denorm16_64 : 2;
245 /* if false, optimizations which may remove infs/nan/-0.0 can be done */
246 bool preserve_signed_zero_inf_nan32 : 1;
247 bool preserve_signed_zero_inf_nan16_64 : 1;
248 /* if false, optimizations which may remove denormal flushing can be done */
249 bool must_flush_denorms32 : 1;
250 bool must_flush_denorms16_64 : 1;
251 bool care_about_round32 : 1;
252 bool care_about_round16_64 : 1;
254 /* Returns true if instructions using the mode "other" can safely use the
255 * current one instead. */
256 bool canReplace(float_mode other) const noexcept
258 return val == other.val &&
259 (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) &&
260 (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) &&
261 (must_flush_denorms32 || !other.must_flush_denorms32) &&
262 (must_flush_denorms16_64 || !other.must_flush_denorms16_64) &&
263 (care_about_round32 || !other.care_about_round32) &&
264 (care_about_round16_64 || !other.care_about_round16_64);
269 static const uint8_t unset_counter = 0xff;
277 wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
278 wait_imm(enum amd_gfx_level chip, uint16_t packed);
280 uint16_t pack(enum amd_gfx_level chip) const;
282 bool combine(const wait_imm& other);
288 asVOP3(Format format)
290 return (Format)((uint32_t)Format::VOP3 | (uint32_t)format);
294 asSDWA(Format format)
296 assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC);
297 return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
301 withoutDPP(Format format)
303 return (Format)((uint32_t)format & ~((uint32_t)Format::DPP16 | (uint32_t)Format::DPP8));
307 withoutVOP3(Format format)
309 return (Format)((uint32_t)format & ~((uint32_t)Format::VOP3));
335 /* byte-sized register class */
342 /* these are used for WWM and spills to vgpr */
343 v1_linear = v1 | (1 << 6),
344 v2_linear = v2 | (1 << 6),
347 RegClass() = default;
348 constexpr RegClass(RC rc_) : rc(rc_) {}
349 constexpr RegClass(RegType type, unsigned size)
350 : rc((RC)((type == RegType::vgpr ? 1 << 5 : 0) | size))
353 constexpr operator RC() const { return rc; }
354 explicit operator bool() = delete;
356 constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
357 constexpr bool is_linear_vgpr() const { return rc & (1 << 6); };
358 constexpr bool is_subdword() const { return rc & (1 << 7); }
359 constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); }
360 // TODO: use size() less in favor of bytes()
361 constexpr unsigned size() const { return (bytes() + 3) >> 2; }
362 constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); }
363 constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); }
364 constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); }
366 static constexpr RegClass get(RegType type, unsigned bytes)
368 if (type == RegType::sgpr) {
369 return RegClass(type, DIV_ROUND_UP(bytes, 4u));
371 return bytes % 4u ? RegClass(type, bytes).as_subdword() : RegClass(type, bytes / 4u);
375 constexpr RegClass resize(unsigned bytes) const
377 if (is_linear_vgpr()) {
378 assert(bytes % 4u == 0);
379 return get(RegType::vgpr, bytes).as_linear();
381 return get(type(), bytes);
388 /* transitional helper expressions */
389 static constexpr RegClass s1{RegClass::s1};
390 static constexpr RegClass s2{RegClass::s2};
391 static constexpr RegClass s3{RegClass::s3};
392 static constexpr RegClass s4{RegClass::s4};
393 static constexpr RegClass s8{RegClass::s8};
394 static constexpr RegClass s16{RegClass::s16};
395 static constexpr RegClass v1{RegClass::v1};
396 static constexpr RegClass v2{RegClass::v2};
397 static constexpr RegClass v3{RegClass::v3};
398 static constexpr RegClass v4{RegClass::v4};
399 static constexpr RegClass v5{RegClass::v5};
400 static constexpr RegClass v6{RegClass::v6};
401 static constexpr RegClass v7{RegClass::v7};
402 static constexpr RegClass v8{RegClass::v8};
403 static constexpr RegClass v1b{RegClass::v1b};
404 static constexpr RegClass v2b{RegClass::v2b};
405 static constexpr RegClass v3b{RegClass::v3b};
406 static constexpr RegClass v4b{RegClass::v4b};
407 static constexpr RegClass v6b{RegClass::v6b};
408 static constexpr RegClass v8b{RegClass::v8b};
412 * Each temporary virtual register has a
413 * register class (i.e. size and type)
417 Temp() noexcept : id_(0), reg_class(0) {}
418 constexpr Temp(uint32_t id, RegClass cls) noexcept : id_(id), reg_class(uint8_t(cls)) {}
420 constexpr uint32_t id() const noexcept { return id_; }
421 constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; }
423 constexpr unsigned bytes() const noexcept { return regClass().bytes(); }
424 constexpr unsigned size() const noexcept { return regClass().size(); }
425 constexpr RegType type() const noexcept { return regClass().type(); }
426 constexpr bool is_linear() const noexcept { return regClass().is_linear(); }
428 constexpr bool operator<(Temp other) const noexcept { return id() < other.id(); }
429 constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); }
430 constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); }
434 uint32_t reg_class : 8;
439 * Represents the physical register for each
440 * Operand and Definition.
443 constexpr PhysReg() = default;
444 explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
445 constexpr unsigned reg() const { return reg_b >> 2; }
446 constexpr unsigned byte() const { return reg_b & 0x3; }
447 constexpr operator unsigned() const { return reg(); }
448 constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
449 constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
450 constexpr bool operator<(PhysReg other) const { return reg_b < other.reg_b; }
451 constexpr PhysReg advance(int bytes) const
461 /* helper expressions for special registers */
462 static constexpr PhysReg m0{124};
463 static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
464 static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
465 static constexpr PhysReg vcc{106};
466 static constexpr PhysReg vcc_hi{107};
467 static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
468 static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
469 static constexpr PhysReg ttmp0{112};
470 static constexpr PhysReg ttmp1{113};
471 static constexpr PhysReg ttmp2{114};
472 static constexpr PhysReg ttmp3{115};
473 static constexpr PhysReg ttmp4{116};
474 static constexpr PhysReg ttmp5{117};
475 static constexpr PhysReg ttmp6{118};
476 static constexpr PhysReg ttmp7{119};
477 static constexpr PhysReg ttmp8{120};
478 static constexpr PhysReg ttmp9{121};
479 static constexpr PhysReg ttmp10{122};
480 static constexpr PhysReg ttmp11{123};
481 static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
482 static constexpr PhysReg exec{126};
483 static constexpr PhysReg exec_lo{126};
484 static constexpr PhysReg exec_hi{127};
485 static constexpr PhysReg vccz{251};
486 static constexpr PhysReg execz{252};
487 static constexpr PhysReg scc{253};
491 * Initially, each Operand refers to either
492 * a temporary virtual register
493 * or to a constant value
494 * Temporary registers get mapped to physical register during RA
495 * Constant values are inlined into the instruction sequence.
497 class Operand final {
500 : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), isKill_(false),
501 isUndef_(true), isFirstKill_(false), constSize(0), isLateKill_(false), is16bit_(false),
502 is24bit_(false), signext(false)
505 explicit Operand(Temp r) noexcept
512 setFixed(PhysReg{128});
515 explicit Operand(Temp r, PhysReg reg) noexcept
517 assert(r.id()); /* Don't allow fixing an undef to a register */
524 static Operand c8(uint8_t v) noexcept
526 /* 8-bit constants are only used for copies and copies from any 8-bit
527 * constant can be implemented with a SDWA v_mul_u32_u24. So consider all
528 * to be inline constants. */
532 op.isConstant_ = true;
534 op.setFixed(PhysReg{0u});
538 /* 16-bit constant */
539 static Operand c16(uint16_t v) noexcept
544 op.isConstant_ = true;
547 op.setFixed(PhysReg{128u + v});
548 else if (v >= 0xFFF0) /* [-16 .. -1] */
549 op.setFixed(PhysReg{(unsigned)(192 - (int16_t)v)});
550 else if (v == 0x3800) /* 0.5 */
551 op.setFixed(PhysReg{240});
552 else if (v == 0xB800) /* -0.5 */
553 op.setFixed(PhysReg{241});
554 else if (v == 0x3C00) /* 1.0 */
555 op.setFixed(PhysReg{242});
556 else if (v == 0xBC00) /* -1.0 */
557 op.setFixed(PhysReg{243});
558 else if (v == 0x4000) /* 2.0 */
559 op.setFixed(PhysReg{244});
560 else if (v == 0xC000) /* -2.0 */
561 op.setFixed(PhysReg{245});
562 else if (v == 0x4400) /* 4.0 */
563 op.setFixed(PhysReg{246});
564 else if (v == 0xC400) /* -4.0 */
565 op.setFixed(PhysReg{247});
566 else if (v == 0x3118) /* 1/2 PI */
567 op.setFixed(PhysReg{248});
568 else /* Literal Constant */
569 op.setFixed(PhysReg{255});
573 /* 32-bit constant */
574 static Operand c32(uint32_t v) noexcept { return c32_or_c64(v, false); }
576 /* 64-bit constant */
577 static Operand c64(uint64_t v) noexcept
581 op.isConstant_ = true;
584 op.data_.i = (uint32_t)v;
585 op.setFixed(PhysReg{128 + (uint32_t)v});
586 } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */
587 op.data_.i = (uint32_t)v;
588 op.setFixed(PhysReg{192 - (uint32_t)v});
589 } else if (v == 0x3FE0000000000000) { /* 0.5 */
590 op.data_.i = 0x3f000000;
591 op.setFixed(PhysReg{240});
592 } else if (v == 0xBFE0000000000000) { /* -0.5 */
593 op.data_.i = 0xbf000000;
594 op.setFixed(PhysReg{241});
595 } else if (v == 0x3FF0000000000000) { /* 1.0 */
596 op.data_.i = 0x3f800000;
597 op.setFixed(PhysReg{242});
598 } else if (v == 0xBFF0000000000000) { /* -1.0 */
599 op.data_.i = 0xbf800000;
600 op.setFixed(PhysReg{243});
601 } else if (v == 0x4000000000000000) { /* 2.0 */
602 op.data_.i = 0x40000000;
603 op.setFixed(PhysReg{244});
604 } else if (v == 0xC000000000000000) { /* -2.0 */
605 op.data_.i = 0xc0000000;
606 op.setFixed(PhysReg{245});
607 } else if (v == 0x4010000000000000) { /* 4.0 */
608 op.data_.i = 0x40800000;
609 op.setFixed(PhysReg{246});
610 } else if (v == 0xC010000000000000) { /* -4.0 */
611 op.data_.i = 0xc0800000;
612 op.setFixed(PhysReg{247});
613 } else { /* Literal Constant: we don't know if it is a long or double.*/
614 op.signext = v >> 63;
615 op.data_.i = v & 0xffffffffu;
616 op.setFixed(PhysReg{255});
617 assert(op.constantValue64() == v &&
618 "attempt to create a unrepresentable 64-bit literal constant");
623 /* 32-bit constant stored as a 32-bit or 64-bit operand */
624 static Operand c32_or_c64(uint32_t v, bool is64bit) noexcept
629 op.isConstant_ = true;
630 op.constSize = is64bit ? 3 : 2;
632 op.setFixed(PhysReg{128 + v});
633 else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */
634 op.setFixed(PhysReg{192 - v});
635 else if (v == 0x3f000000) /* 0.5 */
636 op.setFixed(PhysReg{240});
637 else if (v == 0xbf000000) /* -0.5 */
638 op.setFixed(PhysReg{241});
639 else if (v == 0x3f800000) /* 1.0 */
640 op.setFixed(PhysReg{242});
641 else if (v == 0xbf800000) /* -1.0 */
642 op.setFixed(PhysReg{243});
643 else if (v == 0x40000000) /* 2.0 */
644 op.setFixed(PhysReg{244});
645 else if (v == 0xc0000000) /* -2.0 */
646 op.setFixed(PhysReg{245});
647 else if (v == 0x40800000) /* 4.0 */
648 op.setFixed(PhysReg{246});
649 else if (v == 0xc0800000) /* -4.0 */
650 op.setFixed(PhysReg{247});
651 else { /* Literal Constant */
652 assert(!is64bit && "attempt to create a 64-bit literal constant");
653 op.setFixed(PhysReg{255});
658 static Operand literal32(uint32_t v) noexcept
663 op.isConstant_ = true;
665 op.setFixed(PhysReg{255});
669 explicit Operand(RegClass type) noexcept
672 data_.temp = Temp(0, type);
673 setFixed(PhysReg{128});
675 explicit Operand(PhysReg reg, RegClass type) noexcept
677 data_.temp = Temp(0, type);
681 static Operand zero(unsigned bytes = 4) noexcept
684 return Operand::c64(0);
686 return Operand::c32(0);
688 return Operand::c16(0);
690 return Operand::c8(0);
693 /* This is useful over the constructors when you want to take a gfx level
694 * for 1/2 PI or an unknown operand size.
696 static Operand get_const(enum amd_gfx_level chip, uint64_t val, unsigned bytes)
698 if (val == 0x3e22f983 && bytes == 4 && chip >= GFX8) {
699 /* 1/2 PI can be an inline constant on GFX8+ */
700 Operand op = Operand::c32(val);
701 op.setFixed(PhysReg{248});
706 return Operand::c64(val);
708 return Operand::c32(val);
710 return Operand::c16(val);
712 return Operand::c8(val);
715 static bool is_constant_representable(uint64_t val, unsigned bytes, bool zext = false,
721 if (zext && (val & 0xFFFFFFFF00000000) == 0x0000000000000000)
723 uint64_t upper33 = val & 0xFFFFFFFF80000000;
724 if (sext && (upper33 == 0xFFFFFFFF80000000 || upper33 == 0))
727 return val >= 0xFFFFFFFFFFFFFFF0 || val <= 64 || /* [-16 .. 64] */
728 val == 0x3FE0000000000000 || /* 0.5 */
729 val == 0xBFE0000000000000 || /* -0.5 */
730 val == 0x3FF0000000000000 || /* 1.0 */
731 val == 0xBFF0000000000000 || /* -1.0 */
732 val == 0x4000000000000000 || /* 2.0 */
733 val == 0xC000000000000000 || /* -2.0 */
734 val == 0x4010000000000000 || /* 4.0 */
735 val == 0xC010000000000000; /* -4.0 */
738 constexpr bool isTemp() const noexcept { return isTemp_; }
740 constexpr void setTemp(Temp t) noexcept
742 assert(!isConstant_);
747 constexpr Temp getTemp() const noexcept { return data_.temp; }
749 constexpr uint32_t tempId() const noexcept { return data_.temp.id(); }
751 constexpr bool hasRegClass() const noexcept { return !isConstant(); }
753 constexpr RegClass regClass() const noexcept { return data_.temp.regClass(); }
755 constexpr unsigned bytes() const noexcept
758 return 1 << constSize;
760 return data_.temp.bytes();
763 constexpr unsigned size() const noexcept
766 return constSize > 2 ? 2 : 1;
768 return data_.temp.size();
771 constexpr bool isFixed() const noexcept { return isFixed_; }
773 constexpr PhysReg physReg() const noexcept { return reg_; }
775 constexpr void setFixed(PhysReg reg) noexcept
777 isFixed_ = reg != unsigned(-1);
781 constexpr bool isConstant() const noexcept { return isConstant_; }
783 constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
785 constexpr bool isUndefined() const noexcept { return isUndef_; }
787 constexpr uint32_t constantValue() const noexcept { return data_.i; }
789 constexpr bool constantEquals(uint32_t cmp) const noexcept
791 return isConstant() && constantValue() == cmp;
794 constexpr uint64_t constantValue64() const noexcept
796 if (constSize == 3) {
799 else if (reg_ <= 208)
800 return 0xFFFFFFFFFFFFFFFF - (reg_ - 193);
803 case 240: return 0x3FE0000000000000;
804 case 241: return 0xBFE0000000000000;
805 case 242: return 0x3FF0000000000000;
806 case 243: return 0xBFF0000000000000;
807 case 244: return 0x4000000000000000;
808 case 245: return 0xC000000000000000;
809 case 246: return 0x4010000000000000;
810 case 247: return 0xC010000000000000;
812 return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i;
814 unreachable("invalid register for 64-bit constant");
820 /* Value if this were used with vop3/opsel or vop3p. */
821 constexpr uint16_t constantValue16(bool opsel) const noexcept
823 assert(bytes() == 2 || bytes() == 4);
825 if (bytes() == 2 && int16_t(data_.i) >= -16 && int16_t(data_.i) <= 64 && !isLiteral())
826 return int16_t(data_.i) >> 16; /* 16-bit inline integers are sign-extended, even with fp16 instrs */
828 return data_.i >> 16;
833 constexpr bool isOfType(RegType type) const noexcept
835 return hasRegClass() && regClass().type() == type;
838 /* Indicates that the killed operand's live range intersects with the
839 * instruction's definitions. Unlike isKill() and isFirstKill(), this is
840 * not set by liveness analysis. */
841 constexpr void setLateKill(bool flag) noexcept { isLateKill_ = flag; }
843 constexpr bool isLateKill() const noexcept { return isLateKill_; }
845 constexpr void setKill(bool flag) noexcept
852 constexpr bool isKill() const noexcept { return isKill_ || isFirstKill(); }
854 constexpr void setFirstKill(bool flag) noexcept
861 /* When there are multiple operands killing the same temporary,
862 * isFirstKill() is only returns true for the first one. */
863 constexpr bool isFirstKill() const noexcept { return isFirstKill_; }
865 constexpr bool isKillBeforeDef() const noexcept { return isKill() && !isLateKill(); }
867 constexpr bool isFirstKillBeforeDef() const noexcept { return isFirstKill() && !isLateKill(); }
869 constexpr bool operator==(Operand other) const noexcept
871 if (other.size() != size())
873 if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef())
875 if (isFixed() && other.isFixed() && physReg() != other.physReg())
878 return other.isLiteral() && other.constantValue() == constantValue();
879 else if (isConstant())
880 return other.isConstant() && other.physReg() == physReg();
881 else if (isUndefined())
882 return other.isUndefined() && other.regClass() == regClass();
884 return other.isTemp() && other.getTemp() == getTemp();
887 constexpr bool operator!=(Operand other) const noexcept { return !operator==(other); }
889 constexpr void set16bit(bool flag) noexcept { is16bit_ = flag; }
891 constexpr bool is16bit() const noexcept { return is16bit_; }
893 constexpr void set24bit(bool flag) noexcept { is24bit_ = flag; }
895 constexpr bool is24bit() const noexcept { return is24bit_; }
902 } data_ = {Temp(0, s1)};
907 uint8_t isFixed_ : 1;
908 uint8_t isConstant_ : 1;
910 uint8_t isUndef_ : 1;
911 uint8_t isFirstKill_ : 1;
912 uint8_t constSize : 2;
913 uint8_t isLateKill_ : 1;
914 uint8_t is16bit_ : 1;
915 uint8_t is24bit_ : 1;
918 /* can't initialize bit-fields in c++11, so work around using a union */
919 uint16_t control_ = 0;
925 * Definitions are the results of Instructions
926 * and refer to temporary virtual registers
927 * which are later mapped to physical registers
929 class Definition final {
931 constexpr Definition()
932 : temp(Temp(0, s1)), reg_(0), isFixed_(0), isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0)
934 Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {}
935 explicit Definition(Temp tmp) noexcept : temp(tmp) {}
936 Definition(PhysReg reg, RegClass type) noexcept : temp(Temp(0, type)) { setFixed(reg); }
937 Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept : temp(Temp(tmpId, type))
942 constexpr bool isTemp() const noexcept { return tempId() > 0; }
944 constexpr Temp getTemp() const noexcept { return temp; }
946 constexpr uint32_t tempId() const noexcept { return temp.id(); }
948 constexpr void setTemp(Temp t) noexcept { temp = t; }
950 void swapTemp(Definition& other) noexcept { std::swap(temp, other.temp); }
952 constexpr RegClass regClass() const noexcept { return temp.regClass(); }
954 constexpr unsigned bytes() const noexcept { return temp.bytes(); }
956 constexpr unsigned size() const noexcept { return temp.size(); }
958 constexpr bool isFixed() const noexcept { return isFixed_; }
960 constexpr PhysReg physReg() const noexcept { return reg_; }
962 constexpr void setFixed(PhysReg reg) noexcept
968 constexpr void setKill(bool flag) noexcept { isKill_ = flag; }
970 constexpr bool isKill() const noexcept { return isKill_; }
972 constexpr void setPrecise(bool precise) noexcept { isPrecise_ = precise; }
974 constexpr bool isPrecise() const noexcept { return isPrecise_; }
976 /* No Unsigned Wrap */
977 constexpr void setNUW(bool nuw) noexcept { isNUW_ = nuw; }
979 constexpr bool isNUW() const noexcept { return isNUW_; }
981 constexpr void setNoCSE(bool noCSE) noexcept { isNoCSE_ = noCSE; }
983 constexpr bool isNoCSE() const noexcept { return isNoCSE_; }
986 Temp temp = Temp(0, s1);
990 uint8_t isFixed_ : 1;
992 uint8_t isPrecise_ : 1;
994 uint8_t isNoCSE_ : 1;
996 /* can't initialize bit-fields in c++11, so work around using a union */
997 uint8_t control_ = 0;
1003 struct Pseudo_instruction;
1004 struct SOP1_instruction;
1005 struct SOP2_instruction;
1006 struct SOPK_instruction;
1007 struct SOPP_instruction;
1008 struct SOPC_instruction;
1009 struct SMEM_instruction;
1010 struct DS_instruction;
1011 struct LDSDIR_instruction;
1012 struct MTBUF_instruction;
1013 struct MUBUF_instruction;
1014 struct MIMG_instruction;
1015 struct Export_instruction;
1016 struct FLAT_instruction;
1017 struct Pseudo_branch_instruction;
1018 struct Pseudo_barrier_instruction;
1019 struct Pseudo_reduction_instruction;
1020 struct VALU_instruction;
1021 struct VINTERP_inreg_instruction;
1022 struct VINTRP_instruction;
1023 struct DPP16_instruction;
1024 struct DPP8_instruction;
1025 struct SDWA_instruction;
1027 struct Instruction {
1030 uint32_t pass_flags;
1032 aco::span<Operand> operands;
1033 aco::span<Definition> definitions;
1035 constexpr bool usesModifiers() const noexcept;
1037 constexpr bool reads_exec() const noexcept
1039 for (const Operand& op : operands) {
1040 if (op.isFixed() && op.physReg() == exec)
1046 Pseudo_instruction& pseudo() noexcept
1049 return *(Pseudo_instruction*)this;
1051 const Pseudo_instruction& pseudo() const noexcept
1054 return *(Pseudo_instruction*)this;
1056 constexpr bool isPseudo() const noexcept { return format == Format::PSEUDO; }
1057 SOP1_instruction& sop1() noexcept
1060 return *(SOP1_instruction*)this;
1062 const SOP1_instruction& sop1() const noexcept
1065 return *(SOP1_instruction*)this;
1067 constexpr bool isSOP1() const noexcept { return format == Format::SOP1; }
1068 SOP2_instruction& sop2() noexcept
1071 return *(SOP2_instruction*)this;
1073 const SOP2_instruction& sop2() const noexcept
1076 return *(SOP2_instruction*)this;
1078 constexpr bool isSOP2() const noexcept { return format == Format::SOP2; }
1079 SOPK_instruction& sopk() noexcept
1082 return *(SOPK_instruction*)this;
1084 const SOPK_instruction& sopk() const noexcept
1087 return *(SOPK_instruction*)this;
1089 constexpr bool isSOPK() const noexcept { return format == Format::SOPK; }
1090 SOPP_instruction& sopp() noexcept
1093 return *(SOPP_instruction*)this;
1095 const SOPP_instruction& sopp() const noexcept
1098 return *(SOPP_instruction*)this;
1100 constexpr bool isSOPP() const noexcept { return format == Format::SOPP; }
1101 SOPC_instruction& sopc() noexcept
1104 return *(SOPC_instruction*)this;
1106 const SOPC_instruction& sopc() const noexcept
1109 return *(SOPC_instruction*)this;
1111 constexpr bool isSOPC() const noexcept { return format == Format::SOPC; }
1112 SMEM_instruction& smem() noexcept
1115 return *(SMEM_instruction*)this;
1117 const SMEM_instruction& smem() const noexcept
1120 return *(SMEM_instruction*)this;
1122 constexpr bool isSMEM() const noexcept { return format == Format::SMEM; }
1123 DS_instruction& ds() noexcept
1126 return *(DS_instruction*)this;
1128 const DS_instruction& ds() const noexcept
1131 return *(DS_instruction*)this;
1133 constexpr bool isDS() const noexcept { return format == Format::DS; }
1134 LDSDIR_instruction& ldsdir() noexcept
1137 return *(LDSDIR_instruction*)this;
1139 const LDSDIR_instruction& ldsdir() const noexcept
1142 return *(LDSDIR_instruction*)this;
1144 constexpr bool isLDSDIR() const noexcept { return format == Format::LDSDIR; }
1145 MTBUF_instruction& mtbuf() noexcept
1148 return *(MTBUF_instruction*)this;
1150 const MTBUF_instruction& mtbuf() const noexcept
1153 return *(MTBUF_instruction*)this;
1155 constexpr bool isMTBUF() const noexcept { return format == Format::MTBUF; }
1156 MUBUF_instruction& mubuf() noexcept
1159 return *(MUBUF_instruction*)this;
1161 const MUBUF_instruction& mubuf() const noexcept
1164 return *(MUBUF_instruction*)this;
1166 constexpr bool isMUBUF() const noexcept { return format == Format::MUBUF; }
1167 MIMG_instruction& mimg() noexcept
1170 return *(MIMG_instruction*)this;
1172 const MIMG_instruction& mimg() const noexcept
1175 return *(MIMG_instruction*)this;
1177 constexpr bool isMIMG() const noexcept { return format == Format::MIMG; }
1178 Export_instruction& exp() noexcept
1181 return *(Export_instruction*)this;
1183 const Export_instruction& exp() const noexcept
1186 return *(Export_instruction*)this;
1188 constexpr bool isEXP() const noexcept { return format == Format::EXP; }
1189 FLAT_instruction& flat() noexcept
1192 return *(FLAT_instruction*)this;
1194 const FLAT_instruction& flat() const noexcept
1197 return *(FLAT_instruction*)this;
1199 constexpr bool isFlat() const noexcept { return format == Format::FLAT; }
1200 FLAT_instruction& global() noexcept
1203 return *(FLAT_instruction*)this;
1205 const FLAT_instruction& global() const noexcept
1208 return *(FLAT_instruction*)this;
1210 constexpr bool isGlobal() const noexcept { return format == Format::GLOBAL; }
1211 FLAT_instruction& scratch() noexcept
1213 assert(isScratch());
1214 return *(FLAT_instruction*)this;
1216 const FLAT_instruction& scratch() const noexcept
1218 assert(isScratch());
1219 return *(FLAT_instruction*)this;
1221 constexpr bool isScratch() const noexcept { return format == Format::SCRATCH; }
1222 Pseudo_branch_instruction& branch() noexcept
1225 return *(Pseudo_branch_instruction*)this;
1227 const Pseudo_branch_instruction& branch() const noexcept
1230 return *(Pseudo_branch_instruction*)this;
1232 constexpr bool isBranch() const noexcept { return format == Format::PSEUDO_BRANCH; }
1233 Pseudo_barrier_instruction& barrier() noexcept
1235 assert(isBarrier());
1236 return *(Pseudo_barrier_instruction*)this;
1238 const Pseudo_barrier_instruction& barrier() const noexcept
1240 assert(isBarrier());
1241 return *(Pseudo_barrier_instruction*)this;
1243 constexpr bool isBarrier() const noexcept { return format == Format::PSEUDO_BARRIER; }
1244 Pseudo_reduction_instruction& reduction() noexcept
1246 assert(isReduction());
1247 return *(Pseudo_reduction_instruction*)this;
1249 const Pseudo_reduction_instruction& reduction() const noexcept
1251 assert(isReduction());
1252 return *(Pseudo_reduction_instruction*)this;
1254 constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
1255 constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; }
1256 VINTERP_inreg_instruction& vinterp_inreg() noexcept
1258 assert(isVINTERP_INREG());
1259 return *(VINTERP_inreg_instruction*)this;
1261 const VINTERP_inreg_instruction& vinterp_inreg() const noexcept
1263 assert(isVINTERP_INREG());
1264 return *(VINTERP_inreg_instruction*)this;
1266 constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
1267 constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
1268 constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
1269 constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
1270 constexpr bool isVOP3() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3; }
1271 VINTRP_instruction& vintrp() noexcept
1274 return *(VINTRP_instruction*)this;
1276 const VINTRP_instruction& vintrp() const noexcept
1279 return *(VINTRP_instruction*)this;
1281 constexpr bool isVINTRP() const noexcept { return (uint16_t)format & (uint16_t)Format::VINTRP; }
1282 DPP16_instruction& dpp16() noexcept
1285 return *(DPP16_instruction*)this;
1287 const DPP16_instruction& dpp16() const noexcept
1290 return *(DPP16_instruction*)this;
1292 constexpr bool isDPP16() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP16; }
1293 DPP8_instruction& dpp8() noexcept
1296 return *(DPP8_instruction*)this;
1298 const DPP8_instruction& dpp8() const noexcept
1301 return *(DPP8_instruction*)this;
1303 constexpr bool isDPP8() const noexcept { return (uint16_t)format & (uint16_t)Format::DPP8; }
1304 constexpr bool isDPP() const noexcept { return isDPP16() || isDPP8(); }
1305 SDWA_instruction& sdwa() noexcept
1308 return *(SDWA_instruction*)this;
1310 const SDWA_instruction& sdwa() const noexcept
1313 return *(SDWA_instruction*)this;
1315 constexpr bool isSDWA() const noexcept { return (uint16_t)format & (uint16_t)Format::SDWA; }
1317 FLAT_instruction& flatlike() { return *(FLAT_instruction*)this; }
1319 const FLAT_instruction& flatlike() const { return *(FLAT_instruction*)this; }
1321 constexpr bool isFlatLike() const noexcept { return isFlat() || isGlobal() || isScratch(); }
1323 VALU_instruction& valu() noexcept
1326 return *(VALU_instruction*)this;
1328 const VALU_instruction& valu() const noexcept
1331 return *(VALU_instruction*)this;
1333 constexpr bool isVALU() const noexcept
1335 return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG();
1338 constexpr bool isSALU() const noexcept
1340 return isSOP1() || isSOP2() || isSOPC() || isSOPK() || isSOPP();
1343 constexpr bool isVMEM() const noexcept { return isMTBUF() || isMUBUF() || isMIMG(); }
1345 bool isTrans() const noexcept;
1347 static_assert(sizeof(Instruction) == 16, "Unexpected padding");
1349 struct SOPK_instruction : public Instruction {
1353 static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1355 struct SOPP_instruction : public Instruction {
1359 static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1361 struct SOPC_instruction : public Instruction {
1364 static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1366 struct SOP1_instruction : public Instruction {};
1367 static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
1369 struct SOP2_instruction : public Instruction {
1372 static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1375 * Scalar Memory Format:
1376 * For s_(buffer_)load_dword*:
1377 * Operand(0): SBASE - SGPR-pair which provides base address
1378 * Operand(1): Offset - immediate (un)signed offset or SGPR
1379 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
1380 * Operand(n-1): SOffset - SGPR offset (Vega only)
1382 * Having no operands is also valid for instructions such as s_dcache_inv.
1385 struct SMEM_instruction : public Instruction {
1386 memory_sync_info sync;
1387 bool glc : 1; /* VI+: globally coherent */
1388 bool dlc : 1; /* NAVI: device level coherent */
1389 bool nv : 1; /* VEGA only: Non-volatile */
1390 bool disable_wqm : 1;
1391 uint8_t padding : 4;
1393 static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1395 struct VALU_instruction : public Instruction {
1397 bitfield_array8<uint32_t, 0, 3> neg; /* VOP3, SDWA, DPP16, v_fma_mix, VINTERP_inreg */
1398 bitfield_array8<uint32_t, 0, 3> neg_lo; /* VOP3P */
1400 bitfield_array8<uint32_t, 3, 3> abs; /* VOP3, SDWA, DPP16, v_fma_mix */
1401 bitfield_array8<uint32_t, 3, 3> neg_hi; /* VOP3P */
1403 bitfield_array8<uint32_t, 6, 4> opsel; /* VOP3, VOPC12(GFX11+), VINTERP_inreg */
1404 bitfield_uint8<uint32_t, 10, 2> omod; /* VOP3, SDWA(GFX9+) */
1405 bitfield_array8<uint32_t, 12, 3> opsel_lo; /* VOP3P */
1406 bitfield_array8<uint32_t, 15, 3> opsel_hi; /* VOP3P */
1407 bitfield_bool<uint32_t, 18> clamp; /* VOP3, VOP3P, SDWA, VINTERP_inreg */
1410 void swapOperands(unsigned idx0, unsigned idx1);
1412 static_assert(sizeof(VALU_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1414 struct VINTERP_inreg_instruction : public VALU_instruction {
1415 uint8_t wait_exp : 3;
1416 uint8_t padding3 : 5;
1421 static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1424 * Data Parallel Primitives Format:
1425 * This format can be used for VOP1, VOP2 or VOPC instructions.
1426 * The swizzle applies to the src0 operand.
1429 struct DPP16_instruction : public VALU_instruction {
1431 uint8_t row_mask : 4;
1432 uint8_t bank_mask : 4;
1433 bool bound_ctrl : 1;
1434 uint8_t padding3 : 7;
1436 static_assert(sizeof(DPP16_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1438 struct DPP8_instruction : public VALU_instruction {
1439 uint8_t lane_sel[8];
1441 static_assert(sizeof(DPP8_instruction) == sizeof(VALU_instruction) + 8, "Unexpected padding");
1443 struct SubdwordSel {
1444 enum sdwa_sel : uint8_t {
1449 sbyte = ubyte | sext,
1450 sword = uword | sext,
1466 SubdwordSel() : sel((sdwa_sel)0) {}
1467 constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {}
1468 constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend)
1469 : sel((sdwa_sel)((sign_extend ? sext : 0) | size << 2 | offset))
1471 constexpr operator sdwa_sel() const { return sel; }
1472 explicit operator bool() const { return sel != 0; }
1474 constexpr unsigned size() const { return (sel >> 2) & 0x7; }
1475 constexpr unsigned offset() const { return sel & 0x3; }
1476 constexpr bool sign_extend() const { return sel & sext; }
1477 constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const
1479 reg_byte_offset += offset();
1481 return reg_byte_offset;
1482 else if (size() == 2)
1483 return 4 + (reg_byte_offset >> 1);
1493 * Sub-Dword Addressing Format:
1494 * This format can be used for VOP1, VOP2 or VOPC instructions.
1496 * omod and SGPR/constant operands are only available on GFX9+. For VOPC,
1497 * the definition doesn't have to be VCC on GFX9+.
1500 struct SDWA_instruction : public VALU_instruction {
1501 /* these destination modifiers aren't available with VOPC except for
1504 SubdwordSel dst_sel;
1507 static_assert(sizeof(SDWA_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
1509 struct VINTRP_instruction : public Instruction {
1514 static_assert(sizeof(VINTRP_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1517 * Local and Global Data Sharing instructions
1518 * Operand(0): ADDR - VGPR which supplies the address.
1519 * Operand(1): DATA0 - First data VGPR.
1520 * Operand(2): DATA1 - Second data VGPR.
1521 * Operand(n-1): M0 - LDS size.
1522 * Definition(0): VDST - Destination VGPR when results returned to VGPRs.
1525 struct DS_instruction : public Instruction {
1526 memory_sync_info sync;
1532 static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1535 * LDS Direct instructions
1537 * Definition(0): VDST - Destination VGPR
1539 struct LDSDIR_instruction : public Instruction {
1540 memory_sync_info sync;
1542 uint8_t attr_chan : 2;
1543 uint32_t wait_vdst : 4;
1544 uint32_t padding : 28;
1546 static_assert(sizeof(LDSDIR_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1549 * Vector Memory Untyped-buffer Instructions
1550 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1551 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1552 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1553 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1556 struct MUBUF_instruction : public Instruction {
1557 memory_sync_info sync;
1558 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1559 bool idxen : 1; /* Supply an index from VGPR (VADDR) */
1560 bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
1561 bool glc : 1; /* globally coherent */
1562 bool dlc : 1; /* NAVI: device level coherent */
1563 bool slc : 1; /* system level coherent */
1564 bool tfe : 1; /* texture fail enable */
1565 bool lds : 1; /* Return read-data to LDS instead of VGPRs */
1566 uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
1567 uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
1568 uint16_t swizzled : 1;
1569 uint16_t padding0 : 2;
1572 static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1575 * Vector Memory Typed-buffer Instructions
1576 * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant)
1577 * Operand(1): VADDR - Address source. Can carry an index and/or offset
1578 * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant)
1579 * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data
1582 struct MTBUF_instruction : public Instruction {
1583 memory_sync_info sync;
1584 uint8_t dfmt : 4; /* Data Format of data in memory buffer */
1585 uint8_t nfmt : 3; /* Numeric format of data in memory */
1586 bool offen : 1; /* Supply an offset from VGPR (VADDR) */
1587 uint16_t idxen : 1; /* Supply an index from VGPR (VADDR) */
1588 uint16_t glc : 1; /* globally coherent */
1589 uint16_t dlc : 1; /* NAVI: device level coherent */
1590 uint16_t slc : 1; /* system level coherent */
1591 uint16_t tfe : 1; /* texture fail enable */
1592 uint16_t disable_wqm : 1; /* Require an exec mask without helper invocations */
1593 uint16_t padding : 10;
1594 uint16_t offset; /* Unsigned byte offset - 12 bit */
1596 static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1599 * Vector Memory Image Instructions
1600 * Operand(0) SRSRC - Scalar GPR that specifies the resource constant.
1601 * Operand(1): SSAMP - Scalar GPR that specifies sampler constant.
1602 * Operand(2): VDATA - Vector GPR for write data or zero if TFE/LWE=1.
1603 * Operand(3): VADDR - Address source. Can carry an offset or an index.
1604 * Definition(0): VDATA - Vector GPR for read result.
1607 struct MIMG_instruction : public Instruction {
1608 memory_sync_info sync;
1609 uint8_t dmask; /* Data VGPR enable mask */
1610 uint8_t dim : 3; /* NAVI: dimensionality */
1611 bool unrm : 1; /* Force address to be un-normalized */
1612 bool dlc : 1; /* NAVI: device level coherent */
1613 bool glc : 1; /* globally coherent */
1614 bool slc : 1; /* system level coherent */
1615 bool tfe : 1; /* texture fail enable */
1616 bool da : 1; /* declare an array */
1617 bool lwe : 1; /* LOD warning enable */
1618 bool r128 : 1; /* NAVI: Texture resource size */
1619 bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
1620 bool d16 : 1; /* Convert 32-bit data to 16-bit data */
1621 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1622 bool strict_wqm : 1; /* VADDR is a linear VGPR and additional VGPRs may be copied into it */
1623 uint8_t padding0 : 1;
1627 static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1630 * Flat/Scratch/Global Instructions
1633 * Operand(2) / Definition(0): DATA/VDST
1636 struct FLAT_instruction : public Instruction {
1637 memory_sync_info sync;
1638 bool slc : 1; /* system level coherent */
1639 bool glc : 1; /* globally coherent */
1640 bool dlc : 1; /* NAVI: device level coherent */
1643 bool disable_wqm : 1; /* Require an exec mask without helper invocations */
1644 uint8_t padding0 : 2;
1645 int16_t offset; /* Vega/Navi only */
1648 static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
1650 struct Export_instruction : public Instruction {
1651 uint8_t enabled_mask;
1653 bool compressed : 1;
1655 bool valid_mask : 1;
1657 uint8_t padding0 : 4;
1660 static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1662 struct Pseudo_instruction : public Instruction {
1663 PhysReg scratch_sgpr; /* might not be valid if it's not needed */
1667 static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1669 struct Pseudo_branch_instruction : public Instruction {
1670 /* target[0] is the block index of the branch target.
1671 * For conditional branches, target[1] contains the fall-through alternative.
1672 * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
1676 /* Indicates that selection control prefers to remove this instruction if possible.
1677 * This is set when the branch is divergent and always taken, or flattened.
1679 bool selection_control_remove;
1681 static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 12, "Unexpected padding");
1683 struct Pseudo_barrier_instruction : public Instruction {
1684 memory_sync_info sync;
1685 sync_scope exec_scope;
1687 static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
1689 enum ReduceOp : uint16_t {
1691 iadd8, iadd16, iadd32, iadd64,
1692 imul8, imul16, imul32, imul64,
1693 fadd16, fadd32, fadd64,
1694 fmul16, fmul32, fmul64,
1695 imin8, imin16, imin32, imin64,
1696 imax8, imax16, imax32, imax64,
1697 umin8, umin16, umin32, umin64,
1698 umax8, umax16, umax32, umax64,
1699 fmin16, fmin32, fmin64,
1700 fmax16, fmax32, fmax64,
1701 iand8, iand16, iand32, iand64,
1702 ior8, ior16, ior32, ior64,
1703 ixor8, ixor16, ixor32, ixor64,
1709 * Subgroup Reduction Instructions, everything except for the data to be
1710 * reduced and the result as inserted by setup_reduce_temp().
1711 * Operand(0): data to be reduced
1712 * Operand(1): reduce temporary
1713 * Operand(2): vector temporary
1714 * Definition(0): result
1715 * Definition(1): scalar temporary
1716 * Definition(2): scalar identity temporary (not used to store identity on GFX10)
1717 * Definition(3): scc clobber
1718 * Definition(4): vcc clobber
1721 struct Pseudo_reduction_instruction : public Instruction {
1723 uint16_t cluster_size; // must be 0 for scans
1725 static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
1726 "Unexpected padding");
1729 VALU_instruction::swapOperands(unsigned idx0, unsigned idx1)
1731 if (this->isSDWA() && idx0 != idx1) {
1732 assert(idx0 < 2 && idx1 < 2);
1733 std::swap(this->sdwa().sel[0], this->sdwa().sel[1]);
1735 assert(idx0 < 3 && idx1 < 3);
1736 std::swap(this->operands[idx0], this->operands[idx1]);
1737 this->neg[idx0].swap(this->neg[idx1]);
1738 this->abs[idx0].swap(this->abs[idx1]);
1739 this->opsel[idx0].swap(this->opsel[idx1]);
1740 this->opsel_lo[idx0].swap(this->opsel_lo[idx1]);
1741 this->opsel_hi[idx0].swap(this->opsel_hi[idx1]);
1744 extern thread_local aco::monotonic_buffer_resource* instruction_buffer;
1746 struct instr_deleter_functor {
1747 /* Don't yet free any instructions. They will be de-allocated
1748 * all at once after compilation finished.
1750 void operator()(void* p) { return; }
1753 template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
1755 template <typename T>
1757 create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
1758 uint32_t num_definitions)
1761 sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
1762 void* data = instruction_buffer->allocate(size, alignof(uint32_t));
1763 memset(data, 0, size);
1766 inst->opcode = opcode;
1767 inst->format = format;
1769 uint16_t operands_offset = sizeof(T) - offsetof(Instruction, operands);
1770 inst->operands = aco::span<Operand>(operands_offset, num_operands);
1771 uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
1772 inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);
1778 Instruction::usesModifiers() const noexcept
1780 if (isDPP() || isSDWA())
1784 const VALU_instruction& vop3p = this->valu();
1785 /* opsel_hi must be 1 to not be considered a modifier - even for constants */
1786 return vop3p.opsel_lo || vop3p.clamp || vop3p.neg_lo || vop3p.neg_hi ||
1787 (vop3p.opsel_hi & BITFIELD_MASK(operands.size())) != BITFIELD_MASK(operands.size());
1788 } else if (isVALU()) {
1789 const VALU_instruction& vop3 = this->valu();
1790 return vop3.opsel || vop3.clamp || vop3.omod || vop3.abs || vop3.neg;
1796 is_phi(Instruction* instr)
1798 return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi;
1802 is_phi(aco_ptr<Instruction>& instr)
1804 return is_phi(instr.get());
1807 memory_sync_info get_sync_info(const Instruction* instr);
1810 is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
1812 if (instr->definitions.empty() || instr->isBranch() ||
1813 instr->opcode == aco_opcode::p_startpgm ||
1814 instr->opcode == aco_opcode::p_init_scratch ||
1815 instr->opcode == aco_opcode::p_dual_src_export_gfx11)
1818 if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
1819 [&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
1822 return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
1825 bool can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx);
1826 bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
1827 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
1828 uint8_t get_gfx11_true16_mask(aco_opcode op);
1829 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
1830 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
1831 bool can_write_m0(const aco_ptr<Instruction>& instr);
1832 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
1833 aco_ptr<Instruction> convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr);
1834 aco_ptr<Instruction> convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr,
1836 bool needs_exec_mask(const Instruction* instr);
1838 aco_opcode get_ordered(aco_opcode op);
1839 aco_opcode get_unordered(aco_opcode op);
1840 aco_opcode get_inverse(aco_opcode op);
1841 aco_opcode get_swapped(aco_opcode op);
1842 aco_opcode get_f32_cmp(aco_opcode op);
1843 aco_opcode get_vcmpx(aco_opcode op);
1844 unsigned get_cmp_bitsize(aco_opcode op);
1845 bool is_fp_cmp(aco_opcode op);
1846 bool is_cmpx(aco_opcode op);
1848 bool can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0 = 0,
1851 uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
1853 unsigned get_mimg_nsa_dwords(const Instruction* instr);
1855 unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index);
1857 bool should_form_clause(const Instruction* a, const Instruction* b);
1860 /* uniform indicates that leaving this block,
1861 * all actives lanes stay active */
1862 block_kind_uniform = 1 << 0,
1863 block_kind_top_level = 1 << 1,
1864 block_kind_loop_preheader = 1 << 2,
1865 block_kind_loop_header = 1 << 3,
1866 block_kind_loop_exit = 1 << 4,
1867 block_kind_continue = 1 << 5,
1868 block_kind_break = 1 << 6,
1869 block_kind_continue_or_break = 1 << 7,
1870 block_kind_branch = 1 << 8,
1871 block_kind_merge = 1 << 9,
1872 block_kind_invert = 1 << 10,
1873 block_kind_discard_early_exit = 1 << 11,
1874 block_kind_uses_discard = 1 << 12,
1875 block_kind_needs_lowering = 1 << 13,
1876 block_kind_export_end = 1 << 15,
1879 struct RegisterDemand {
1880 constexpr RegisterDemand() = default;
1881 constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept : vgpr{v}, sgpr{s} {}
1885 constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept
1887 return a.vgpr == b.vgpr && a.sgpr == b.sgpr;
1890 constexpr bool exceeds(const RegisterDemand other) const noexcept
1892 return vgpr > other.vgpr || sgpr > other.sgpr;
1895 constexpr RegisterDemand operator+(const Temp t) const noexcept
1897 if (t.type() == RegType::sgpr)
1898 return RegisterDemand(vgpr, sgpr + t.size());
1900 return RegisterDemand(vgpr + t.size(), sgpr);
1903 constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept
1905 return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr);
1908 constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept
1910 return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr);
1913 constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept
1920 constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept
1927 constexpr RegisterDemand& operator+=(const Temp t) noexcept
1929 if (t.type() == RegType::sgpr)
1936 constexpr RegisterDemand& operator-=(const Temp t) noexcept
1938 if (t.type() == RegType::sgpr)
1945 constexpr void update(const RegisterDemand other) noexcept
1947 vgpr = std::max(vgpr, other.vgpr);
1948 sgpr = std::max(sgpr, other.sgpr);
1956 unsigned offset = 0;
1957 std::vector<aco_ptr<Instruction>> instructions;
1958 std::vector<unsigned> logical_preds;
1959 std::vector<unsigned> linear_preds;
1960 std::vector<unsigned> logical_succs;
1961 std::vector<unsigned> linear_succs;
1962 RegisterDemand register_demand = RegisterDemand();
1963 uint16_t loop_nest_depth = 0;
1964 uint16_t divergent_if_logical_depth = 0;
1965 uint16_t uniform_if_depth = 0;
1967 int logical_idom = -1;
1968 int linear_idom = -1;
1970 /* this information is needed for predecessors to blocks with phis when
1971 * moving out of ssa */
1972 bool scc_live_out = false;
1974 Block() : index(0) {}
1978 * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
1980 enum class SWStage : uint16_t {
1982 VS = 1 << 0, /* Vertex Shader */
1983 GS = 1 << 1, /* Geometry Shader */
1984 TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
1985 TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
1986 FS = 1 << 4, /* Fragment aka Pixel Shader */
1987 CS = 1 << 5, /* Compute Shader */
1988 TS = 1 << 6, /* Task Shader */
1989 MS = 1 << 7, /* Mesh Shader */
1990 RT = 1 << 8, /* Raytracing Shader */
1992 /* Stage combinations merged to run on a single HWStage */
1999 operator|(SWStage a, SWStage b)
2001 return static_cast<SWStage>(static_cast<uint16_t>(a) | static_cast<uint16_t>(b));
2005 * Shader stages as running on the AMD GPU.
2007 * The relation between HWStages and SWStages is not a one-to-one mapping:
2008 * Some SWStages are merged by ACO to run on a single HWStage.
2009 * See README.md for details.
2011 enum class HWStage : uint8_t {
2013 ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
2014 GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
2015 NGG, /* Primitive shader, used to implement VS, TES, GS. */
2016 LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
2017 HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
2023 * Set of SWStages to be merged into a single shader paired with the
2024 * HWStage it will run on.
2027 constexpr Stage() = default;
2029 explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) {}
2031 /* Check if the given SWStage is included */
2032 constexpr bool has(SWStage stage) const
2034 return (static_cast<uint16_t>(sw) & static_cast<uint16_t>(stage));
2037 unsigned num_sw_stages() const { return util_bitcount(static_cast<uint16_t>(sw)); }
2039 constexpr bool operator==(const Stage& other) const { return sw == other.sw && hw == other.hw; }
2041 constexpr bool operator!=(const Stage& other) const { return sw != other.sw || hw != other.hw; }
2043 /* Mask of merged software stages */
2044 SWStage sw = SWStage::None;
2046 /* Active hardware stage */
2050 /* possible settings of Program::stage */
2051 static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
2052 static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
2053 static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
2054 static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
2055 /* Mesh shading pipeline */
2056 static constexpr Stage task_cs(HWStage::CS, SWStage::TS);
2057 static constexpr Stage mesh_ngg(HWStage::NGG, SWStage::MS);
2059 static constexpr Stage vertex_ngg(HWStage::NGG, SWStage::VS);
2060 static constexpr Stage vertex_geometry_ngg(HWStage::NGG, SWStage::VS_GS);
2061 static constexpr Stage tess_eval_ngg(HWStage::NGG, SWStage::TES);
2062 static constexpr Stage tess_eval_geometry_ngg(HWStage::NGG, SWStage::TES_GS);
2063 /* GFX9 (and GFX10 if NGG isn't used) */
2064 static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
2065 static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
2066 static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
2068 static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tessellation control */
2069 static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
2070 static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
2071 static constexpr Stage tess_eval_es(HWStage::ES,
2072 SWStage::TES); /* tessellation evaluation before geometry */
2073 static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
2075 static constexpr Stage raytracing_cs(HWStage::CS, SWStage::RT);
2078 uint16_t lds_encoding_granule;
2079 uint16_t lds_alloc_granule;
2080 uint32_t lds_limit; /* in bytes */
2081 bool has_16bank_lds;
2082 uint16_t physical_sgprs;
2083 uint16_t physical_vgprs;
2084 uint16_t vgpr_limit;
2085 uint16_t sgpr_limit;
2086 uint16_t sgpr_alloc_granule;
2087 uint16_t vgpr_alloc_granule;
2088 unsigned scratch_alloc_granule;
2089 unsigned max_wave64_per_simd;
2090 unsigned simd_per_cu;
2091 bool has_fast_fma32 = false;
2092 bool has_mac_legacy32 = false;
2093 bool fused_mad_mix = false;
2094 bool xnack_enabled = false;
2095 bool sram_ecc_enabled = false;
2097 int16_t scratch_global_offset_min;
2098 int16_t scratch_global_offset_max;
2099 unsigned max_nsa_vgprs;
2102 enum class CompilationProgress {
2108 class Program final {
2110 aco::monotonic_buffer_resource m{65536};
2111 std::vector<Block> blocks;
2112 std::vector<RegClass> temp_rc = {s1};
2113 RegisterDemand max_reg_demand = RegisterDemand();
2114 ac_shader_config* config;
2115 struct aco_shader_info info;
2116 enum amd_gfx_level gfx_level;
2117 enum radeon_family family;
2122 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
2123 bool needs_wqm = false; /* there exists a p_wqm instruction */
2124 bool has_color_exports = false;
2126 std::vector<uint8_t> constant_data;
2127 Temp private_segment_buffer;
2128 Temp scratch_offset;
2130 uint16_t num_waves = 0;
2131 uint16_t min_waves = 0;
2132 unsigned workgroup_size; /* if known; otherwise UINT_MAX */
2135 bool needs_vcc = false;
2137 CompilationProgress progress;
2139 bool collect_statistics = false;
2140 uint32_t statistics[aco_num_statistics];
2142 float_mode next_fp_mode;
2143 unsigned next_loop_depth = 0;
2144 unsigned next_divergent_if_logical_depth = 0;
2145 unsigned next_uniform_if_depth = 0;
2147 std::vector<Definition> args_pending_vmem;
2150 FILE* output = stderr;
2151 bool shorten_messages = false;
2152 void (*func)(void* private_data, enum aco_compiler_debug_level level, const char* message);
2156 uint32_t allocateId(RegClass rc)
2158 assert(allocationID <= 16777215);
2159 temp_rc.push_back(rc);
2160 return allocationID++;
2163 void allocateRange(unsigned amount)
2165 assert(allocationID + amount <= 16777216);
2166 temp_rc.resize(temp_rc.size() + amount);
2167 allocationID += amount;
2170 Temp allocateTmp(RegClass rc) { return Temp(allocateId(rc), rc); }
2172 uint32_t peekAllocationId() { return allocationID; }
2174 friend void reindex_ssa(Program* program);
2175 friend void reindex_ssa(Program* program, std::vector<IDSet>& live_out);
2177 Block* create_and_insert_block()
2180 return insert_block(std::move(block));
2183 Block* insert_block(Block&& block)
2185 block.index = blocks.size();
2186 block.fp_mode = next_fp_mode;
2187 block.loop_nest_depth = next_loop_depth;
2188 block.divergent_if_logical_depth = next_divergent_if_logical_depth;
2189 block.uniform_if_depth = next_uniform_if_depth;
2190 blocks.emplace_back(std::move(block));
2191 return &blocks.back();
2195 uint32_t allocationID = 1;
2199 /* live temps out per block */
2200 std::vector<IDSet> live_out;
2201 /* register demand (sgpr/vgpr) per instruction per block */
2202 std::vector<std::vector<RegisterDemand>> register_demand;
2205 struct ra_test_policy {
2206 /* Force RA to always use its pessimistic fallback algorithm */
2207 bool skip_optimistic_path = false;
2212 void init_program(Program* program, Stage stage, const struct aco_shader_info* info,
2213 enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
2214 ac_shader_config* config);
2216 void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
2217 ac_shader_config* config, const struct aco_compiler_options* options,
2218 const struct aco_shader_info* info,
2219 const struct ac_shader_args* args);
2220 void select_trap_handler_shader(Program* program, struct nir_shader* shader,
2221 ac_shader_config* config,
2222 const struct aco_compiler_options* options,
2223 const struct aco_shader_info* info,
2224 const struct ac_shader_args* args);
2225 void select_rt_prolog(Program* program, ac_shader_config* config,
2226 const struct aco_compiler_options* options,
2227 const struct aco_shader_info* info, const struct ac_shader_args* in_args,
2228 const struct ac_shader_args* out_args);
2229 void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
2230 ac_shader_config* config, const struct aco_compiler_options* options,
2231 const struct aco_shader_info* info, const struct ac_shader_args* args);
2233 void select_ps_epilog(Program* program, const struct aco_ps_epilog_info* epilog_info,
2234 ac_shader_config* config, const struct aco_compiler_options* options,
2235 const struct aco_shader_info* info, const struct ac_shader_args* args);
2237 void lower_phis(Program* program);
2238 void calc_min_waves(Program* program);
2239 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
2240 live live_var_analysis(Program* program);
2241 std::vector<uint16_t> dead_code_analysis(Program* program);
2242 void dominator_tree(Program* program);
2243 void insert_exec_mask(Program* program);
2244 void value_numbering(Program* program);
2245 void optimize(Program* program);
2246 void optimize_postRA(Program* program);
2247 void setup_reduce_temp(Program* program);
2248 void lower_to_cssa(Program* program, live& live_vars);
2249 void register_allocation(Program* program, std::vector<IDSet>& live_out_per_block,
2250 ra_test_policy = {});
2251 void ssa_elimination(Program* program);
2252 void lower_to_hw_instr(Program* program);
2253 void schedule_program(Program* program, live& live_vars);
2254 void spill(Program* program, live& live_vars);
2255 void insert_wait_states(Program* program);
2256 bool dealloc_vgprs(Program* program);
2257 void insert_NOPs(Program* program);
2258 void form_hard_clauses(Program* program);
2259 unsigned emit_program(Program* program, std::vector<uint32_t>& code,
2260 std::vector<struct aco_symbol> *symbols);
2262 * Returns true if print_asm can disassemble the given program for the current build/runtime
2265 bool check_print_asm_support(Program* program);
2266 bool print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output);
2267 bool validate_ir(Program* program);
2268 bool validate_ra(Program* program);
2270 void perfwarn(Program* program, bool cond, const char* msg, Instruction* instr = NULL);
2272 #define perfwarn(program, cond, msg, ...) \
2277 void collect_presched_stats(Program* program);
2278 void collect_preasm_stats(Program* program);
2279 void collect_postasm_stats(Program* program, const std::vector<uint32_t>& code);
2281 struct Instruction_cycle_info {
2282 /* Latency until the result is ready (if not needing a waitcnt) */
2285 /* How many cycles issuing this instruction takes (i.e. cycles till the next instruction can be
2287 unsigned issue_cycles;
2290 Instruction_cycle_info get_cycle_info(const Program& program, const Instruction& instr);
2294 print_perf_info = 0x2,
2296 print_live_vars = 0x8,
2299 void aco_print_operand(const Operand* operand, FILE* output, unsigned flags = 0);
2300 void aco_print_instr(enum amd_gfx_level gfx_level, const Instruction* instr, FILE* output,
2301 unsigned flags = 0);
2302 void aco_print_program(const Program* program, FILE* output, unsigned flags = 0);
2303 void aco_print_program(const Program* program, FILE* output, const live& live_vars,
2304 unsigned flags = 0);
2306 void _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...);
2307 void _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...);
2309 #define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
2310 #define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
2312 int get_op_fixed_to_def(Instruction* instr);
2314 /* utilities for dealing with register demand */
2315 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr);
2316 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr);
2317 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
2318 aco_ptr<Instruction>& instr_before);
2320 /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
2321 uint16_t get_extra_sgprs(Program* program);
2323 /* adjust num_waves for workgroup size and LDS limits */
2324 uint16_t max_suitable_waves(Program* program, uint16_t waves);
2326 /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */
2327 uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
2328 uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);
2330 /* return number of addressable sgprs/vgprs for max_waves */
2331 uint16_t get_addr_sgpr_from_waves(Program* program, uint16_t max_waves);
2332 uint16_t get_addr_vgpr_from_waves(Program* program, uint16_t max_waves);
2335 const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
2336 const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
2337 const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
2338 const int16_t opcode_gfx11[static_cast<int>(aco_opcode::num_opcodes)];
2339 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
2340 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
2341 const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
2342 const char* name[static_cast<int>(aco_opcode::num_opcodes)];
2343 const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
2344 /* sizes used for input/output modifiers and constants */
2345 const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
2346 const instr_class classes[static_cast<int>(aco_opcode::num_opcodes)];
2349 extern const Info instr_info;
2353 #endif /* ACO_IR_H */