}
break;
}
+ case Format::VINTERP_INREG: {
+ /* VINTERP_INREG is emitted as two dwords: the first holds the destination register,
+  * wait_exp, opsel, clamp and the opcode; the second holds the source registers and
+  * their neg modifiers. */
+ VINTERP_inreg_instruction& interp = instr->vinterp_inreg();
+ uint32_t encoding = (0b11001101u << 24);
+ encoding |= reg(ctx, instr->definitions[0], 8);
+ encoding |= (uint32_t)interp.wait_exp << 8;
+ encoding |= (uint32_t)interp.opsel << 11;
+ encoding |= (uint32_t)interp.clamp << 15;
+ encoding |= opcode << 16;
+ out.push_back(encoding);
+
+ encoding = 0;
+ /* Each source register is placed in its own 9-bit field. */
+ for (unsigned i = 0; i < instr->operands.size(); i++)
+ encoding |= reg(ctx, instr->operands[i]) << (i * 9);
+ /* Widen before shifting, like the modifier fields of the first dword, so that neg[2]
+  * is never shifted into the sign bit of a promoted int. */
+ for (unsigned i = 0; i < 3; i++)
+ encoding |= (uint32_t)interp.neg[i] << (29 + i);
+ out.push_back(encoding);
+ break;
+ }
case Format::DS: {
DS_instruction& ds = instr->ds();
uint32_t encoding = (0b110110 << 26);
("vopc_sdwa", [Format.VOPC, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2])),
("vop3", [Format.VOP3], 'VOP3_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
("vop3p", [Format.VOP3P], 'VOP3P_instruction', [(1, 2), (1, 3)]),
+ ("vinterp_inreg", [Format.VINTERP_INREG], 'VINTERP_inreg_instruction', [(1, 3)]),
("vintrp", [Format.VINTRP], 'VINTRP_instruction', [(1, 2), (1, 3)]),
("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]),
("vop2_dpp", [Format.VOP2, Format.DPP16], 'DPP16_instruction', itertools.product([1, 2], [2, 3])),
gen(instr.get(), ctx);
if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
+ if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
+ /* Fold the queued exp wait into the instruction's own wait_exp field (keeping the
+  * smaller of the two) and drop it from the queued counters. */
+ VINTERP_inreg_instruction& vinterp = instr->vinterp_inreg();
+ vinterp.wait_exp = MIN2(vinterp.wait_exp, queued_imm.exp);
+ queued_imm.exp = wait_imm::unset_counter;
+ }
+
if (!queued_imm.empty())
emit_waitcnt(ctx, new_instructions, queued_imm);
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
case aco_opcode::v_dot2_f16_f16:
case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
+ // TODO: This matches what LLVM allows. We should see if this matches what the hardware allows.
+ case aco_opcode::v_interp_p10_f16_f32_inreg:
+ case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
+ case aco_opcode::v_interp_p2_f16_f32_inreg:
+ case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
default: return false;
}
}
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
+ // TODO: VINTERP (v_interp_p2_f16_f32, v_interp_p2_rtz_f16_f32)
+
/* partial register writes are GFX9+, only */
if (gfx_level < GFX9)
return false;
/* Vector ALU Formats */
VOP3P = 20,
+ VINTERP_INREG = 21,
VOP1 = 1 << 8,
VOP2 = 1 << 9,
VOPC = 1 << 10,
struct Pseudo_barrier_instruction;
struct Pseudo_reduction_instruction;
struct VOP3P_instruction;
+struct VINTERP_inreg_instruction;
struct VOP1_instruction;
struct VOP2_instruction;
struct VOPC_instruction;
return *(VOP3P_instruction*)this;
}
constexpr bool isVOP3P() const noexcept { return format == Format::VOP3P; }
+ /* Returns this instruction viewed as a VINTERP_inreg_instruction.
+  * Asserts that the format is VINTERP_INREG. */
+ VINTERP_inreg_instruction& vinterp_inreg() noexcept
+ {
+ assert(isVINTERP_INREG());
+ return *(VINTERP_inreg_instruction*)this;
+ }
+ /* Const overload: same check, returns a read-only view. */
+ const VINTERP_inreg_instruction& vinterp_inreg() const noexcept
+ {
+ assert(isVINTERP_INREG());
+ return *(VINTERP_inreg_instruction*)this;
+ }
+ /* True when the instruction's format is exactly Format::VINTERP_INREG. */
+ constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
VOP1_instruction& vop1() noexcept
{
assert(isVOP1());
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
+/**
+ * Instruction data for the VINTERP_INREG format.
+ * The modifier fields add exactly four bytes to Instruction, which the static_assert
+ * following the struct verifies.
+ */
+struct VINTERP_inreg_instruction : public Instruction {
+ uint8_t wait_exp : 3; /* 3-bit wait count; presumably the export counter to wait for */
+ bool clamp : 1;       /* clamp modifier */
+ uint8_t opsel : 4;    /* one bit per operand (bits 0-2); bit 3 applies to the destination */
+ bool neg[3];          /* per-operand negate modifiers */
+};
+static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+
/**
* Data Parallel Primitives Format:
* This format can be used for VOP1, VOP2 or VOPC instructions.
can_remove = false;
} else if (inst->isSALU()) {
num_scalar++;
- } else if (inst->isVALU() || inst->isVINTRP()) {
+ } else if (inst->isVALU() || inst->isVINTRP() || inst->isVINTERP_INREG()) {
num_vector++;
/* VALU which writes SGPRs are always executed on GFX10+ */
if (ctx.program->gfx_level >= GFX10) {
PSEUDO_BARRIER = 18
PSEUDO_REDUCTION = 19
VOP3P = 20
+ VINTERP_INREG = 21
VOP1 = 1 << 8
VOP2 = 1 << 9
VOPC = 1 << 10
elif self == Format.VOP3P:
return [('uint8_t', 'opsel_lo', None),
('uint8_t', 'opsel_hi', None)]
+ elif self == Format.VINTERP_INREG:
+ return [('unsigned', 'wait_exp', 7),
+ ('uint8_t', 'opsel', 0)]
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
return [('int16_t', 'offset', 0),
('memory_sync_info', 'sync', 'memory_sync_info()'),
opcode("v_dot2_f32_bf16", -1, -1, -1, 0x1a, Format.VOP3P, InstrClass.Valu32)
-# VINTERP instructions:
+# VINTRP (GFX6 - GFX10.3) instructions:
VINTRP = {
(0x00, "v_interp_p1_f32"),
(0x01, "v_interp_p2_f32"),
for (code, name) in VINTRP:
opcode(name, code, code, code, -1, Format.VINTRP, InstrClass.Valu32)
+
+# VINTERP (GFX11+) instructions:
+# Each entry is (opcode number, name). When registering them, only the last of the four
+# encoding arguments of opcode() is set to the opcode number; the first three are -1.
+VINTERP = {
+ (0x00, "v_interp_p10_f32_inreg"),
+ (0x01, "v_interp_p2_f32_inreg"),
+ (0x02, "v_interp_p10_f16_f32_inreg"),
+ (0x03, "v_interp_p2_f16_f32_inreg"),
+ (0x04, "v_interp_p10_rtz_f16_f32_inreg"),
+ (0x05, "v_interp_p2_rtz_f16_f32_inreg"),
+}
+for (code, name) in VINTERP:
+ opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32)
+
+
# VOP3 instructions: 3 inputs, 1 output
# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
VOP3 = {
switch (instr->format) {
case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr);
case Format::VINTRP: return hash_murmur_32<VINTRP_instruction>(instr);
+ case Format::VINTERP_INREG: return hash_murmur_32<VINTERP_inreg_instruction>(instr);
case Format::DS: return hash_murmur_32<DS_instruction>(instr);
case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr);
case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr);
return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi &&
a3P.clamp == b3P.clamp;
}
+ case Format::VINTERP_INREG: {
+ /* Two VINTERP_INREG instructions match only if every modifier field is identical. */
+ VINTERP_inreg_instruction& lhs = a->vinterp_inreg();
+ VINTERP_inreg_instruction& rhs = b->vinterp_inreg();
+ if (lhs.wait_exp != rhs.wait_exp || lhs.clamp != rhs.clamp || lhs.opsel != rhs.opsel)
+ return false;
+ for (unsigned i = 0; i < 3; i++) {
+ if (lhs.neg[i] != rhs.neg[i])
+ return false;
+ }
+ return true;
+ }
case Format::PSEUDO_REDUCTION: {
Pseudo_reduction_instruction& aR = a->reduction();
Pseudo_reduction_instruction& bR = b->reduction();
print_sync(smem.sync, output);
break;
}
+ case Format::VINTERP_INREG: {
+ /* wait_exp is printed only when it differs from 7. */
+ const unsigned wait_exp = instr->vinterp_inreg().wait_exp;
+ if (wait_exp != 7)
+ fprintf(output, " wait_exp:%u", wait_exp);
+ break;
+ }
case Format::VINTRP: {
const VINTRP_instruction& vintrp = instr->vintrp();
fprintf(output, " attr%d.%c", vintrp.attribute, "xyzw"[vintrp.component]);
default: break;
}
}
+ } else if (instr->isVINTERP_INREG()) {
+ /* Trailing modifiers of VINTERP_INREG: " clamp" when the clamp bit is set, and
+  * " opsel_hi" when bit 3 of opsel is set. */
+ const VINTERP_inreg_instruction& vinterp = instr->vinterp_inreg();
+ if (vinterp.clamp)
+ fprintf(output, " clamp");
+ if (vinterp.opsel & (1 << 3))
+ fprintf(output, " opsel_hi");
}
}
f2f32[i] = vop3p.opsel_hi & (1 << i);
opsel[i] = f2f32[i] && (vop3p.opsel_lo & (1 << i));
}
+ } else if (instr->isVINTERP_INREG()) {
+ /* Gather the neg and opsel modifiers for at most the first three operands. */
+ const VINTERP_inreg_instruction& vinterp = instr->vinterp_inreg();
+ for (unsigned i = 0; i < MIN2(num_operands, 3); ++i) {
+ neg[i] = vinterp.neg[i];
+ opsel[i] = vinterp.opsel & (1 << i);
+ }
}
for (unsigned i = 0; i < num_operands; ++i) {
if (i)
}
assert(rc.bytes() <= 2);
- if (instr->isVALU()) {
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
if (can_use_SDWA(gfx_level, instr, false))
return rc.bytes();
if (can_use_opsel(gfx_level, instr->opcode, idx))
return;
assert(rc.bytes() <= 2);
- if (instr->isVALU()) {
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
/* check if we can use opsel */
if (instr->format == Format::VOP3) {
assert(byte == 2);
instr->vop3().opsel |= 1 << idx;
return;
}
+ if (instr->isVINTERP_INREG()) {
+ /* VINTERP_INREG keeps opsel in its own field: set the bit for operand idx.
+  * Only a byte offset of 2 is expected here. */
+ assert(byte == 2);
+ instr->vinterp_inreg().opsel |= 1 << idx;
+ return;
+ }
if (instr->isVOP3P()) {
assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
instr->vop3p().opsel_lo |= 1 << idx;
return std::make_pair(4, rc.size() * 4u);
}
- if (instr->isVALU() || instr->isVINTRP()) {
+ if (instr->isVALU() || instr->isVINTRP() || instr->isVINTERP_INREG()) {
assert(rc.bytes() <= 2);
if (can_use_SDWA(gfx_level, instr, false))
if (instr->isPseudo())
return;
- if (instr->isVALU()) {
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
amd_gfx_level gfx_level = program->gfx_level;
assert(instr->definitions[0].bytes() <= 2);
assert(can_use_opsel(gfx_level, instr->opcode, -1));
instr->vop3().opsel |= (1 << 3); /* dst in high half */
return;
+ } else if (instr->isVINTERP_INREG()) {
+ /* VINTERP_INREG keeps opsel in its own field; bit 3 refers to the destination.
+  * Only a definition at byte offset 2 is expected, and the opcode must allow opsel
+  * on its destination (index -1). */
+ assert(reg.byte() == 2);
+ assert(can_use_opsel(gfx_level, instr->opcode, -1));
+ instr->vinterp_inreg().opsel |= (1 << 3); /* dst in high half */
+ return;
}
if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
instr.get());
}
- if (instr->isSALU() || instr->isVALU()) {
+ if (instr->isSALU() || instr->isVALU() || instr->isVINTERP_INREG()) {
/* check literals */
Operand literal(s1);
for (unsigned i = 0; i < instr->operands.size(); i++) {
}
/* check num sgprs for VALU */
- if (instr->isVALU()) {
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
instr->opcode == aco_opcode::v_lshrrev_b64 ||
instr->opcode == aco_opcode::v_ashrrev_i64;
if (program->gfx_level >= GFX10 && !is_shift64)
const_bus_limit = 2;
- uint32_t scalar_mask = instr->isVOP3() || instr->isVOP3P() ? 0x7 : 0x5;
+ uint32_t scalar_mask =
+ instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG() ? 0x7 : 0x5;
if (instr->isSDWA())
scalar_mask = program->gfx_level >= GFX9 ? 0x7 : 0x4;
else if (instr->isDPP())
if (instr->isPseudo())
return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
- if (instr->isVALU()) {
+ if (instr->isVALU() || instr->isVINTERP_INREG()) {
assert(def.bytes() <= 2);
if (instr->isSDWA())
return instr->sdwa().dst_sel.size();
finish_ra_test(ra_test_policy());
END_TEST
+
+BEGIN_TEST(regalloc.vinterp_fp16)
+ /* Register allocation of 16-bit (v2b) values used with the GFX11 f16 VINTERP opcodes:
+  * a v2b operand living in the high half of a VGPR is expected to print as hi(...),
+  * and a v2b definition placed in the high half is expected to print opsel_hi. */
+ //>> v1: %in0:v[0], v1: %in1:v[1], v1: %in2:v[2] = p_startpgm
+ if (!setup_cs("v1 v1 v1", GFX11))
+ return;
+
+ //! v2b: %lo:v[3][0:16], v2b: %hi:v[3][16:32] = p_split_vector %in0:v[0]
+ Temp lo = bld.tmp(v2b);
+ Temp hi = bld.tmp(v2b);
+ bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), inputs[0]);
+
+ //! v1: %tmp0:v[1] = v_interp_p10_f16_f32_inreg %lo:v[3][0:16], %in1:v[1], hi(%hi:v[3][16:32])
+ //! p_unit_test %tmp0:v[1]
+ Temp tmp0 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), lo, inputs[1], hi);
+ bld.pseudo(aco_opcode::p_unit_test, tmp0);
+
+ //! v2b: %tmp1:v[0][16:32] = v_interp_p2_f16_f32_inreg %in0:v[0], %in2:v[2], %tmp0:v[1] opsel_hi
+ //! v1: %tmp2:v[0] = p_create_vector 0, %tmp1:v[0][16:32]
+ //! p_unit_test %tmp2:v[0]
+ Temp tmp1 = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0], inputs[2], tmp0);
+ Temp tmp2 = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand::zero(2), tmp1);
+ bld.pseudo(aco_opcode::p_unit_test, tmp2);
+
+ finish_ra_test(ra_test_policy());
+END_TEST