if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
- Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
.addReg(VIndex.getReg())
.addImm(AMDGPU::sub0)
bool hasGFX90AInsts() const { return GFX90AInsts; }
+  /// Return true if operations acting on VGPR tuples require even alignment.
+ bool needsAlignedVGPRs() const { return GFX90AInsts; }
+
bool hasPackedTID() const { return HasPackedTID; }
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
- addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
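+  // On subtargets that require even VGPR alignment this returns
+  // VReg_64_Align2 instead of VReg_64.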
+
+ addRegisterClass(MVT::f64, V64RegClass);
+ addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
//===----------------------------------------------------------------------===//
std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
StringRef Constraint,
MVT VT) const {
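+  // Downcast so the subtarget-aware class-for-bit-width lookups below can
+  // return the aligned register classes when required.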
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
const unsigned BitWidth = VT.getSizeInBits();
RC = &AMDGPU::VGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ RC = TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
RC = &AMDGPU::AGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ RC = TRI->getAGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
return false;
}
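+// Map an unaligned VGPR or AGPR tuple class to its even-aligned counterpart,
+// or return -1 if no aligned variant is defined for the class.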
+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+ switch (UnalignedClassID) {
+ case AMDGPU::VReg_64RegClassID:
+ return AMDGPU::VReg_64_Align2RegClassID;
+ case AMDGPU::VReg_96RegClassID:
+ return AMDGPU::VReg_96_Align2RegClassID;
+ case AMDGPU::VReg_128RegClassID:
+ return AMDGPU::VReg_128_Align2RegClassID;
+ case AMDGPU::VReg_256RegClassID:
+ return AMDGPU::VReg_256_Align2RegClassID;
+ case AMDGPU::VReg_512RegClassID:
+ return AMDGPU::VReg_512_Align2RegClassID;
+ case AMDGPU::AReg_64RegClassID:
+ return AMDGPU::AReg_64_Align2RegClassID;
+ case AMDGPU::AReg_96RegClassID:
+ return AMDGPU::AReg_96_Align2RegClassID;
+ case AMDGPU::AReg_128RegClassID:
+ return AMDGPU::AReg_128_Align2RegClassID;
+ case AMDGPU::AReg_256RegClassID:
+ return AMDGPU::AReg_256_Align2RegClassID;
+ case AMDGPU::AReg_512RegClassID:
+ return AMDGPU::AReg_512_Align2RegClassID;
+ case AMDGPU::AReg_1024RegClassID:
+ return AMDGPU::AReg_1024_Align2RegClassID;
+ default:
+ return -1;
+ }
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
}
}
+  // FIXME: This is a hack to fix up AGPR classes to use the properly aligned
+  // classes if required. Ideally the register class constraints would differ
+  // per-subtarget, but there's no easy way to achieve that right now. This is
+  // not a problem for VGPRs because the correctly aligned VGPR class is implied
+  // by using them as the register class for legal types.
+ if (ST.needsAlignedVGPRs()) {
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const Register Reg = Register::index2VirtReg(I);
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (!RC)
+ continue;
+ int NewClassID = getAlignedAGPRClassID(RC->getID());
+ if (NewClassID != -1)
+ MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
// Allocate a VGPR for future SGPR Spill if
return;
}
- if (RC == &AMDGPU::VReg_64RegClass &&
+ if (RC->hasSuperClassEq(&AMDGPU::VReg_64RegClass) &&
!RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) {
if (ST.hasPackedFP32Ops()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(Value);
return;
}
- if (RegClass == &AMDGPU::VReg_64RegClass) {
+ if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
.addImm(Value);
return;
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
- if (MI.getOperand(i).isFPImm()) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
"all fp values to integers.";
return false;
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
- const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
ErrInfo = "Illegal immediate value for operand.";
return false;
continue;
}
- if (!MI.getOperand(i).isReg())
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
continue;
+ // FIXME: Ideally we would have separate instruction definitions with the
+ // aligned register constraint.
+  // FIXME: We do not verify inline asm operands, but custom inline asm
+  // verification is broken anyway.
+ if (ST.needsAlignedVGPRs()) {
+ const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
+ const bool IsVGPR = RI.hasVGPRs(RC);
+ const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
+ if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
+ const TargetRegisterClass *SubRC =
+ RI.getSubRegClass(RC, MO.getSubReg());
+ RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
+ if (RC)
+ RC = SubRC;
+ }
+
+ // Check that this is the aligned version of the class.
+ if (!RC || ((IsVGPR && !RC->hasSuperClassEq(RI.getVGPRClassForBitWidth(
+ RI.getRegSizeInBits(*RC)))) ||
+ (IsAGPR && !RC->hasSuperClassEq(RI.getAGPRClassForBitWidth(
+ RI.getRegSizeInBits(*RC)))))) {
+ ErrInfo = "Subtarget requires even aligned vector registers";
+ return false;
+ }
+ }
+
if (RegClass != -1) {
- Register Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister || Reg.isVirtual())
+ if (Reg.isVirtual())
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
((DstIdx >= 0 &&
- Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) ||
- ((Src0Idx >= 0 &&
- Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) &&
+ (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
+ ((Src0Idx >= 0 &&
+ (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
+ Desc.OpInfo[Src0Idx].RegClass ==
+ AMDGPU::VReg_64_Align2RegClassID)))) &&
!AMDGPU::isLegal64BitDPPControl(DC)) {
ErrInfo = "Invalid dpp_ctrl value: "
"64 bit dpp only support row_newbcast";
Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
- if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
- VRC = &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
+ if (RI.getCommonSubClass(VRC64, VRC))
+ VRC = VRC64;
else
VRC = &AMDGPU::VGPR_32RegClass;
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
(as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
(as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
(as_i32timm $row_mask), (as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
>;
return &AMDGPU::SGPR_512RegClass;
}
}
- const TargetRegisterClass *RC = nullptr;
- switch (CI.Width + Paired.Width) {
- default:
- return nullptr;
- case 2:
- RC = &AMDGPU::VReg_64RegClass;
- break;
- case 3:
- RC = &AMDGPU::VReg_96RegClass;
- break;
- case 4:
- RC = &AMDGPU::VReg_128RegClass;
- break;
- }
-
- if (TRI->hasAGPRs(getDataRegClass(*CI.I)))
- return TRI->getEquivalentAGPRClass(RC);
-
- return RC;
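+  // CI.Width and Paired.Width count 32-bit components, so the width of the
+  // merged result in bits is 32 times their sum.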
+ unsigned BitWidth = 32 * (CI.Width + Paired.Width);
+ return TRI->hasAGPRs(getDataRegClass(*CI.I))
+ ? TRI->getAGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
reserveRegisterTuples(Reserved, Reg);
}
- if (ST.hasGFX90AInsts())
- for (const TargetRegisterClass *RC : this->regclasses())
- if (getRegSizeInBits(*RC) > 32 && hasVectorRegisters(RC))
- for (unsigned Reg : *RC)
- if (getEncodingValue(Reg) & 1)
- Reserved.set(Reg);
-
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
return AMDGPUInstPrinter::getRegisterName(Reg);
}
-const TargetRegisterClass *
-SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth == 1)
- return &AMDGPU::VReg_1RegClass;
- if (BitWidth <= 16)
- return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
- return &AMDGPU::VGPR_32RegClass;
+static const TargetRegisterClass *
+getAnyVGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::VReg_64RegClass;
if (BitWidth <= 96)
return nullptr;
}
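+// getVGPRClassForBitWidth variant that only returns the even-aligned tuple
+// classes, used when the subtarget requires aligned VGPRs.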
+static const TargetRegisterClass *
+getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
const TargetRegisterClass *
-SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth == 1)
+ return &AMDGPU::VReg_1RegClass;
if (BitWidth <= 16)
- return &AMDGPU::AGPR_LO16RegClass;
+ return &AMDGPU::VGPR_LO16RegClass;
if (BitWidth <= 32)
- return &AMDGPU::AGPR_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
+ : getAnyVGPRClassForBitWidth(BitWidth);
+}
+
+static const TargetRegisterClass *
+getAnyAGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 64)
return &AMDGPU::AReg_64RegClass;
if (BitWidth <= 96)
return nullptr;
}
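+// AGPR counterpart of getAlignedVGPRClassForBitWidth above.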
+static const TargetRegisterClass *
+getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 64)
+ return &AMDGPU::AReg_64_Align2RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::AReg_96_Align2RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::AReg_128_Align2RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::AReg_160_Align2RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::AReg_192_Align2RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::AReg_256_Align2RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::AReg_512_Align2RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::AReg_1024_Align2RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
+ if (BitWidth <= 16)
+ return &AMDGPU::AGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::AGPR_32RegClass;
+ return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
+ : getAnyAGPRClassForBitWidth(BitWidth);
+}
+
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth <= 16)
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
+ &AMDGPU::AGPR_32RegClass,
+ &AMDGPU::VReg_64_Align2RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::AReg_64_Align2RegClass,
&AMDGPU::AReg_64RegClass,
+ &AMDGPU::VReg_96_Align2RegClass,
&AMDGPU::VReg_96RegClass,
&AMDGPU::SReg_96RegClass,
+ &AMDGPU::AReg_96_Align2RegClass,
&AMDGPU::AReg_96RegClass,
+ &AMDGPU::VReg_128_Align2RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
+ &AMDGPU::AReg_128_Align2RegClass,
&AMDGPU::AReg_128RegClass,
+ &AMDGPU::VReg_160_Align2RegClass,
&AMDGPU::VReg_160RegClass,
&AMDGPU::SReg_160RegClass,
+ &AMDGPU::AReg_160_Align2RegClass,
&AMDGPU::AReg_160RegClass,
+ &AMDGPU::VReg_192_Align2RegClass,
&AMDGPU::VReg_192RegClass,
&AMDGPU::SReg_192RegClass,
+ &AMDGPU::AReg_192_Align2RegClass,
&AMDGPU::AReg_192RegClass,
+ &AMDGPU::VReg_256_Align2RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
+ &AMDGPU::AReg_256_Align2RegClass,
&AMDGPU::AReg_256RegClass,
+ &AMDGPU::VReg_512_Align2RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
+ &AMDGPU::AReg_512_Align2RegClass,
&AMDGPU::AReg_512RegClass,
&AMDGPU::SReg_1024RegClass,
+ &AMDGPU::VReg_1024_Align2RegClass,
&AMDGPU::VReg_1024RegClass,
+ &AMDGPU::AReg_1024_Align2RegClass,
&AMDGPU::AReg_1024RegClass,
&AMDGPU::SCC_CLASSRegClass,
&AMDGPU::Pseudo_SReg_32RegClass,
return RC;
}
+const TargetRegisterClass *
+SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const {
+ // Ensure this subregister index is aligned in the super register.
+ const TargetRegisterClass *MatchRC =
+ getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
+ return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
+}
+
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
+const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
+ // VGPR tuples have an alignment requirement on gfx90a variants.
+ return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
+ : &AMDGPU::VReg_64RegClass;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
switch ((int)RCID) {
return getEncodingValue(Reg) & 0xff;
}
- static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
- static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
+ LLVM_READONLY
+ const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
+ const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth) const;
+
+ LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
/// Return the 'base' register class for this register.
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
- /// \returns The register class that is used for a sub-register of \p RC for
- /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
- /// be returned.
+ /// \returns The canonical register class that is used for a sub-register of
+ /// \p RC for the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC
+ /// will be returned.
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+ /// Returns a register class which is compatible with \p SuperRC, such that a
+ /// subregister exists with class \p SubRC with subregister index \p
+ /// SubIdx. If this is impossible (e.g., an unaligned subregister index within
+ /// a register tuple), return null.
+ const TargetRegisterClass *
+ getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
+ const TargetRegisterClass *SubRC,
+ unsigned SubIdx) const;
+
bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
unsigned DefSubReg,
const TargetRegisterClass *SrcRC,
: &AMDGPU::SReg_64_XEXECRegClass;
}
+  // Return the appropriate register class to use for 64-bit VGPRs on this
+  // subtarget.
+ const TargetRegisterClass *getVGPR64Class() const;
+
MCRegister getVCC() const;
const TargetRegisterClass *getRegClass(unsigned RCID) const;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
RegisterClass<"AMDGPU", regTypes, 32, regList> {
let Size = !mul(numRegs, 32);
let Weight = numRegs;
}
-def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
- (add VGPR_64)>;
-def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
-def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
-def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
-def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
-def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
-def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+// Define a register tuple class, along with one requiring an
+// even-aligned base register.
+multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
-class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> :
- VRegClass<numRegs, regTypes, regList> {
- // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr
- let CopyCost = !add(numRegs, numRegs, 1);
+ // Define 2-aligned variant
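+  // (decimate regList, 2) keeps every other tuple, i.e. only the tuples
+  // whose base register is even-aligned.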
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
}
-def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
+defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+ (add VGPR_64)>;
+defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
+
+defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
+
+multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
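+  // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr.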
+ let CopyCost = !add(numRegs, numRegs, 1) in {
+ // Define the regular class.
+ def "" : VRegClassBase<numRegs, regTypes, regList>;
+
+ // Define 2-aligned variant
+ def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)>;
+ }
+}
+
+defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
-def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
-def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
-def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
-def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
-def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
-def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
+defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
+defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
+defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
+defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
} // End GeneratePressureSet = 0
case AMDGPU::VReg_64RegClassID:
case AMDGPU::AReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
+ case AMDGPU::VReg_64_Align2RegClassID:
+ case AMDGPU::AReg_64_Align2RegClassID:
return 64;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
case AMDGPU::AReg_96RegClassID:
+ case AMDGPU::VReg_96_Align2RegClassID:
+ case AMDGPU::AReg_96_Align2RegClassID:
case AMDGPU::AV_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
case AMDGPU::VReg_128RegClassID:
case AMDGPU::AReg_128RegClassID:
+ case AMDGPU::VReg_128_Align2RegClassID:
+ case AMDGPU::AReg_128_Align2RegClassID:
case AMDGPU::AV_128RegClassID:
return 128;
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
case AMDGPU::AReg_160RegClassID:
+ case AMDGPU::VReg_160_Align2RegClassID:
+ case AMDGPU::AReg_160_Align2RegClassID:
case AMDGPU::AV_160RegClassID:
return 160;
case AMDGPU::SGPR_192RegClassID:
case AMDGPU::SReg_192RegClassID:
case AMDGPU::VReg_192RegClassID:
case AMDGPU::AReg_192RegClassID:
+ case AMDGPU::VReg_192_Align2RegClassID:
+ case AMDGPU::AReg_192_Align2RegClassID:
return 192;
case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
case AMDGPU::AReg_256RegClassID:
+ case AMDGPU::VReg_256_Align2RegClassID:
+ case AMDGPU::AReg_256_Align2RegClassID:
return 256;
case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
+ case AMDGPU::VReg_512_Align2RegClassID:
+ case AMDGPU::AReg_512_Align2RegClassID:
return 512;
case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
+ case AMDGPU::VReg_1024_Align2RegClassID:
+ case AMDGPU::AReg_1024_Align2RegClassID:
return 1024;
default:
llvm_unreachable("Unexpected register class");
; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GFX90A: bb.2:
; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX90A: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX90A: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX90A: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX90A: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GFX90A: bb.2:
; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
; GFX90A: S_ENDPGM 0
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
; GFX90A: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GFX90A: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
; GFX90A: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX90A: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX90A: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX90A: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX90A: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GFX90A: bb.2:
; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
- ; GFX90A: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX90A: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GFX90A: bb.2:
; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
; GFX90A: S_ENDPGM 0
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
; GFX90A: S_ENDPGM 0
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s
-# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s
---
# GCN-LABEL: name: alloc_vgpr_64
# GFX908: $vgpr3_vgpr4 = GLOBAL_LOAD
-# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD
name: alloc_vgpr_64
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_96
# GFX908: $vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD
-# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD
name: alloc_vgpr_96
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_128
# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD
-# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD
name: alloc_vgpr_128
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_160
# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD
-# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD
name: alloc_vgpr_160
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_256
# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 = COPY
-# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY
name: alloc_vgpr_256
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_512
# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF
-# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
name: alloc_vgpr_512
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_vgpr_1024
# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 = IMPLICIT_DEF
-# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
name: alloc_vgpr_1024
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_agpr_64
# GFX908: $agpr1_agpr2 = IMPLICIT_DEF
-# GFX90A: $agpr2_agpr3 = IMPLICIT_DEF
name: alloc_agpr_64
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_agpr_128
# GFX908: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
-# GFX90A: $agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
name: alloc_agpr_128
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_agpr_512
# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF
-# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = IMPLICIT_DEF
name: alloc_agpr_512
tracksRegLiveness: true
liveins:
---
# GCN-LABEL: name: alloc_agpr_1024
# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF
-# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = IMPLICIT_DEF
name: alloc_agpr_1024
tracksRegLiveness: true
liveins:
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s
+# Using unaligned vector tuples is OK as long as they aren't used
+# in a real instruction.
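+# The *_align2 classes may only be assigned an even-numbered base register
+# (hence $vgpr4 rather than $vgpr3 below), while the plain areg tuples may
+# still start at an odd register (e.g. $agpr1) since only COPYs read them.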
+
+---
+# GCN-LABEL: name: alloc_vgpr_64
+# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD
+name: alloc_vgpr_64
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_96
+# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD
+name: alloc_vgpr_96
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_128
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD
+name: alloc_vgpr_128
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_160
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD
+name: alloc_vgpr_160
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_160_align2 = IMAGE_LOAD_V5_V1 %1, undef %3:sgpr_256, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_256
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY
+name: alloc_vgpr_256
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %3:sgpr_256 = IMPLICIT_DEF
+ %2:vreg_256_align2 = COPY %3:sgpr_256
+ %4:vreg_128_align2 = IMAGE_SAMPLE_C_CL_O_V4_V8 %2, %3:sgpr_256, undef %5:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_512
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+name: alloc_vgpr_512
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_512_align2 = IMPLICIT_DEF
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_vgpr_1024
+# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+name: alloc_vgpr_1024
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$vgpr2' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %2:vreg_1024_align2 = IMPLICIT_DEF
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_64
+# GFX90A: $agpr1_agpr2 = IMPLICIT_DEF
+name: alloc_agpr_64
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$agpr0' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $agpr0
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %3:areg_64 = IMPLICIT_DEF
+ %2:vreg_64_align2 = COPY %3:areg_64
+ GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ %1:vgpr_32 = COPY $agpr0
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_128
+# GFX90A: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
+name: alloc_agpr_128
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$agpr0' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $agpr0
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %3:areg_128 = IMPLICIT_DEF
+ %2:vreg_128_align2 = COPY %3:areg_128
+ GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ %1:vgpr_32 = COPY $agpr0
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_512
+# GFX90A: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF
+name: alloc_agpr_512
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$agpr0' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $agpr0
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %3:areg_512 = IMPLICIT_DEF
+ %2:vreg_512_align2 = COPY %3:areg_512
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+ %1:vgpr_32 = COPY $agpr0
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+# GCN-LABEL: name: alloc_agpr_1024
+# GFX90A: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF
+name: alloc_agpr_1024
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0_vgpr1' }
+ - { reg: '$agpr0' }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $agpr0
+
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %3:areg_1024 = IMPLICIT_DEF
+ %2:vreg_1024_align2 = COPY %3:areg_1024
+ GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec
+ %1:vgpr_32 = COPY $agpr0
+ GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec
+...
---
# GCN-LABEL: name: dpp64_old_impdef
-# GCN: %3:vreg_64 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec
---
name: dpp64_old_impdef
tracksRegLiveness: true
body: |
bb.0:
- %0:vreg_64 = IMPLICIT_DEF
- %1:vreg_64 = IMPLICIT_DEF
- %2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 337, 15, 15, 1, implicit $exec
- %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO %1, %0, 337, 15, 15, 1, implicit $exec
+ %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dpp64_old_undef
-# GCN: %3:vreg_64 = V_CEIL_F64_dpp undef %1:vreg_64, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp undef %1:vreg_64_align2, 0, undef %2:vreg_64_align2, 337, 15, 15, 1, implicit $mode, implicit $exec
---
name: dpp64_old_undef
tracksRegLiveness: true
body: |
bb.0:
- %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec
- %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+ %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64_align2, undef %0:vreg_64_align2, 337, 15, 15, 1, implicit $exec
+ %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
...
# GCN-LABEL: name: dpp64_old_is_0
-# GCN: %3:vreg_64 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %3:vreg_64_align2 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64_align2, 337, 15, 15, 1, implicit $mode, implicit $exec
name: dpp64_old_is_0
tracksRegLiveness: true
body: |
bb.0:
- %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec
- %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+ %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64_align2, 337, 15, 15, 1, implicit $exec
+ %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
...
# DPP64 does not support all control values and must be split to become legal
# GCN-LABEL: name: dpp64_illegal_ctrl
-# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, 1, 15, 15, 1, implicit $exec
-# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec
-# GCN: %0:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
-# GCN: %3:vreg_64 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec
+# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64_align2, undef %2.sub0:vreg_64_align2, 1, 15, 15, 1, implicit $exec
+# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64_align2, undef %2.sub1:vreg_64_align2, 1, 15, 15, 1, implicit $exec
+# GCN: %0:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
+# GCN: %3:vreg_64_align2 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec
name: dpp64_illegal_ctrl
tracksRegLiveness: true
body: |
bb.0:
- %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
- %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
+ %2:vreg_64_align2 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64_align2, undef %0:vreg_64_align2, 1, 15, 15, 1, implicit $exec
+ %3:vreg_64_align2 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec
...
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=finalize-isel -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GFX90A %s
; Make sure we only use one 128-bit register instead of 2 for i128 asm
; constraints
define amdgpu_kernel void @s_input_output_i128() {
- ; CHECK-LABEL: name: s_input_output_i128
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:SGPR_128 */, def %4
- ; CHECK: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
- ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:SGPR_128 */, [[COPY]]
- ; CHECK: S_ENDPGM 0
+ ; GFX908-LABEL: name: s_input_output_i128
+ ; GFX908: bb.0 (%ir-block.0):
+ ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+ ; GFX908: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
+ ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908: S_ENDPGM 0
+ ; GFX90A-LABEL: name: s_input_output_i128
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:SGPR_128 */, def %4
+ ; GFX90A: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
+ ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
ret void
}
define amdgpu_kernel void @v_input_output_i128() {
- ; CHECK-LABEL: name: v_input_output_i128
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_128 */, def %4
- ; CHECK: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
- ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_128 */, [[COPY]]
- ; CHECK: S_ENDPGM 0
+ ; GFX908-LABEL: name: v_input_output_i128
+ ; GFX908: bb.0 (%ir-block.0):
+ ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4390922 /* regdef:VReg_128 */, def %4
+ ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
+ ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4390921 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908: S_ENDPGM 0
+ ; GFX90A-LABEL: name: v_input_output_i128
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4521994 /* regdef:VReg_128_Align2 */, def %4
+ ; GFX90A: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
+ ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
ret void
}
define amdgpu_kernel void @a_input_output_i128() {
- ; CHECK-LABEL: name: a_input_output_i128
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:AReg_128 */, def %4
- ; CHECK: [[COPY:%[0-9]+]]:areg_128 = COPY %4
- ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3932169 /* reguse:AReg_128 */, [[COPY]]
- ; CHECK: S_ENDPGM 0
+ ; GFX908-LABEL: name: a_input_output_i128
+ ; GFX908: bb.0 (%ir-block.0):
+ ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4325386 /* regdef:AReg_128 */, def %4
+ ; GFX908: [[COPY:%[0-9]+]]:areg_128 = COPY %4
+ ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4325385 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908: S_ENDPGM 0
+ ; GFX90A-LABEL: name: a_input_output_i128
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4456458 /* regdef:AReg_128_Align2 */, def %4
+ ; GFX90A: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
+ ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4456457 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
ret void
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s
# GCN-LABEL: name: ds_read_b32_v_v
-# GCN: vreg_64 = DS_READ2_B32
+# GCN: vreg_64_align2 = DS_READ2_B32
name: ds_read_b32_v_v
body: |
bb.0:
...
# GCN-LABEL: name: ds_read_b32_a_a
-# GCN: areg_64 = DS_READ2_B32
+# GCN: areg_64_align2 = DS_READ2_B32
name: ds_read_b32_a_a
body: |
bb.0:
# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s
# Make sure we do not produce an early-clobber list with odd subregs.
-# Odd vector subregs are reserved on gfx90a and verifier complaints after RA.
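+# With the aligned tuple classes, an odd-aligned early-clobber tuple would
+# fail the machine verifier after RA.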
# GCN-LABEL: name: long_reg_clause
-# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512, dead early-clobber %3:areg_512 = BUNDLE %0, implicit $exec {
+# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512_align2, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512_align2, dead early-clobber %3:areg_512_align2 = BUNDLE %0, implicit $exec {
---
name: long_reg_clause
body: |
bb.0.entry:
- %0:vreg_64 = IMPLICIT_DEF
- undef %1.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec
- %1.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec
- %1.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec
- dead %1.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec
- undef %2.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec
- %2.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec
- %2.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec
- dead %2.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec
- undef %3.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
- %3.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
- %3.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
- dead %3.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
- undef %4.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec
- %4.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec
- %4.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ undef %1.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec
+ %1.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec
+ %1.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec
+ dead %1.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec
+ undef %2.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec
+ %2.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec
+ %2.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec
+ dead %2.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec
+ undef %3.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec
+ %3.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec
+ %3.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec
+ dead %3.sub0_sub1_sub2_sub3:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+ undef %4.sub12_sub13_sub14_sub15:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec
+ %4.sub8_sub9_sub10_sub11:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec
+ %4.sub4_sub5_sub6_sub7:areg_512_align2 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec
...
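
To make the mechanical renames above concrete: on gfx90a, VGPR and AGPR tuples of
64 bits or wider must start at an even-numbered register, and the "_align2"
register classes are the variants that encode this guarantee. Below is a minimal
standalone model of the class selection, not LLVM's implementation; the function
name and its string-based class names are purely illustrative.

#include <cassert>
#include <string>

// Illustrative only: picks the register class name this patch uses for a
// vector value of the given width, depending on whether the subtarget
// requires even-aligned VGPR tuples (as gfx90a does).
static std::string vgprClassFor(unsigned BitWidth, bool NeedsAlignedVGPRs) {
  if (BitWidth <= 32)
    return "vgpr_32"; // single registers carry no alignment constraint
  std::string Cls = "vreg_" + std::to_string(BitWidth);
  return NeedsAlignedVGPRs ? Cls + "_align2" : Cls;
}

int main() {
  assert(vgprClassFor(64, true) == "vreg_64_align2");   // gfx90a
  assert(vgprClassFor(128, true) == "vreg_128_align2"); // gfx90a
  assert(vgprClassFor(64, false) == "vreg_64");         // older targets
  return 0;
}
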
---
name: test_fmamk_reg_imm_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
---
name: test_fmamk_imm_reg_f64
registers:
- - { id: 0, class: vreg_128 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 0, class: vreg_128_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
---
name: test_fmaak_f64
registers:
- - { id: 0, class: vreg_128 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
+ - { id: 0, class: vreg_128_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
body: |
bb.0:
...
# GCN-LABEL: name: test_fmaak_sgpr_src0_f64
-# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
---
name: test_fmaak_sgpr_src0_f64
registers:
- { id: 0, class: sreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
...
# GCN-LABEL: name: test_fmaak_inlineimm_src0_f64
-# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
---
name: test_fmaak_inlineimm_src0_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
body: |
bb.0:
...
# GCN-LABEL: name: test_fmaak_otherimm_src0_f64
-# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec
---
name: test_fmaak_otherimm_src0_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
body: |
bb.0:
---
name: test_fmaak_other_constantlike_src0_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
stack:
- { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
callee-saved-register: '', local-offset: 0, debug-info-variable: '',
---
name: test_fmamk_reg_unfoldable_literal_src0_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
---
name: test_fmamk_reg_unfoldable_literal_src1_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
---
name: test_fmaak_reg_unfoldable_literal_src2_f64
registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: vreg_64 }
- - { id: 3, class: vreg_64 }
+ - { id: 0, class: vreg_64_align2 }
+ - { id: 1, class: vreg_64_align2 }
+ - { id: 2, class: vreg_64_align2 }
+ - { id: 3, class: vreg_64_align2 }
body: |
bb.0:
--- /dev/null
+# RUN: not --crash llc -march=amdgcn -mcpu=gfx90a -run-pass=machineverifier -o /dev/null %s 2>&1 | FileCheck %s
+
+# Implicit uses are OK.
+---
+name: implicit_use
+body: |
+ bb.0:
+ $vgpr1_vgpr2 = IMPLICIT_DEF
+ S_NOP 0, implicit $vgpr1_vgpr2
+ %0:vreg_64 = IMPLICIT_DEF
+ S_NOP 0, implicit %0
+
+ %1:sreg_64_xexec = IMPLICIT_DEF
+ %2:sreg_64_xexec = SI_CALL %1, 0, csr_amdgpu_highregs, implicit $vgpr1_vgpr2
+
+ ; noreg is OK
+ DS_WRITE_B64_gfx9 $noreg, $noreg, 0, 0, implicit $exec
+...
+
+# Unaligned registers are allowed to exist, just not as operands of tuple instructions (see the sketch after the next two tests).
+
+---
+name: copy_like_generic
+body: |
+ bb.0:
+ $vgpr1_vgpr2 = IMPLICIT_DEF
+ $vgpr3_vgpr4 = COPY $vgpr1_vgpr2
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vreg_64 = COPY %0
+...
+
+---
+name: mov_32_unaligned_super
+body: |
+ bb.0:
+ undef %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 undef %2.sub1:vreg_64, implicit $exec
+...
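
The two tests above pin down the scope of the restriction: unaligned tuples may
be defined, copied, and read one 32-bit lane at a time; only instructions that
encode a multi-register operand are constrained. A hedged sketch of that scoping
decision follows; requiresAlignedTuple is an invented name, and LLVM's real
check is a machine verifier rule rather than an opcode list like this one.

#include <cassert>
#include <string>

// Illustrative: copy-like and per-lane operations do not encode a VGPR
// tuple in the instruction itself, so the even-alignment rule does not
// apply to them; wide memory operations do encode one.
static bool requiresAlignedTuple(const std::string &Opcode) {
  if (Opcode == "COPY" || Opcode == "IMPLICIT_DEF" ||
      Opcode == "V_MOV_B32_e32")
    return false; // exempt, as in the copy and mov tests above
  return true;    // e.g. GLOBAL_STORE_DWORDX2, DS_WRITE_B64_gfx9
}

int main() {
  assert(!requiresAlignedTuple("COPY"));
  assert(requiresAlignedTuple("GLOBAL_STORE_DWORDX2"));
  return 0;
}
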
+
+# Well-aligned subregister indexes are OK (see the model following this test).
+---
+name: aligned_sub_reg
+body: |
+ bb.0:
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_128_align2 = IMPLICIT_DEF
+ GLOBAL_STORE_DWORDX2 %0, %1.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %1.sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+...
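
Subregister indexes follow the same arithmetic: with an even-aligned base, a
two-register slice is itself even-aligned exactly when its starting lane is
even, so sub0_sub1 and sub2_sub3 pass here while sub1_sub2 is diagnosed in the
next test. A small model under the same caveats as the sketches above:

#include <cassert>

// Illustrative: StartLane is the lane a subregister index selects relative
// to an even-aligned tuple base (sub0_sub1 -> 0, sub2_sub3 -> 2,
// sub1_sub2 -> 1).
static bool isAlignedSlice(unsigned StartLane, unsigned NumLanes) {
  return NumLanes < 2 || StartLane % 2 == 0;
}

int main() {
  assert(isAlignedSlice(0, 2));  // sub0_sub1: accepted above
  assert(isAlignedSlice(2, 2));  // sub2_sub3: accepted above
  assert(!isAlignedSlice(1, 2)); // sub1_sub2: rejected in the next test
  return 0;
}
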
+
+---
+name: unaligned_registers
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5_vgpr6
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:vreg_96 = IMPLICIT_DEF
+ %3:vreg_128 = IMPLICIT_DEF
+ %4:areg_64 = IMPLICIT_DEF
+ %5:vreg_128_align2 = IMPLICIT_DEF
+
+ ; Check virtual register uses
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ GLOBAL_STORE_DWORDX2 %0, %1, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %3, 0, 0, 0, 0, 0, implicit $exec
+
+ ; Check virtual registers with subregisters
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ GLOBAL_STORE_DWORDX2 %0, %3.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %3.sub2_sub3, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %3.sub1_sub2, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %5.sub1_sub2, 0, 0, 0, 0, 0, implicit $exec
+
+ ; Check physical register uses
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr3_vgpr4_vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec
+
+ ; Check virtual register defs
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+ %7:vreg_96 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+ %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ $vgpr1_vgpr2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1_vgpr2_vgpr3_vgpr4 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec
+
+ ; Check AGPRs
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ ; CHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:areg_64 = IMPLICIT_DEF
+ %11:areg_128_align2 = IMPLICIT_DEF
+ DS_WRITE_B64_gfx9 %9, %10, 0, 0, implicit $exec
+ DS_WRITE_B64_gfx9 %9, %11.sub1_sub2, 0, 0, implicit $exec
+...
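
For reference, the diagnostics checked throughout this test come from the
machineverifier pass named in the RUN line. The sketch below reproduces the
shape of that check with stand-in types; this is not the MachineVerifier API,
and the operand modeling is deliberately simplified.

#include <cstdio>
#include <vector>

// Stand-in operand record: the first 32-bit lane a (sub)register tuple
// occupies, its width in lanes, and whether the operand is implicit.
struct OperandModel {
  unsigned StartLane;
  unsigned NumLanes;
  bool IsImplicit;
};

// Prints the same message the CHECK lines above expect whenever a tuple
// operand starts on an odd lane. Implicit operands and single registers
// are skipped, matching the implicit_use test earlier in this file.
static void verifyTupleOperands(const std::vector<OperandModel> &Ops) {
  for (const OperandModel &Op : Ops) {
    if (Op.IsImplicit || Op.NumLanes < 2)
      continue;
    if (Op.StartLane % 2 != 0)
      std::puts("*** Bad machine code: "
                "Subtarget requires even aligned vector registers ***");
  }
}

int main() {
  verifyTupleOperands({{3, 2, false}}); // $vgpr3_vgpr4: diagnosed
  verifyTupleOperands({{1, 2, true}});  // implicit $vgpr1_vgpr2: skipped
  verifyTupleOperands({{2, 4, false}}); // an even 128-bit tuple: accepted
  return 0;
}
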
+
+# FIXME: Inline asm is not verified
+# ; Check inline asm
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# ; XCHECK: *** Bad machine code: Subtarget requires even aligned vector registers ***
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, $vgpr1_vgpr2
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, %4
+# INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 9 /* reguse */, %5.sub1_sub2