MachinePointerInfo &MPO,
ISD::ArgFlagsTy Flags) = 0;
+ /// Return the in-memory size to write for the argument at \p VA. This may
+ /// be smaller than the allocated stack slot size.
+ ///
+ /// This is overridable primarily for targets to maintain compatibility with
+ /// hacks around the existing DAG call lowering infrastructure.
+ virtual uint64_t getStackValueStoreSize(const CCValAssign &VA) const;
+
/// The specified value has been assigned to a physical register,
/// handle the appropriate COPY (either to or from) and mark any
/// relevant uses/defines as needed.
Register extendRegister(Register ValReg, CCValAssign &VA,
unsigned MaxSizeBits = 0);
- virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ /// Wrap call to (typically tablegenerated CCAssignFn). This may be
+ /// overridden to track additional state information as arguments are
+ /// assigned or apply target specific hacks around the legacy
+ /// infrastructure.
+ virtual bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, const ArgInfo &Info,
ISD::ArgFlagsTy Flags, CCState &State) {
return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
/// value registers of type \p LLTy, and \p Regs contains the legalized pieces
/// with type \p PartLLT. This is used for incoming values (physregs to vregs).
static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
- ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
+ ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT,
+ const ISD::ArgFlagsTy Flags) {
MachineRegisterInfo &MRI = *B.getMRI();
- // We could just insert a regular copy, but this is unreachable at the moment.
- assert(LLTy != PartLLT && "identical part types shouldn't reach here");
+ if (PartLLT == LLTy) {
+ // We should have avoided introducing a new virtual register, and just
+ // directly assigned here.
+ assert(OrigRegs[0] == Regs[0]);
+ return;
+ }
+
+ if (PartLLT.getSizeInBits() == LLTy.getSizeInBits() && OrigRegs.size() == 1 &&
+ Regs.size() == 1) {
+ B.buildBitcast(OrigRegs[0], Regs[0]);
+ return;
+ }
if (PartLLT.isVector() == LLTy.isVector() &&
- PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits()) {
- assert(OrigRegs.size() == 1 && Regs.size() == 1);
- B.buildTrunc(OrigRegs[0], Regs[0]);
+ PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits() &&
+ OrigRegs.size() == 1 && Regs.size() == 1) {
+ Register SrcReg = Regs[0];
+
+ LLT LocTy = MRI.getType(SrcReg);
+
+ if (Flags.isSExt()) {
+ SrcReg = B.buildAssertSExt(LocTy, SrcReg,
+ LLTy.getScalarSizeInBits()).getReg(0);
+ } else if (Flags.isZExt()) {
+ SrcReg = B.buildAssertZExt(LocTy, SrcReg,
+ LLTy.getScalarSizeInBits()).getReg(0);
+ }
+
+ B.buildTrunc(OrigRegs[0], SrcReg);
return;
}
}
if (PartLLT.isVector()) {
- assert(OrigRegs.size() == 1 &&
- LLTy.getScalarType() == PartLLT.getElementType());
- mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
+ assert(OrigRegs.size() == 1);
+
+ if (LLTy.getScalarType() == PartLLT.getElementType()) {
+ mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
+ } else {
+ SmallVector<Register> CastRegs(Regs.size());
+ unsigned I = 0;
+ LLT GCDTy = getGCDType(LLTy, PartLLT);
+
+ // We are both splitting a vector, and bitcasting its element types. Cast
+ // the source pieces into the appropriate number of pieces with the result
+ // element type.
+ for (Register SrcReg : Regs)
+ CastRegs[I++] = B.buildBitcast(GCDTy, SrcReg).getReg(0);
+ mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs);
+ }
+
return;
}
unsigned NumArgs = Args.size();
for (unsigned i = 0; i != NumArgs; ++i) {
EVT CurVT = EVT::getEVT(Args[i].Ty);
- if (CurVT.isSimple() &&
- !Handler.assignArg(i, CurVT.getSimpleVT(), CurVT.getSimpleVT(),
- CCValAssign::Full, Args[i], Args[i].Flags[0],
- CCInfo))
- continue;
MVT NewVT = TLI->getRegisterTypeForCallingConv(
- F.getContext(), CCInfo.getCallingConv(), EVT(CurVT));
+ F.getContext(), CCInfo.getCallingConv(), CurVT);
// If we need to split the type over multiple regs, check it's a scenario
// we currently support.
if (NumParts == 1) {
// Try to use the register type if we couldn't assign the VT.
- if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i],
+ if (Handler.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
Args[i].Flags[0], CCInfo))
return false;
continue;
}
Args[i].Flags.push_back(Flags);
- if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i],
+ if (Handler.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
Args[i].Flags[Part], CCInfo)) {
// Still couldn't assign this smaller part type for some reason.
return false;
continue;
}
- const EVT VAVT = VA.getValVT();
- const LLT NewLLT(VAVT.getSimpleVT());
+ const MVT ValVT = VA.getValVT();
+ const MVT LocVT = VA.getLocVT();
+
+ const LLT LocTy(LocVT);
+ const LLT ValTy(ValVT);
+ const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : ValTy;
+ const EVT OrigVT = EVT::getEVT(Args[i].Ty);
const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
// Expected to be multiple regs for a single incoming arg.
Args[i].Regs[Part] = MRI.createGenericVirtualRegister(NewLLT);
}
- const LLT VATy(VAVT.getSimpleVT());
-
assert((j + (NumParts - 1)) < ArgLocs.size() &&
"Too many regs for number of args");
// Coerce into outgoing value types before register assignment.
- if (!Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
+ if (!Handler.isIncomingArgumentHandler() && OrigTy != ValTy) {
assert(Args[i].OrigRegs.size() == 1);
buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
- VATy, extendOpFromFlags(Args[i].Flags[0]));
+ ValTy, extendOpFromFlags(Args[i].Flags[0]));
}
for (unsigned Part = 0; Part < NumParts; ++Part) {
// Individual pieces may have been spilled to the stack and others
// passed in registers.
- // FIXME: Use correct address space for pointer size
- EVT LocVT = VA.getValVT();
- unsigned MemSize = LocVT == MVT::iPTR ? DL.getPointerSize()
- : LocVT.getStoreSize();
- unsigned Offset = VA.getLocMemOffset();
+ // TODO: The memory size may be larger than the value we need to
+ // store. We may need to adjust the offset for big endian targets.
+ uint64_t MemSize = Handler.getStackValueStoreSize(VA);
+
MachinePointerInfo MPO;
Register StackAddr =
- Handler.getStackAddress(MemSize, Offset, MPO, Flags);
+ Handler.getStackAddress(MemSize, VA.getLocMemOffset(), MPO, Flags);
+
Handler.assignValueToAddress(Args[i], Part, StackAddr, MemSize, MPO,
VA);
continue;
if (i == 0 && ThisReturnReg.isValid() &&
Handler.isIncomingArgumentHandler() &&
- isTypeIsValidForThisReturn(VAVT)) {
+ isTypeIsValidForThisReturn(ValVT)) {
Handler.assignValueToReg(Args[i].Regs[i], ThisReturnReg, VA);
continue;
}
// Now that all pieces have been assigned, re-pack the register typed values
// into the original value typed registers.
- if (Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
+ if (Handler.isIncomingArgumentHandler() && OrigVT != LocVT) {
// Merge the split registers into the expected larger result vregs of
// the original call.
buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
- VATy);
+ LocTy, Args[i].Flags[0]);
}
j += NumParts - 1;
return true;
}
+uint64_t CallLowering::ValueHandler::getStackValueStoreSize(
+ const CCValAssign &VA) const {
+ const EVT ValVT = VA.getValVT();
+ if (ValVT != MVT::iPTR)
+ return ValVT.getStoreSize();
+
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+
+ /// FIXME: We need to get the correct pointer address space.
+ return DL.getPointerSize();
+}
+
void CallLowering::ValueHandler::copyArgumentMemory(
const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
const MachinePointerInfo &DstPtrInfo, Align DstAlign,
CCValAssign &VA,
unsigned MaxSizeBits) {
LLT LocTy{VA.getLocVT()};
- LLT ValTy = MRI.getType(ValReg);
+ LLT ValTy{VA.getValVT()};
+
if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
return ValReg;
}
}
+/// Check if we can use a basic COPY instruction between the two types.
+///
+/// We're currently building on top of the infrastructure using MVT, which loses
+/// pointer information in the CCValAssign. We accept copies from physical
+/// registers that have been reported as integers if it's to an equivalent sized
+/// pointer LLT.
+static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
+ if (SrcTy == DstTy)
+ return true;
+
+ if (SrcTy.getSizeInBits() != DstTy.getSizeInBits())
+ return false;
+
+ SrcTy = SrcTy.getScalarType();
+ DstTy = DstTy.getScalarType();
+
+ return (SrcTy.isPointer() && DstTy.isScalar()) ||
+ (DstTy.isScalar() && SrcTy.isPointer());
+}
+
void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg,
Register PhysReg,
CCValAssign &VA) {
- const LLT LocTy(VA.getLocVT());
- const LLT ValTy = MRI.getType(ValVReg);
+ const MVT LocVT = VA.getLocVT();
+ const LLT LocTy(LocVT);
+ const LLT RegTy = MRI.getType(ValVReg);
- if (ValTy.getSizeInBits() == LocTy.getSizeInBits()) {
+ if (isCopyCompatibleType(RegTy, LocTy)) {
MIRBuilder.buildCopy(ValVReg, PhysReg);
return;
}
auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg);
- auto Hint = buildExtensionHint(VA, Copy.getReg(0), ValTy);
+ auto Hint = buildExtensionHint(VA, Copy.getReg(0), RegTy);
MIRBuilder.buildTrunc(ValVReg, Hint);
}
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
: CallLowering(&TLI) {}
+static void applyStackPassedSmallTypeDAGHack(EVT OrigVT, MVT &ValVT,
+ MVT &LocVT) {
+ // If ValVT is i1/i8/i16, we should set LocVT to i8/i8/i16. This is a legacy
+ // hack because the DAG calls the assignment function with pre-legalized
+ // register typed values, not the raw type.
+ //
+ // This hack is not applied to return values which are not passed on the
+ // stack.
+ if (OrigVT == MVT::i1 || OrigVT == MVT::i8)
+ ValVT = LocVT = MVT::i8;
+ else if (OrigVT == MVT::i16)
+ ValVT = LocVT = MVT::i16;
+}
+
+// Account for i1/i8/i16 stack passed value hack
+static uint64_t getStackValueStoreSizeHack(const CCValAssign &VA) {
+ const MVT ValVT = VA.getValVT();
+ return (ValVT == MVT::i8 || ValVT == MVT::i16) ? ValVT.getStoreSize()
+ : VA.getLocVT().getStoreSize();
+}
+
namespace {
struct IncomingArgHandler : public CallLowering::IncomingValueHandler {
IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
return AddrReg.getReg(0);
}
+ uint64_t getStackValueStoreSize(const CCValAssign &VA) const override {
+ return getStackValueStoreSizeHack(VA);
+ }
+
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
MachineFunction &MF = MIRBuilder.getMF();
// The reported memory location may be wider than the value.
- const LLT RegTy = MRI.getType(ValVReg);
- MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
+ const LLT RealRegTy = MRI.getType(ValVReg);
+ LLT ValTy(VA.getValVT());
+ LLT LocTy(VA.getLocVT());
+
+ // Fixup the types for the DAG compatibility hack.
+ if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16)
+ std::swap(ValTy, LocTy);
+
+ MemSize = LocTy.getSizeInBytes();
auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
MemSize, inferAlignFromPtrInfo(MF, MPO));
- const LLT LocVT = LLT{VA.getLocVT()};
-
- if (RegTy.getScalarSizeInBits() < LocVT.getScalarSizeInBits()) {
- auto LocInfo = VA.getLocInfo();
- if (LocInfo == CCValAssign::LocInfo::ZExt) {
- // We know the parameter is zero-extended. Perform a load into LocVT,
- // and use G_ASSERT_ZEXT to communicate that this was zero-extended from
- // the parameter type. Move down to the parameter type using G_TRUNC.
- MIRBuilder.buildTrunc(
- ValVReg, MIRBuilder.buildAssertZExt(
- LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO),
- RegTy.getScalarSizeInBits()));
- return;
- }
- if (LocInfo == CCValAssign::LocInfo::SExt) {
- // Same as the ZExt case, but use G_ASSERT_SEXT instead.
- MIRBuilder.buildTrunc(
- ValVReg, MIRBuilder.buildAssertSExt(
- LocVT, MIRBuilder.buildLoad(LocVT, Addr, *MMO),
- RegTy.getScalarSizeInBits()));
- return;
- }
+ if (RealRegTy.getSizeInBits() == ValTy.getSizeInBits()) {
+ // No extension information, or no extension necessary. Load into the
+ // incoming parameter type directly.
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ } else {
+ auto Tmp = MIRBuilder.buildLoad(LocTy, Addr, *MMO);
+ MIRBuilder.buildTrunc(ValVReg, Tmp);
}
+ }
- // No extension information, or no extension necessary. Load into the
- // incoming parameter type directly.
- MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
+ return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
}
/// How the physical register gets marked varies between formal
struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
MachineInstrBuilder MIB, CCAssignFn *AssignFn,
- CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
- int FPDiff = 0)
+ CCAssignFn *AssignFnVarArg, bool IsReturn,
+ bool IsTailCall = false, int FPDiff = 0)
: OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
- AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
- StackSize(0), SPReg(0),
+ AssignFnVarArg(AssignFnVarArg), IsReturn(IsReturn),
+ IsTailCall(IsTailCall), FPDiff(FPDiff), StackSize(0), SPReg(0),
Subtarget(MIRBuilder.getMF().getSubtarget<AArch64Subtarget>()) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
return AddrReg.getReg(0);
}
+ /// We need to fixup the reported store size for certain value types because
+ /// we invert the interpretation of ValVT and LocVT in certain cases. This is
+ /// for compatability with the DAG call lowering implementation, which we're
+ /// currently building on top of.
+ uint64_t getStackValueStoreSize(const CCValAssign &VA) const override {
+ return getStackValueStoreSizeHack(VA);
+ }
+
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
}
void assignValueToAddress(const CallLowering::ArgInfo &Arg, unsigned RegIndex,
- Register Addr, uint64_t Size,
+ Register Addr, uint64_t MemSize,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- unsigned MaxSize = Size * 8;
+ unsigned MaxSize = MemSize * 8;
// For varargs, we always want to extend them to 8 bytes, in which case
// we disable setting a max.
if (!Arg.IsFixed)
MaxSize = 0;
- Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
- ? extendRegister(Arg.Regs[RegIndex], VA, MaxSize)
- : Arg.Regs[0];
+ Register ValVReg = Arg.Regs[RegIndex];
+ if (VA.getLocInfo() != CCValAssign::LocInfo::FPExt) {
+ MVT LocVT = VA.getLocVT();
+ MVT ValVT = VA.getValVT();
+
+ if (VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) {
+ std::swap(ValVT, LocVT);
+ MemSize = VA.getValVT().getStoreSize();
+ }
+
+ ValVReg = extendRegister(ValVReg, VA, MaxSize);
+ const LLT RegTy = MRI.getType(ValVReg);
- // If we extended we might need to adjust the MMO's Size.
- const LLT RegTy = MRI.getType(ValVReg);
- if (RegTy.getSizeInBytes() > Size)
- Size = RegTy.getSizeInBytes();
+ if (RegTy.getSizeInBits() < LocVT.getSizeInBits())
+ ValVReg = MIRBuilder.buildTrunc(RegTy, ValVReg).getReg(0);
+ } else {
+ // The store does not cover the full allocated stack slot.
+ MemSize = VA.getValVT().getStoreSize();
+ }
- assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
+ assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info,
- ISD::ArgFlagsTy Flags,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
- bool Res;
bool IsCalleeWin = Subtarget.isCallingConvWin64(State.getCallingConv());
bool UseVarArgsCCForFixed = IsCalleeWin && State.isVarArg();
+
+ if (!State.isVarArg() && !UseVarArgsCCForFixed && !IsReturn)
+ applyStackPassedSmallTypeDAGHack(OrigVT, ValVT, LocVT);
+
+ bool Res;
if (Info.IsFixed && !UseVarArgsCCForFixed)
Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
else
MachineInstrBuilder MIB;
CCAssignFn *AssignFnVarArg;
+
+ /// Track if this is used for a return instead of function argument
+ /// passing. We apply a hack to i1/i8/i16 stack passed values, but do not use
+ /// stack passed returns for them and cannot apply the type adjustment.
+ bool IsReturn;
bool IsTailCall;
/// For tail calls, the byte offset of the call's argument area from the
splitToValueTypes(CurArgInfo, SplitArgs, DL, CC);
}
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn,
+ /*IsReturn*/ true);
Success =
handleAssignments(MIRBuilder, SplitArgs, Handler, CC, F.isVarArg());
}
// Do the actual argument marshalling.
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, true, FPDiff);
+ AssignFnVarArg, /*IsReturn*/ false,
+ /*IsTailCall*/ true, FPDiff);
if (!handleAssignments(MIRBuilder, OutArgs, Handler, CalleeCC, Info.IsVarArg))
return false;
// Do the actual argument marshalling.
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
- AssignFnVarArg, false);
+ AssignFnVarArg, /*IsReturn*/ false);
if (!handleAssignments(MIRBuilder, OutArgs, Handler, Info.CallConv,
Info.IsVarArg))
return false;
MIB.addUse(PhysReg, RegState::Implicit);
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info,
- ISD::ArgFlagsTy Flags,
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
}
return 1;
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
- bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
CCState &State) override {
; CHECK: $w0 = COPY [[C]](s32)
; CHECK: $d0 = COPY [[C1]](s64)
; CHECK: $x1 = COPY [[C2]](s64)
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C3]](s8)
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C8]](s64)
- ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s8)
- ; CHECK: G_STORE [[ANYEXT]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1)
+ ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT]](s32)
+ ; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD]](p0) :: (store 8 into stack, align 1)
+ ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[C4]](s16)
; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C9]](s64)
- ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s16)
- ; CHECK: G_STORE [[ANYEXT1]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1)
+ ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[ANYEXT2]](s32)
+ ; CHECK: G_STORE [[ANYEXT3]](s64), [[PTR_ADD1]](p0) :: (store 8 into stack + 8, align 1)
; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C10]](s64)
- ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
- ; CHECK: G_STORE [[ANYEXT2]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1)
+ ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C5]](s32)
+ ; CHECK: G_STORE [[ANYEXT4]](s64), [[PTR_ADD2]](p0) :: (store 8 into stack + 16, align 1)
; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C11]](s64)
; CHECK: G_STORE [[C6]](s32), [[PTR_ADD3]](p0) :: (store 4 into stack + 24, align 1)
ret i32 %conv
}
+define void @arg_v2i64(<2 x i64> %arg) {
+ ; CHECK-LABEL: name: arg_v2i64
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $q0
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: G_STORE [[COPY]](<2 x s64>), [[DEF]](p0) :: (store 16 into `<2 x i64>* undef`)
+ ; CHECK: RET_ReallyLR
+ store <2 x i64> %arg, <2 x i64>* undef
+ ret void
+}
+
+define void @arg_v8i64(<8 x i64> %arg) {
+ ; CHECK-LABEL: name: arg_v8i64
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $q0, $q1, $q2, $q3
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+ ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+ ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s64>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>), [[COPY3]](<2 x s64>)
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: G_STORE [[CONCAT_VECTORS]](<8 x s64>), [[DEF]](p0) :: (store 64 into `<8 x i64>* undef`)
+ ; CHECK: RET_ReallyLR
+ store <8 x i64> %arg, <8 x i64>* undef
+ ret void
+}
+
+define void @arg_v4f32(<4 x float> %arg) {
+ ; CHECK-LABEL: name: arg_v4f32
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $q0
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: G_STORE [[BITCAST]](<4 x s32>), [[DEF]](p0) :: (store 16 into `<4 x float>* undef`)
+ ; CHECK: RET_ReallyLR
+ store <4 x float> %arg, <4 x float>* undef
+ ret void
+}
+
+define void @ret_arg_v16f32(<16 x float> %arg) {
+ ; CHECK-LABEL: name: ret_arg_v16f32
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $q0, $q1, $q2, $q3
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+ ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+ ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+ ; CHECK: [[BITCAST2:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY2]](<2 x s64>)
+ ; CHECK: [[BITCAST3:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY3]](<2 x s64>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[BITCAST1]](<4 x s32>), [[BITCAST2]](<4 x s32>), [[BITCAST3]](<4 x s32>)
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: G_STORE [[CONCAT_VECTORS]](<16 x s32>), [[DEF]](p0) :: (store 64 into `<16 x float>* undef`)
+ ; CHECK: RET_ReallyLR
+ store <16 x float> %arg, <16 x float>* undef
+ ret void
+}
define void @test_i1_arg_zext(void (i1)* %f) {
; CHECK-LABEL: name: test_i1_arg_zext
; CHECK: [[I1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[I1]](s1)
-; CHECK: $w0 = COPY [[ZEXT]](s32)
+; CHECK: [[ZEXT0:%[0-9]+]]:_(s8) = G_ZEXT [[I1]](s1)
+; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT0]](s8)
+; CHECK: $w0 = COPY [[ZEXT1]](s32)
call void %f(i1 true)
ret void
}
; CHECK-NEXT: - { id: [[SLOT:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16, stack-id: default,
; CHECK-NEXT: isImmutable: true,
; CHECK: [[ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[SLOT]]
-; CHECK: {{%[0-9]+}}:_(s1) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16)
+; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[SLOT]], align 16)
+; CHECK-NEXT: {{%[0-9]+}}:_(s1) = G_TRUNC [[LOAD]]
define void @test_mem_i1([8 x i64], i1 %in) {
ret void
}
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q1, $s0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
- ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[COPY1]](<4 x s32>)
+ ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+ ; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q1, $s0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
- ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[COPY1]](<4 x s32>)
+ ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+ ; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[BITCAST]](<4 x s32>)
; CHECK: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]]
; CHECK: $s0 = COPY [[FADD]](s32)
; CHECK: RET_ReallyLR implicit $s0
; CHECK-LABEL: name: fmax
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
- ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
- ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[COPY]](<4 x s32>)
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMAX]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec)
; CHECK-LABEL: name: fmin
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
- ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
- ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
; CHECK-LABEL: name: fmin_nnan
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $q0
- ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
- ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>)
; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32)
; CHECK: RET_ReallyLR implicit $s0
%res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s0
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16:
define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s0, s2
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_and_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_commute:
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_use:
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s4, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s1
-; GFX6-NEXT: s_and_b32 s1, s3, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s3, s4, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_lshl_b32 s3, s7, 16
+; GFX6-NEXT: s_and_b32 s1, s6, s1
+; GFX6-NEXT: s_or_b32 s1, s3, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_and_b32 s0, s0, s1
+; GFX6-NEXT: s_and_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
}
define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
-; GCN-LABEL: v_andn2_v2i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
-; GCN-NEXT: v_and_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_andn2_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_andn2_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_andn2_v2i16:
; GFX10: ; %bb.0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0
-; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 16
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, v3, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16:
; GFX6-LABEL: v_ashr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ashr_v2i16_15:
; GFX6-LABEL: s_ashr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_and_b32 s1, s1, s4
+; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_ashr_i32 s0, s0, s1
-; GFX6-NEXT: s_sext_i32_i16 s1, s2
-; GFX6-NEXT: s_ashr_i32 s1, s1, s3
+; GFX6-NEXT: s_ashr_i32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s4
+; GFX6-NEXT: s_sext_i32_i16 s1, s1
+; GFX6-NEXT: s_ashr_i32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-LABEL: ashr_v2i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: s_sext_i32_i16 s0, s1
; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: ashr_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX7-LABEL: s_bswap_v2i16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_and_b32 s3, s0, 0xffff
+; GFX7-NEXT: s_mov_b32 s3, 0xffff
; GFX7-NEXT: s_lshl_b32 s2, s0, 8
-; GFX7-NEXT: s_lshl_b32 s1, s1, 8
-; GFX7-NEXT: s_lshr_b32 s0, s0, 24
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_lshr_b32 s3, s3, 8
+; GFX7-NEXT: s_and_b32 s0, s0, s3
+; GFX7-NEXT: s_lshr_b32 s0, s0, 8
+; GFX7-NEXT: s_or_b32 s0, s0, s2
+; GFX7-NEXT: s_lshl_b32 s2, s1, 8
+; GFX7-NEXT: s_and_b32 s1, s1, s3
+; GFX7-NEXT: s_lshr_b32 s1, s1, 8
+; GFX7-NEXT: s_or_b32 s1, s1, s2
+; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX7-NEXT: s_or_b32 s2, s3, s2
-; GFX7-NEXT: s_bfe_u32 s1, s2, 0x100000
-; GFX7-NEXT: s_lshl_b32 s0, s0, 16
-; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_v2i16:
; GFX7-LABEL: v_bswap_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_v2i16:
define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
; CHECK-LABEL: name: halfinsts_add_v2i16
; CHECK: bb.1 (%ir-block.0):
- ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
- ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
- ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
- ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
- ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
- ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
- ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]]
- ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
- ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]]
- ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
- ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]]
- ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
- ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
- ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
- ; CHECK: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
- ; CHECK: S_SETPC_B64_return [[COPY8]], implicit $vgpr0
+ ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]]
+ ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
+ ; CHECK: $vgpr0 = COPY [[COPY7]](s32)
+ ; CHECK: $vgpr1 = COPY [[COPY8]](s32)
+ ; CHECK: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+ ; CHECK: S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1
%add = add <2 x i16> %arg0, %arg0
ret <2 x i16> %add
}
; GFX6-IEEE-LABEL: v_fdiv_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16:
; GFX6-LABEL: v_fdiv_v2f16_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_ulp25:
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16:
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp:
; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s6
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v0, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v4
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_ulp25:
; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v2, v4, v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-IEEE-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v2
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v2, v4, v3, v2
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v1, v0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_bfe_u32 v1, v2, 0, 16
-; GFX6-FLUSH-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX6-LABEL: v_fma_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
+; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16:
; GFX6-LABEL: v_fma_v2f16_fneg_lhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
+; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
; GFX6-LABEL: v_fma_v2f16_fneg_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
+; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v6
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v6
; GFX6-NEXT: s_mov_b32 s4, 0x80008000
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5
+; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:
; GFX6-LABEL: v_pow_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_log_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_log_f32_e32 v2, v2
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_log_f32_e32 v0, v0
+; GFX6-NEXT: v_log_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16:
; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_log_f32_e32 v0, v0
+; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_log_f32_e32 v2, v2
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
+; GFX6-NEXT: v_log_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
+; GFX6-NEXT: v_exp_f32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_log_f32_e32 v2, v2
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_log_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v2, v3
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_mov_b32 s4, 0x80008000
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s5, s2, 15
-; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX6-NEXT: s_lshr_b32 s3, s0, 16
-; GFX6-NEXT: s_lshr_b32 s4, s2, 16
-; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_lshl_b32 s0, s0, s5
-; GFX6-NEXT: s_and_b32 s5, s1, 0xffff
-; GFX6-NEXT: s_lshr_b32 s5, s5, 1
-; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT: s_lshr_b32 s2, s5, s2
+; GFX6-NEXT: s_and_b32 s6, s4, 15
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s6
+; GFX6-NEXT: s_mov_b32 s6, 0xffff
+; GFX6-NEXT: s_andn2_b32 s4, 15, s4
+; GFX6-NEXT: s_and_b32 s2, s2, s6
+; GFX6-NEXT: s_lshr_b32 s2, s2, 1
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_or_b32 s0, s0, s2
-; GFX6-NEXT: s_and_b32 s2, s4, 15
+; GFX6-NEXT: s_and_b32 s2, s5, 15
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_lshl_b32 s2, s3, s2
-; GFX6-NEXT: s_lshr_b32 s1, s1, 17
+; GFX6-NEXT: s_lshl_b32 s1, s1, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s6
+; GFX6-NEXT: s_andn2_b32 s4, 15, s5
+; GFX6-NEXT: s_lshr_b32 s2, s2, 1
; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000
-; GFX6-NEXT: s_lshr_b32 s1, s1, s3
-; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_lshr_b32 s2, s2, s3
+; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-LABEL: v_fshl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 15, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5
-; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16:
; GFX6-LABEL: v_fshl_v2i16_4_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3
-; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: s_bfe_u32 s5, 11, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i16_4_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT: s_lshr_b32 s0, s0, 1
+; GFX6-NEXT: s_mov_b32 s0, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s0
+; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT: s_lshr_b32 s2, s2, 1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX6-NEXT: s_and_b32 s0, s3, s0
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: s_lshr_b32 s0, s1, 17
+; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshl_v2i16_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX6-NEXT: s_and_b32 s4, s1, 15
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_andn2_b32 s1, 15, s1
+; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_lshl_b32 s0, s0, s4
-; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: s_andn2_b32 s2, 15, s2
+; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
-; GFX6-NEXT: s_andn2_b32 s1, 15, s3
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 17, v0
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshl_b32 s0, s2, s0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: s_andn2_b32 s2, 15, s3
+; GFX6-NEXT: s_lshl_b32 s0, s1, s0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
+; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_v2i16_svs:
define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshl_v2i16_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s3, s1, 15
-; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s2, s1, 16
-; GFX6-NEXT: s_andn2_b32 s1, 15, s1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0
-; GFX6-NEXT: s_and_b32 s3, s0, 0xffff
-; GFX6-NEXT: s_lshr_b32 s3, s3, 1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshr_b32 s1, s3, s1
-; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX6-NEXT: s_and_b32 s1, s2, 15
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_and_b32 s4, s2, 15
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1
-; GFX6-NEXT: s_lshr_b32 s0, s0, 17
+; GFX6-NEXT: s_and_b32 s0, s0, s4
+; GFX6-NEXT: s_lshr_b32 s0, s0, 1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s3, 15
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
+; GFX6-NEXT: s_and_b32 s0, s1, s4
+; GFX6-NEXT: s_andn2_b32 s2, 15, s3
+; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s5, 0xffff
-; GFX6-NEXT: s_lshr_b32 s3, s0, 16
-; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT: s_and_b32 s6, s1, s5
-; GFX6-NEXT: s_lshl_b32 s0, s0, s4
-; GFX6-NEXT: s_lshl_b32 s3, s3, s4
-; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000
-; GFX6-NEXT: s_lshr_b32 s4, s1, 17
-; GFX6-NEXT: s_lshr_b32 s6, s6, 1
-; GFX6-NEXT: s_lshr_b32 s4, s4, s7
-; GFX6-NEXT: s_lshr_b32 s6, s6, s7
-; GFX6-NEXT: s_or_b32 s3, s3, s4
-; GFX6-NEXT: s_lshr_b32 s4, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
-; GFX6-NEXT: s_xor_b32 s2, s2, -1
-; GFX6-NEXT: s_and_b32 s7, s2, 15
-; GFX6-NEXT: s_and_b32 s1, s1, s5
-; GFX6-NEXT: s_or_b32 s0, s0, s6
-; GFX6-NEXT: s_lshr_b32 s6, s2, 16
-; GFX6-NEXT: s_andn2_b32 s2, 15, s2
+; GFX6-NEXT: s_mov_b32 s6, 0xffff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_and_b32 s4, s4, s6
+; GFX6-NEXT: s_or_b32 s4, s5, s4
+; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000
+; GFX6-NEXT: s_and_b32 s7, s2, s6
+; GFX6-NEXT: s_lshl_b32 s0, s0, s5
+; GFX6-NEXT: s_lshl_b32 s1, s1, s5
+; GFX6-NEXT: s_and_b32 s5, s3, s6
+; GFX6-NEXT: s_lshr_b32 s7, s7, 1
+; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000
+; GFX6-NEXT: s_lshr_b32 s5, s5, 1
+; GFX6-NEXT: s_lshl_b32 s2, s2, 1
+; GFX6-NEXT: s_lshr_b32 s7, s7, s8
+; GFX6-NEXT: s_lshr_b32 s5, s5, s8
+; GFX6-NEXT: s_xor_b32 s4, s4, -1
+; GFX6-NEXT: s_and_b32 s2, s2, s6
+; GFX6-NEXT: s_or_b32 s0, s0, s7
+; GFX6-NEXT: s_and_b32 s7, s4, 15
+; GFX6-NEXT: s_or_b32 s1, s1, s5
+; GFX6-NEXT: s_lshr_b32 s5, s4, 16
+; GFX6-NEXT: s_andn2_b32 s4, 15, s4
; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000
-; GFX6-NEXT: s_lshr_b32 s1, s1, 1
-; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_lshr_b32 s2, s2, 1
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: s_lshr_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s0, s0, s7
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_and_b32 s1, s6, 15
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshl_b32 s4, s4, 1
-; GFX6-NEXT: s_andn2_b32 s2, 15, s6
-; GFX6-NEXT: s_lshl_b32 s1, s3, s1
-; GFX6-NEXT: s_and_b32 s3, s4, s5
-; GFX6-NEXT: s_lshr_b32 s3, s3, 1
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s5, 15
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX6-NEXT: s_lshr_b32 s2, s3, s2
+; GFX6-NEXT: s_lshl_b32 s3, s3, 1
+; GFX6-NEXT: s_lshl_b32 s1, s1, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s6
+; GFX6-NEXT: s_andn2_b32 s4, 15, s5
+; GFX6-NEXT: s_lshr_b32 s2, s2, 1
+; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000
+; GFX6-NEXT: s_lshr_b32 s2, s2, s3
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX6-LABEL: v_fshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: s_mov_b32 s5, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v4, s5, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v5, s5, v2
; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5
; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 17, v1
-; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 15, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, s6, v5
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX6-NEXT: v_and_b32_e32 v5, s5, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, s6, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT: v_and_b32_e32 v2, s5, v2
; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 15, v5
-; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
-; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX6-NEXT: v_and_b32_e32 v3, s5, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s5, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i16:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_sub_i32 s4, 0, 4
; GFX6-NEXT: s_and_b32 s6, s4, 15
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: s_mov_b32 s6, 0xffff
; GFX6-NEXT: s_xor_b32 s4, s4, -1
+; GFX6-NEXT: v_and_b32_e32 v2, s6, v2
; GFX6-NEXT: s_sub_i32 s5, 0, 8
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2
; GFX6-NEXT: s_and_b32 s4, s5, 15
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, s6, v3
; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX6-NEXT: s_xor_b32 s5, s5, -1
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1
-; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i16_4_8:
define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
; GFX6-LABEL: v_fshr_v2i16_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_and_b32 s5, s1, s4
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000
+; GFX6-NEXT: s_mov_b32 s5, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: s_and_b32 s6, s2, s5
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT: s_lshl_b32 s0, s0, s3
-; GFX6-NEXT: s_lshl_b32 s2, s2, s3
-; GFX6-NEXT: s_lshr_b32 s5, s5, 1
-; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000
-; GFX6-NEXT: s_lshr_b32 s3, s1, 17
+; GFX6-NEXT: s_lshr_b32 s6, s6, 1
+; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000
; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
-; GFX6-NEXT: s_lshr_b32 s5, s5, s6
-; GFX6-NEXT: s_lshr_b32 s3, s3, s6
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
+; GFX6-NEXT: s_lshl_b32 s0, s0, s4
+; GFX6-NEXT: s_lshr_b32 s6, s6, s7
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT: s_or_b32 s0, s0, s5
+; GFX6-NEXT: s_or_b32 s0, s0, s6
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_lshl_b32 s2, s2, 1
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT: s_and_b32 s0, s1, s4
+; GFX6-NEXT: s_and_b32 s0, s2, s5
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_lshl_b32 s1, s1, s4
+; GFX6-NEXT: s_and_b32 s4, s3, s5
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT: s_lshr_b32 s4, s4, 1
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: s_lshl_b32 s3, s3, 1
; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT: s_and_b32 s0, s3, s4
+; GFX6-NEXT: s_and_b32 s0, s3, s5
+; GFX6-NEXT: s_lshr_b32 s4, s4, s7
+; GFX6-NEXT: s_or_b32 s1, s1, s4
; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-LABEL: v_fshr_v2i16_svs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v0
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_and_b32 s2, s2, s4
+; GFX6-NEXT: v_and_b32_e32 v3, s4, v1
+; GFX6-NEXT: s_or_b32 s2, s3, s2
; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v0
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX6-NEXT: s_lshl_b32 s0, s0, s3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s5, v1
-; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
-; GFX6-NEXT: s_lshl_b32 s0, s2, s3
; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX6-NEXT: s_xor_b32 s0, s1, -1
+; GFX6-NEXT: s_lshl_b32 s0, s1, s3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_or_b32_e32 v3, s0, v3
+; GFX6-NEXT: s_xor_b32 s0, s2, -1
; GFX6-NEXT: s_and_b32 s2, s0, 15
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_andn2_b32 s0, 15, s0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v2
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2
; GFX6-NEXT: s_andn2_b32 s1, 15, s1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, s0, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshr_v2i16_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s3, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_bfe_u32 s2, 1, 0x100000
-; GFX6-NEXT: s_and_b32 s4, s0, s3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1
-; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000
-; GFX6-NEXT: s_lshr_b32 s2, s0, 17
-; GFX6-NEXT: s_lshr_b32 s4, s4, 1
-; GFX6-NEXT: s_lshr_b32 s2, s2, s5
-; GFX6-NEXT: s_lshr_b32 s4, s4, s5
-; GFX6-NEXT: v_or_b32_e32 v1, s2, v1
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_and_b32 s2, s2, s4
+; GFX6-NEXT: s_or_b32 s2, s3, s2
+; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000
+; GFX6-NEXT: s_and_b32 s5, s0, s4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1
+; GFX6-NEXT: s_and_b32 s3, s1, s4
+; GFX6-NEXT: s_lshr_b32 s5, s5, 1
+; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT: s_lshr_b32 s3, s3, 1
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_xor_b32 s1, s1, -1
-; GFX6-NEXT: s_and_b32 s5, s1, 15
-; GFX6-NEXT: s_and_b32 s0, s0, s3
-; GFX6-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX6-NEXT: s_lshr_b32 s4, s1, 16
-; GFX6-NEXT: s_andn2_b32 s1, 15, s1
+; GFX6-NEXT: s_lshr_b32 s5, s5, s6
+; GFX6-NEXT: s_lshr_b32 s3, s3, s6
+; GFX6-NEXT: s_xor_b32 s2, s2, -1
+; GFX6-NEXT: s_and_b32 s0, s0, s4
+; GFX6-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX6-NEXT: s_and_b32 s5, s2, 15
+; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX6-NEXT: s_lshr_b32 s3, s2, 16
+; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_and_b32 s0, s4, 15
+; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX6-NEXT: s_lshl_b32 s2, s2, 1
-; GFX6-NEXT: s_andn2_b32 s1, 15, s4
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT: s_and_b32 s0, s2, s3
+; GFX6-NEXT: s_and_b32 s0, s1, s4
+; GFX6-NEXT: s_andn2_b32 s2, 15, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, 1
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
; GFX6-NEXT: s_lshr_b32 s0, s0, s1
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
- ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD]](s8)
+ ; CHECK: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[SEXT]](s16)
+ ; CHECK: $vgpr0 = COPY [[SEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32)
; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
- ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
- ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[LOAD]](s8)
+ ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT]](s16)
+ ; CHECK: $vgpr0 = COPY [[ZEXT1]](s32)
; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY]], 24
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_ZEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY]], 24
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[ASSERT_SEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](s24), [[DEF]](p1) :: (store 3 into `i24 addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
- ; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD]](s32)
; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 2 from %fixed-stack.2, align 4, addrspace 5)
- ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16)
; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5)
; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
; CHECK: [[COPY35:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)
- ; CHECK: G_STORE [[LOAD]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
- ; CHECK: G_STORE [[TRUNC]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
+ ; CHECK: G_STORE [[TRUNC]](s1), [[COPY33]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
+ ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY34]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
; GFX6-LABEL: v_lshr_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v0
+; GFX6-NEXT: s_mov_b32 s4, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 15, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:
; GFX6-LABEL: s_lshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_and_b32 s1, s1, s4
+; GFX6-NEXT: s_and_b32 s2, s2, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
-; GFX6-NEXT: s_lshr_b32 s0, s0, s1
-; GFX6-NEXT: s_lshr_b32 s1, s2, s3
+; GFX6-NEXT: s_lshr_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s4
+; GFX6-NEXT: s_and_b32 s1, s1, s4
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: lshr_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s0
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16:
define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_commute:
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_and_b32 s1, s4, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_use:
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s4, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s3, 16
+; GFX6-NEXT: s_and_b32 s2, s2, s1
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s3, s4, s1
+; GFX6-NEXT: s_lshl_b32 s2, s5, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_lshl_b32 s3, s7, 16
+; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
+; GFX6-NEXT: s_xor_b32 s1, s1, -1
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
}
define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
-; GCN-LABEL: v_orn2_v2i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_orn2_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_orn2_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_orn2_v2i16:
; GFX10: ; %bb.0:
; GFX6-LABEL: v_roundeven_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16:
; GFX6-LABEL: v_roundeven_v2f16_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rndne_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT: v_rndne_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_v2f16_fneg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_v2f16_fneg:
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: v_max_i32_e32 v1, v5, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v4
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v5, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_saddsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, 0
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s7, s5, s7
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, 0
; GFX6-NEXT: s_sub_i32 s6, s4, s6
-; GFX6-NEXT: s_max_i32 s1, s7, s1
-; GFX6-NEXT: s_min_i32 s1, s1, s6
-; GFX6-NEXT: s_add_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_max_i32 s2, s7, s2
+; GFX6-NEXT: s_min_i32 s2, s2, s6
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, 0
; GFX6-NEXT: s_sub_i32 s3, s4, s3
define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: saddsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: s_min_i32 s5, s0, 0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s5, s3, s5
; GFX6-NEXT: s_brev_b32 s2, -2
define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: saddsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3
; GFX6-NEXT: s_brev_b32 s2, -2
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s2, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s2
+; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_lshl_b32 s0, s0, 2
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: ; return to shader part epilog
; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v2
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16:
; GFX6-LABEL: v_shl_v2i16_15:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 15, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i16_15:
; GFX6-LABEL: s_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_and_b32 s1, s1, s4
-; GFX6-NEXT: s_lshl_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, s3
+; GFX6-NEXT: s_and_b32 s2, s2, s4
+; GFX6-NEXT: s_lshl_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s2, s3, s4
+; GFX6-NEXT: s_lshl_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: shl_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_mov_b32 s2, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1
; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: shl_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s1, s2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-LABEL: v_ssubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
+; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_ssubsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s4, -2
; GFX6-NEXT: s_max_i32 s6, s0, -1
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_sub_i32 s6, s6, s4
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_min_i32 s7, s0, -1
; GFX6-NEXT: s_sub_i32 s7, s7, s5
-; GFX6-NEXT: s_max_i32 s1, s6, s1
-; GFX6-NEXT: s_min_i32 s1, s1, s7
-; GFX6-NEXT: s_sub_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_max_i32 s2, s6, s2
+; GFX6-NEXT: s_min_i32 s2, s2, s7
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_max_i32 s3, s1, -1
; GFX6-NEXT: s_sub_i32 s3, s3, s4
define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: ssubsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: s_max_i32 s4, s0, -1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_sub_i32 s4, s4, s2
; GFX6-NEXT: s_brev_b32 s3, 1
define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: ssubsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_brev_b32 s2, -2
; GFX6-NEXT: v_max_i32_e32 v2, -1, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2
; GFX6-NEXT: s_brev_b32 s3, 1
; GFX6-LABEL: v_uaddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
-; GFX6-NEXT: v_min_u32_e32 v1, v4, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_uaddsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
-; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_not_b32 s4, s0
-; GFX6-NEXT: s_min_u32 s1, s4, s1
-; GFX6-NEXT: s_add_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_min_u32 s2, s4, s2
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_add_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_not_b32 s3, s1
; GFX6-NEXT: s_min_u32 s2, s3, s2
define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: uaddsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_not_b32 s2, s0
; GFX6-NEXT: v_min_u32_e32 v0, s2, v0
define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: uaddsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_min_u32_e32 v2, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_u32_e32 v1, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT: v_min_u32_e32 v2, v1, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshr_b32 s2, s0, 16
-; GFX6-NEXT: s_lshr_b32 s3, s1, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_min_u32 s2, s0, s2
+; GFX6-NEXT: s_sub_i32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_min_u32 s1, s0, s1
-; GFX6-NEXT: s_sub_i32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 16
; GFX6-NEXT: s_lshl_b32 s2, s3, 16
; GFX6-NEXT: s_min_u32 s2, s1, s2
; GFX6-NEXT: s_sub_i32 s1, s1, s2
define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: usubsat_v2i16_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_min_u32_e32 v0, s0, v0
define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: usubsat_v2i16_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_min_u32_e32 v2, s0, v0
define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v2i16_one_use:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s4
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: s_lshl_b32 s1, s3, 16
+; GFX7-NEXT: s_and_b32 s2, s2, s4
+; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, s1
; GFX7-NEXT: s_xor_b32 s0, s0, -1
; GFX7-NEXT: ; return to shader part epilog
i8 signext %p4, i16 signext %p5) {
; CHECK-LABEL: name: test_stack_args_signext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR1:%[0-9]+]]:_(s32) = COPY $r1
; CHECK: [[VREGP1:%[0-9]+]]:_(s16) = G_TRUNC [[VREGR1]]
; CHECK: [[FIP5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
; CHECK: [[VREGP5EXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP5]](p0){{.*}}load 4
-; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[VREGP5EXT]]
+; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[VREGP5EXT]], 16
+; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]]
; CHECK: [[SUM:%[0-9]+]]:_(s16) = G_ADD [[VREGP1]], [[VREGP5]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
i8 zeroext %p4, i16 zeroext %p5) {
; CHECK-LABEL: name: test_stack_args_zeroext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR2:%[0-9]+]]:_(s32) = COPY $r2
; CHECK: [[VREGP2:%[0-9]+]]:_(s8) = G_TRUNC [[VREGR2]]
; CHECK: [[FIP4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P4]]
; CHECK: [[VREGP4EXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP4]](p0){{.*}}load 4
-; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[VREGP4EXT]]
+; CHECK: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[VREGP4EXT]], 8
+; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]]
; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[VREGP4]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
i8 %p4, i16 %p5) {
; CHECK-LABEL: name: test_stack_args_noext
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0, size: 4, alignment: 8,
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4, size: 4, alignment: 4,
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[VREGR2:%[0-9]+]]:_(s32) = COPY $r2
; CHECK: [[VREGP2:%[0-9]+]]:_(s8) = G_TRUNC [[VREGR2]]
; CHECK: [[FIP4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P4]]
-; CHECK: [[VREGP4:%[0-9]+]]:_(s8) = G_LOAD [[FIP4]](p0){{.*}}load 1
-; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[VREGP4]]
+; CHECK: [[VREGP4:%[0-9]+]]:_(s32) = G_LOAD [[FIP4]](p0){{.*}}load 4
+; CHECK: [[TRUNC_VREGP4:%[0-9]+]]:_(s8) = G_TRUNC [[VREGP4]]
+; CHECK: [[SUM:%[0-9]+]]:_(s8) = G_ADD [[VREGP2]], [[TRUNC_VREGP4]]
; CHECK: [[SUM_EXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUM]]
; CHECK: $r0 = COPY [[SUM_EXT]](s32)
; CHECK: BX_RET 14 /* CC::al */, $noreg, implicit $r0
i8 signext %p4, i16 signext %p5) {
; CHECK-LABEL: name: test_stack_args_extend_the_extended
; CHECK: fixedStack:
-; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 1
-; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 2
+; CHECK-DAG: id: [[P4:[0-9]]]{{.*}}offset: 0{{.*}}size: 4, alignment: 8
+; CHECK-DAG: id: [[P5:[0-9]]]{{.*}}offset: 4{{.*}}size: 4, alignment: 4
; CHECK: liveins: $r0, $r1, $r2, $r3
; CHECK: [[FIP5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
; CHECK: [[VREGP5SEXT:%[0-9]+]]:_(s32) = G_LOAD [[FIP5]](p0){{.*}}load 4
-; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[VREGP5SEXT]]
+; CHECK: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[VREGP5SEXT]], 16
+; CHECK: [[VREGP5:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]]
; CHECK: [[VREGP5ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[VREGP5]]
; CHECK: $r0 = COPY [[VREGP5ZEXT]]
; CHECK: BX_RET 14 /* CC::al */, $noreg, implicit $r0
define i16 @test_stack_args_mixed(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 %p4, i16 %p5) {
; CHECK-LABEL: test_stack_args_mixed:
; CHECK: add [[P5ADDR:r[0-9]+]], sp, #4
-; CHECK: ldrh [[P5:r[0-9]+]], {{.*}}[[P5ADDR]]
+; CHECK: ldr [[P5:r[0-9]+]], {{.*}}[[P5ADDR]]
; CHECK: add r0, r1, [[P5]]
; CHECK: bx lr
entry:
define i8 @test_stack_args_noext(i32 %p0, i16 %p1, i8 %p2, i1 %p3, i8 %p4) {
; CHECK-LABEL: test_stack_args_noext:
; CHECK: mov [[P4ADDR:r[0-9]+]], sp
-; CHECK: ldrb [[P4:r[0-9]+]], {{.*}}[[P4ADDR]]
+; CHECK: ldr [[P4:r[0-9]+]], {{.*}}[[P4ADDR]]
; CHECK: add r0, r2, [[P4]]
; CHECK: bx lr
entry:
; SOFT-ABI: [[OFF1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; SOFT-ABI: [[FI1:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP1]], [[OFF1]](s32)
+ ; FIXME: Should avoid multiple copies from $sp
; FIXME: This ought to be align 8 but ARM's call lowering hardcodes it to 1
; SOFT-ABI: G_STORE [[Y0]](s32), [[FI1]](p0){{.*}}store 4 into stack, align 1)
+ ; SOFT-ABI: [[SP2:%[0-9]+]]:_(p0) = COPY $sp
; SOFT-ABI: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; SOFT-ABI: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FI1]], [[OFF2]](s32)
+ ; SOFT-ABI: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP2]], [[OFF2]](s32)
; SOFT-ABI: G_STORE [[Y1]](s32), [[FI2]](p0){{.*}}store 4 into stack + 4, align 1)
; SOFT-ABI: BL &fma, {{.*}}, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0, implicit-def $r1
; SOFT-ABI-DAG: [[R0:%[0-9]+]]:_(s32) = COPY $r0
; CHECK: $r2 = COPY [[SEXTB]]
; CHECK: [[ZEXTB:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]](s16)
; CHECK: $r3 = COPY [[ZEXTB]]
+; CHECK: [[SEXTA2:%[0-9]+]]:_(s32) = G_SEXT [[AVREG]]
; CHECK: [[SP1:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK: [[FI1:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP1]], [[OFF1]](s32)
-; CHECK: [[SEXTA2:%[0-9]+]]:_(s32) = G_SEXT [[AVREG]]
; CHECK: G_STORE [[SEXTA2]](s32), [[FI1]](p0){{.*}}store 4
+; CHECK: [[ZEXTA2:%[0-9]+]]:_(s32) = G_ZEXT [[AVREG]]
; CHECK: [[SP2:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK: [[FI2:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP2]], [[OFF2]](s32)
-; CHECK: [[ZEXTA2:%[0-9]+]]:_(s32) = G_ZEXT [[AVREG]]
; CHECK: G_STORE [[ZEXTA2]](s32), [[FI2]](p0){{.*}}store 4
+; CHECK: [[SEXTB2:%[0-9]+]]:_(s32) = G_SEXT [[BVREG]]
; CHECK: [[SP3:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK: [[FI3:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP3]], [[OFF3]](s32)
-; CHECK: [[SEXTB2:%[0-9]+]]:_(s32) = G_SEXT [[BVREG]]
; CHECK: G_STORE [[SEXTB2]](s32), [[FI3]](p0){{.*}}store 4
+; CHECK: [[ZEXTB2:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]]
; CHECK: [[SP4:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK: [[FI4:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP4]], [[OFF4]](s32)
-; CHECK: [[ZEXTB2:%[0-9]+]]:_(s32) = G_ZEXT [[BVREG]]
; CHECK: G_STORE [[ZEXTB2]](s32), [[FI4]](p0){{.*}}store 4
+; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]]
; CHECK: [[SP5:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[OFF5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK: [[FI5:%[0-9]+]]:_(p0) = G_PTR_ADD [[SP5]], [[OFF5]](s32)
-; CHECK: [[ZEXTC:%[0-9]+]]:_(s32) = G_ZEXT [[CVREG]]
; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4
; ARM: BL @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
; THUMB: tBL 14 /* CC::al */, $noreg, @ext_target, csr_aapcs, implicit-def $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit $r3, implicit-def $r0
; CHECK: [[R0VREG:%[0-9]+]]:_(s32) = COPY $r0
-; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG]]
+; CHECK: [[R0VREG_ASSERT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[R0VREG]], 16
+; CHECK: [[RVREG:%[0-9]+]]:_(s16) = G_TRUNC [[R0VREG_ASSERT]]
; CHECK: ADJCALLSTACKUP 20, 0, 14 /* CC::al */, $noreg, implicit-def $sp, implicit $sp
; CHECK: [[RExtVREG:%[0-9]+]]:_(s32) = G_SEXT [[RVREG]]
; CHECK: $r0 = COPY [[RExtVREG]]
}
define half @test_half(half %a, half %b) {
-; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half)
+; CHECK: remark: {{.*}} unable to legalize instruction: %{{[0-9]+}}:_(s16) = G_FADD %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: test_half)
; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
%res = fadd half %a, %b
ret half %res
;
; X86-LABEL: test_add_i16:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addw %cx, %ax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
%ret = add i16 %arg1, %arg2
ret i16 %ret
;
; X86-LABEL: test_add_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: addb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addb %cl, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%ret = add i8 %arg1, %arg2
ret i8 %ret
define i8 @test_arg_i8(i8 %a) {
; X32-LABEL: test_arg_i8:
; X32: # %bb.0:
-; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i8:
define i16 @test_arg_i16(i16 %a) {
; X32-LABEL: test_arg_i16:
; X32: # %bb.0:
-; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i16:
+; XFAIL: *
+; FIXME: This test is broken due to https://bugs.llvm.org/show_bug.cgi?id=50035
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
; X86-LABEL: name: test_i8_args_8
; X86: bb.1.entry:
; X86: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.7
- ; X86: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.7, align 16)
+ ; X86: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.7, align 16)
+ ; X86: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD]](s32)
; X86: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.6
- ; X86: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.6, align 4)
+ ; X86: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.6, align 4)
+ ; X86: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s32)
; X86: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.5
- ; X86: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX2]](p0) :: (invariant load 1 from %fixed-stack.5, align 8)
+ ; X86: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (invariant load 1 from %fixed-stack.5, align 8)
+ ; X86: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s32)
; X86: [[FRAME_INDEX3:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.4
- ; X86: [[LOAD3:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX3]](p0) :: (invariant load 1 from %fixed-stack.4, align 4)
+ ; X86: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p0) :: (invariant load 1 from %fixed-stack.4, align 4)
+ ; X86: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD3]](s32)
; X86: [[FRAME_INDEX4:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.3
- ; X86: [[LOAD4:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX4]](p0) :: (invariant load 1 from %fixed-stack.3, align 16)
+ ; X86: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p0) :: (invariant load 1 from %fixed-stack.3, align 16)
+ ; X86: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD4]](s32)
; X86: [[FRAME_INDEX5:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.2
- ; X86: [[LOAD5:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX5]](p0) :: (invariant load 1 from %fixed-stack.2, align 4)
+ ; X86: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p0) :: (invariant load 1 from %fixed-stack.2, align 4)
+ ; X86: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD5]](s32)
; X86: [[FRAME_INDEX6:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.1
- ; X86: [[LOAD6:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX6]](p0) :: (invariant load 1 from %fixed-stack.1, align 8)
+ ; X86: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p0) :: (invariant load 1 from %fixed-stack.1, align 8)
+ ; X86: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD6]](s32)
; X86: [[FRAME_INDEX7:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
- ; X86: [[LOAD7:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX7]](p0) :: (invariant load 1 from %fixed-stack.0, align 4)
+ ; X86: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p0) :: (invariant load 1 from %fixed-stack.0, align 4)
+ ; X86: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD7]](s32)
; X86: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_8bit
; X86: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_8bit
; X86: [[GV2:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_8bit
- ; X86: G_STORE [[LOAD]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
- ; X86: G_STORE [[LOAD6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
- ; X86: G_STORE [[LOAD7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
- ; X86: $al = COPY [[LOAD]](s8)
+ ; X86: G_STORE [[TRUNC]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
+ ; X86: G_STORE [[TRUNC6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
+ ; X86: G_STORE [[TRUNC7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
+ ; X86: $al = COPY [[TRUNC]](s8)
; X86: RET 0, implicit $al
; X64-LABEL: name: test_i8_args_8
; X64: bb.1.entry:
; X64: [[COPY5:%[0-9]+]]:_(s32) = COPY $r9d
; X64: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[COPY5]](s32)
; X64: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.1
- ; X64: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.1, align 16)
+ ; X64: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.1, align 16)
+ ; X64: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD]](s32)
; X64: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
- ; X64: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.0, align 8)
+ ; X64: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load 1 from %fixed-stack.0, align 8)
+ ; X64: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s32)
; X64: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_8bit
; X64: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_8bit
; X64: [[GV2:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_8bit
; X64: G_STORE [[TRUNC]](s8), [[GV]](p0) :: (store 1 into @a1_8bit)
- ; X64: G_STORE [[LOAD]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
- ; X64: G_STORE [[LOAD1]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
+ ; X64: G_STORE [[TRUNC6]](s8), [[GV1]](p0) :: (store 1 into @a7_8bit)
+ ; X64: G_STORE [[TRUNC7]](s8), [[GV2]](p0) :: (store 1 into @a8_8bit)
; X64: $al = COPY [[TRUNC]](s8)
; X64: RET 0, implicit $al
entry:
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp
; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s | FileCheck %s
define i1 * @test_store_i1(i1 %val, i1 * %p1) {
; CHECK-LABEL: test_store_i1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movb 4(%esp), %cl
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: andb $1, %cl
; CHECK-NEXT: movb %cl, (%eax)
define i8 * @test_store_i8(i8 %val, i8 * %p1) {
; CHECK-LABEL: test_store_i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: movb 4(%esp), %cl
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: movb %cl, (%eax)
; CHECK-NEXT: retl
define i16 * @test_store_i16(i16 %val, i16 * %p1) {
; CHECK-LABEL: test_store_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl 4(%esp), %ecx
+; CHECK-NEXT: movl 4(%esp), %ecx
; CHECK-NEXT: movl 8(%esp), %eax
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: retl