return true;
}
+// Post-instruction-selection hook (TargetLowering override): called for every
+// selected MachineInstr whose definition sets hasPostISelHook. All such
+// Hexagon instructions are currently HVX splat pseudos, so simply delegate
+// to the HVX-specific expansion.
+void HexagonTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ AdjustHvxInstrPostInstrSelection(MI, Node);
+}
+
Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder,
Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const {
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
EVT NewVT) const override;
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
// Handling of atomic RMW instructions.
Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const override;
bool allowsHvxMisalignedMemoryAccesses(MVT VecTy,
MachineMemOperand::Flags Flags,
bool *Fast) const;
+ void AdjustHvxInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const;
bool isHvxSingleTy(MVT Ty) const;
bool isHvxPairTy(MVT Ty) const;
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/Support/CommandLine.h"
return true;
}
+// Expand the HVX splat pseudo-instructions (PS_vsplati[bhw], PS_vsplatr[bhw])
+// produced by instruction selection into real HVX instructions. HVX v62+ has
+// native byte/halfword splats (V6_lvsplatb / V6_lvsplath); earlier HVX only
+// has the word splat (V6_lvsplatw), so byte/halfword values are first
+// replicated across a 32-bit scalar register and then word-splatted.
+// Opcodes not listed in the switch are left untouched.
+void HexagonTargetLowering::AdjustHvxInstrPostInstrSelection(
+ MachineInstr &MI, SDNode *Node) const {
+ unsigned Opc = MI.getOpcode();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineBasicBlock &MB = *MI.getParent();
+ MachineFunction &MF = *MB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ auto At = MI.getIterator();
+
+ switch (Opc) {
+ case Hexagon::PS_vsplatib:
+ if (Subtarget.useHVXV62Ops()) {
+ // SplatV = A2_tfrsi #imm
+ // OutV = V6_lvsplatb SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_tfrsi), SplatV)
+ .add(MI.getOperand(1));
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatb), OutV)
+ .addReg(SplatV);
+ } else {
+ // SplatV = A2_tfrsi #imm:#imm:#imm:#imm
+ // OutV = V6_lvsplatw SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const MachineOperand &InpOp = MI.getOperand(1);
+ assert(InpOp.isImm());
+ // Replicate the low byte into all four byte lanes of the scalar.
+ uint32_t V = InpOp.getImm() & 0xFF;
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_tfrsi), SplatV)
+ .addImm(V << 24 | V << 16 | V << 8 | V);
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatw), OutV).addReg(SplatV);
+ }
+ // The pseudo is fully replaced by the instructions inserted before it.
+ MB.erase(At);
+ break;
+ case Hexagon::PS_vsplatrb:
+ if (Subtarget.useHVXV62Ops()) {
+ // OutV = V6_lvsplatb Inp
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatb), OutV)
+ .add(MI.getOperand(1));
+ } else {
+ // SplatV = S2_vsplatrb Inp
+ // OutV = V6_lvsplatw SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const MachineOperand &InpOp = MI.getOperand(1);
+ BuildMI(MB, At, DL, TII.get(Hexagon::S2_vsplatrb), SplatV)
+ .addReg(InpOp.getReg(), 0, InpOp.getSubReg());
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatw), OutV)
+ .addReg(SplatV);
+ }
+ MB.erase(At);
+ break;
+ case Hexagon::PS_vsplatih:
+ if (Subtarget.useHVXV62Ops()) {
+ // SplatV = A2_tfrsi #imm
+ // OutV = V6_lvsplath SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_tfrsi), SplatV)
+ .add(MI.getOperand(1));
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplath), OutV)
+ .addReg(SplatV);
+ } else {
+ // SplatV = A2_tfrsi #imm:#imm
+ // OutV = V6_lvsplatw SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const MachineOperand &InpOp = MI.getOperand(1);
+ assert(InpOp.isImm());
+ // Replicate the low halfword into both halfword lanes of the scalar.
+ uint32_t V = InpOp.getImm() & 0xFFFF;
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_tfrsi), SplatV)
+ .addImm(V << 16 | V);
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatw), OutV).addReg(SplatV);
+ }
+ MB.erase(At);
+ break;
+ case Hexagon::PS_vsplatrh:
+ if (Subtarget.useHVXV62Ops()) {
+ // OutV = V6_lvsplath Inp
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplath), OutV)
+ .add(MI.getOperand(1));
+ } else {
+ // SplatV = A2_combine_ll Inp, Inp
+ // OutV = V6_lvsplatw SplatV
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ const MachineOperand &InpOp = MI.getOperand(1);
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_combine_ll), SplatV)
+ .addReg(InpOp.getReg(), 0, InpOp.getSubReg())
+ .addReg(InpOp.getReg(), 0, InpOp.getSubReg())
+ Register OutV = MI.getOperand(0).getReg();
+ BuildMI(MB, At, DL, TII.get(Hexagon::V6_lvsplatw), OutV).addReg(SplatV);
+ }
+ MB.erase(At);
+ break;
+ case Hexagon::PS_vsplatiw:
+ case Hexagon::PS_vsplatrw:
+ // Word splats rewrite MI in place instead of erasing it: only the opcode
+ // (and, for the immediate form, the input operand) needs to change.
+ if (Opc == Hexagon::PS_vsplatiw) {
+ // SplatV = A2_tfrsi #imm
+ Register SplatV = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(MB, At, DL, TII.get(Hexagon::A2_tfrsi), SplatV)
+ .add(MI.getOperand(1));
+ MI.getOperand(1).ChangeToRegister(SplatV, false);
+ }
+ // OutV = V6_lvsplatw SplatV/Inp
+ MI.setDesc(TII.get(Hexagon::V6_lvsplatw));
+ break;
+ }
+}
+
SDValue
HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const {
(V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
}
-// Splats for HvxV60
-def V60splatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
-def V60splatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
-def V60splatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
-def V60splatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatw (S2_vsplatrb $Rs))>;
-def V60splatrh: OutPatFrag<(ops node:$Rs),
- (V6_lvsplatw (A2_combine_ll $Rs, $Rs))>;
-def V60splatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
-
-// Splats for HvxV62+
-def V62splatib: OutPatFrag<(ops node:$V), (V6_lvsplatb (ToI32 $V))>;
-def V62splatih: OutPatFrag<(ops node:$V), (V6_lvsplath (ToI32 $V))>;
-def V62splatiw: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 $V))>;
-def V62splatrb: OutPatFrag<(ops node:$Rs), (V6_lvsplatb $Rs)>;
-def V62splatrh: OutPatFrag<(ops node:$Rs), (V6_lvsplath $Rs)>;
-def V62splatrw: OutPatFrag<(ops node:$Rs), (V6_lvsplatw $Rs)>;
-
def Rep: OutPatFrag<(ops node:$N), (Combinev $N, $N)>;
-let Predicates = [UseHVX,UseHVXV60] in {
+let Predicates = [UseHVX] in {
let AddedComplexity = 10 in {
- def: Pat<(VecI8 (splat_vector u8_0ImmPred:$V)), (V60splatib $V)>;
- def: Pat<(VecI16 (splat_vector u16_0ImmPred:$V)), (V60splatih $V)>;
- def: Pat<(VecI32 (splat_vector anyimm:$V)), (V60splatiw $V)>;
- def: Pat<(VecPI8 (splat_vector u8_0ImmPred:$V)), (Rep (V60splatib $V))>;
- def: Pat<(VecPI16 (splat_vector u16_0ImmPred:$V)), (Rep (V60splatih $V))>;
- def: Pat<(VecPI32 (splat_vector anyimm:$V)), (Rep (V60splatiw $V))>;
- }
- def: Pat<(VecI8 (splat_vector I32:$Rs)), (V60splatrb $Rs)>;
- def: Pat<(VecI16 (splat_vector I32:$Rs)), (V60splatrh $Rs)>;
- def: Pat<(VecI32 (splat_vector I32:$Rs)), (V60splatrw $Rs)>;
- def: Pat<(VecPI8 (splat_vector I32:$Rs)), (Rep (V60splatrb $Rs))>;
- def: Pat<(VecPI16 (splat_vector I32:$Rs)), (Rep (V60splatrh $Rs))>;
- def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V60splatrw $Rs))>;
-}
-let Predicates = [UseHVX,UseHVXV62] in {
- let AddedComplexity = 30 in {
- def: Pat<(VecI8 (splat_vector u8_0ImmPred:$V)), (V62splatib imm:$V)>;
- def: Pat<(VecI16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>;
- def: Pat<(VecI32 (splat_vector anyimm:$V)), (V62splatiw imm:$V)>;
- def: Pat<(VecPI8 (splat_vector u8_0ImmPred:$V)),
- (Rep (V62splatib imm:$V))>;
- def: Pat<(VecPI16 (splat_vector u16_0ImmPred:$V)),
- (Rep (V62splatih imm:$V))>;
- def: Pat<(VecPI32 (splat_vector anyimm:$V)),
- (Rep (V62splatiw imm:$V))>;
- }
- let AddedComplexity = 20 in {
- def: Pat<(VecI8 (splat_vector I32:$Rs)), (V62splatrb $Rs)>;
- def: Pat<(VecI16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>;
- def: Pat<(VecI32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>;
- def: Pat<(VecPI8 (splat_vector I32:$Rs)), (Rep (V62splatrb $Rs))>;
- def: Pat<(VecPI16 (splat_vector I32:$Rs)), (Rep (V62splatrh $Rs))>;
- def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V62splatrw $Rs))>;
+ def: Pat<(VecI8 (splat_vector u8_0ImmPred:$V)), (PS_vsplatib imm:$V)>;
+ def: Pat<(VecI16 (splat_vector u16_0ImmPred:$V)), (PS_vsplatih imm:$V)>;
+ def: Pat<(VecI32 (splat_vector anyimm:$V)), (PS_vsplatiw imm:$V)>;
+ def: Pat<(VecPI8 (splat_vector u8_0ImmPred:$V)), (Rep (PS_vsplatib imm:$V))>;
+ def: Pat<(VecPI16 (splat_vector u16_0ImmPred:$V)), (Rep (PS_vsplatih imm:$V))>;
+ def: Pat<(VecPI32 (splat_vector anyimm:$V)), (Rep (PS_vsplatiw imm:$V))>;
}
+ def: Pat<(VecI8 (splat_vector I32:$Rs)), (PS_vsplatrb $Rs)>;
+ def: Pat<(VecI16 (splat_vector I32:$Rs)), (PS_vsplatrh $Rs)>;
+ def: Pat<(VecI32 (splat_vector I32:$Rs)), (PS_vsplatrw $Rs)>;
+ def: Pat<(VecPI8 (splat_vector I32:$Rs)), (Rep (PS_vsplatrb $Rs))>;
+ def: Pat<(VecPI16 (splat_vector I32:$Rs)), (Rep (PS_vsplatrh $Rs))>;
+ def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (PS_vsplatrw $Rs))>;
}
let Predicates = [UseHVXV68, UseHVXFloatingPoint] in {
let AddedComplexity = 30 in {
- def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>;
- def: Pat<(VecF32 (splat_vector anyint:$V)), (V62splatiw imm:$V)>;
- def: Pat<(VecF32 (splat_vector f32ImmPred:$V)), (V62splatiw (ftoi $V))>;
+ def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (PS_vsplatih imm:$V)>;
+ def: Pat<(VecF32 (splat_vector anyint:$V)), (PS_vsplatiw imm:$V)>;
+ def: Pat<(VecF32 (splat_vector f32ImmPred:$V)), (PS_vsplatiw (ftoi $V))>;
}
let AddedComplexity = 20 in {
- def: Pat<(VecF16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>;
- def: Pat<(VecF32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>;
- def: Pat<(VecF32 (splat_vector F32:$Rs)), (V62splatrw $Rs)>;
+ def: Pat<(VecF16 (splat_vector I32:$Rs)), (PS_vsplatrh $Rs)>;
+ def: Pat<(VecF32 (splat_vector I32:$Rs)), (PS_vsplatrw $Rs)>;
+ def: Pat<(VecF32 (splat_vector F32:$Rs)), (PS_vsplatrw $Rs)>;
}
}
def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>;
def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>;
- let Predicates = [UseHVX,UseHVXV60] in {
- def: Pat<(VecI16 (bswap HVI16:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V60splatib (i32 0x01)))>;
- def: Pat<(VecI32 (bswap HVI32:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V60splatib (i32 0x03)))>;
- }
- let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in {
- def: Pat<(VecI16 (bswap HVI16:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V62splatib (i32 0x01)))>;
- def: Pat<(VecI32 (bswap HVI32:$Vs)),
- (V6_vdelta HvxVR:$Vs, (V62splatib (i32 0x03)))>;
- }
+ def: Pat<(VecI16 (bswap HVI16:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (PS_vsplatib (i32 0x01)))>;
+ def: Pat<(VecI32 (bswap HVI32:$Vs)),
+ (V6_vdelta HvxVR:$Vs, (PS_vsplatib (i32 0x03)))>;
def: Pat<(VecI8 (ctpop HVI8:$Vs)),
(V6_vshuffeb (V6_vpopcounth (HiVec (V6_vzb HvxVR:$Vs))),
(V6_vaddw (LoVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))),
(HiVec (V6_vzh (V6_vpopcounth HvxVR:$Vs))))>;
- let Predicates = [UseHVX,UseHVXV60] in
- def: Pat<(VecI8 (ctlz HVI8:$Vs)),
- (V6_vsubb (V6_vshuffeb (V6_vcl0h (HiVec (V6_vzb HvxVR:$Vs))),
- (V6_vcl0h (LoVec (V6_vzb HvxVR:$Vs)))),
- (V60splatib (i32 0x08)))>;
- let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in
def: Pat<(VecI8 (ctlz HVI8:$Vs)),
(V6_vsubb (V6_vshuffeb (V6_vcl0h (HiVec (V6_vzb HvxVR:$Vs))),
(V6_vcl0h (LoVec (V6_vzb HvxVR:$Vs)))),
- (V62splatib (i32 0x08)))>;
+ (PS_vsplatib (i32 0x08)))>;
def: Pat<(VecI16 (ctlz HVI16:$Vs)), (V6_vcl0h HvxVR:$Vs)>;
def: Pat<(VecI32 (ctlz HVI32:$Vs)), (V6_vcl0w HvxVR:$Vs)>;
def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<"">, PredRel;
}
+// Splat-from-register pseudo-instructions. hasPostISelHook = 1 routes each
+// selected instance through AdjustInstrPostInstrSelection, which expands it
+// into the real HVX splat sequence for the current subtarget.
+let Predicates = [UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0, hasPostISelHook = 1 in
+class Vsplatr_template : InstHexagon<(outs HvxVR:$Vd), (ins IntRegs:$Rs),
+ "", [], "", V6_lvsplatw.Itinerary, V6_lvsplatw.Type>;
+def PS_vsplatrb: Vsplatr_template;
+def PS_vsplatrh: Vsplatr_template;
+def PS_vsplatrw: Vsplatr_template;
+
+// Splat-from-immediate pseudo-instructions. Like the register variants,
+// these carry hasPostISelHook = 1 and are expanded post-ISel into real HVX
+// splat instructions appropriate for the subtarget.
+let Predicates = [UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0, hasPostISelHook = 1 in
+class Vsplati_template : InstHexagon<(outs HvxVR:$Vd), (ins s32_0Imm:$Val),
+ "", [], "", V6_lvsplatw.Itinerary, V6_lvsplatw.Type>;
+def PS_vsplatib: Vsplati_template;
+def PS_vsplatih: Vsplati_template;
+def PS_vsplatiw: Vsplati_template;
+
// Vector store pseudos
let Predicates = [HasV60,UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayStore = 1, accessSize = HVXVectorAccess, hasSideEffects = 0 in
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vsplat(r7)
; CHECK-NEXT: r3:2 = combine(#31,#5)
-; CHECK-NEXT: v3.h = vabs(v0.h)
+; CHECK-NEXT: v2.h = vabs(v0.h)
; CHECK-NEXT: v4.h = vabs(v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = ##32768
-; CHECK-NEXT: v5.uh = vcl0(v3.uh)
+; CHECK-NEXT: v5.uh = vcl0(v2.uh)
; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v6.uh = vcl0(v4.uh)
-; CHECK-NEXT: v5.h = vadd(v5.h,v2.h)
+; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27 = vmux(q0,v10,v9)
-; CHECK-NEXT: v6.h = vadd(v6.h,v2.h)
+; CHECK-NEXT: v6.h = vadd(v6.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.h = vasl(v3.h,v5.h)
+; CHECK-NEXT: v2.h = vasl(v2.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,v6.h)
-; CHECK-NEXT: v13 = vand(v3,v8)
-; CHECK-NEXT: v11.h = vadd(v3.h,v7.h)
+; CHECK-NEXT: v13 = vand(v2,v8)
+; CHECK-NEXT: v11.h = vadd(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.h = vadd(v4.h,v7.h)
; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h)
; CHECK-NEXT: v8 = vand(v4,v8)
-; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh)
+; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v11.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2)
-; CHECK-NEXT: v13 = vmux(q2,v9,v2)
+; CHECK-NEXT: v13 = vmux(q2,v9,v3)
; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h)
; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2)
-; CHECK-NEXT: v22 = vmux(q2,v9,v2)
-; CHECK-NEXT: v21 = vmux(q1,v2,v9)
-; CHECK-NEXT: v2 = vmux(q3,v2,v9)
+; CHECK-NEXT: v22 = vmux(q2,v9,v3)
+; CHECK-NEXT: v21 = vmux(q1,v3,v9)
+; CHECK-NEXT: v3 = vmux(q3,v3,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2)
; CHECK-NEXT: v13.h = vadd(v11.h,v13.h)
; CHECK-NEXT: v24.h = vadd(v20.h,v22.h)
-; CHECK-NEXT: v2.h = vadd(v2.h,v7.h)
+; CHECK-NEXT: v3.h = vadd(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2)
+; CHECK-NEXT: v12.uh = vlsr(v2.uh,r2)
; CHECK-NEXT: v23.h = vadd(v21.h,v7.h)
-; CHECK-NEXT: v2.h = vsub(v2.h,v6.h)
+; CHECK-NEXT: v3.h = vsub(v3.h,v6.h)
; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7)
-; CHECK-NEXT: v3.h = vsub(v23.h,v5.h)
+; CHECK-NEXT: v2.h = vsub(v23.h,v5.h)
; CHECK-NEXT: q1 = vcmp.eq(v12.h,v11.h)
; CHECK-NEXT: q2 = vcmp.eq(v19.h,v20.h)
; CHECK-NEXT: }
; CHECK-NEXT: v5 = vor(v27,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.h = vasl(v3.h,r4)
+; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v4 = vmux(q2,v26,v4)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.h = vasl(v2.h,r4)
+; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v4 = vor(v28,v4)
-; CHECK-NEXT: v29 = vor(v5,v3)
+; CHECK-NEXT: v29 = vor(v5,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2 = vor(v4,v2)
+; CHECK-NEXT: v3 = vor(v4,v3)
; CHECK-NEXT: v31 = vmux(q3,v9,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v30 = vmux(q2,v9,v2)
+; CHECK-NEXT: v30 = vmux(q2,v9,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3 = vsplat(r0)
+; CHECK-NEXT: v4 = vsplat(r0)
; CHECK-NEXT: r7 = #512
-; CHECK-NEXT: v4.w = vabs(v0.w)
+; CHECK-NEXT: v3.w = vabs(v0.w)
; CHECK-NEXT: v6.w = vabs(v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: r6 = ##-2147483648
-; CHECK-NEXT: v7.uw = vcl0(v4.uw)
+; CHECK-NEXT: v7.uw = vcl0(v3.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: v8.uw = vcl0(v6.uw)
; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
-; CHECK-NEXT: v7.w = vadd(v7.w,v3.w)
+; CHECK-NEXT: v7.w = vadd(v7.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
-; CHECK-NEXT: v8.w = vadd(v8.w,v3.w)
+; CHECK-NEXT: v8.w = vadd(v8.w,v4.w)
; CHECK-NEXT: v27 = vmux(q0,v10,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v4.w = vasl(v4.w,v7.w)
+; CHECK-NEXT: v3.w = vasl(v3.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,v8.w)
-; CHECK-NEXT: v11.w = vadd(v4.w,v5.w)
-; CHECK-NEXT: v12 = vand(v4,v9)
+; CHECK-NEXT: v11.w = vadd(v3.w,v5.w)
+; CHECK-NEXT: v12 = vand(v3,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v6.w,v5.w)
; CHECK-NEXT: v9 = vand(v6,v9)
; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w)
-; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw)
+; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v11.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w)
-; CHECK-NEXT: v23 = vmux(q1,v2,v3)
-; CHECK-NEXT: v14 = vmux(q2,v3,v2)
+; CHECK-NEXT: v23 = vmux(q1,v2,v4)
+; CHECK-NEXT: v14 = vmux(q2,v4,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v11.w = vadd(v22.w,v23.w)
; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw)
-; CHECK-NEXT: v25 = vmux(q3,v2,v3)
+; CHECK-NEXT: v25 = vmux(q3,v2,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v21.uw = vlsr(v3.uw,r2)
; CHECK-NEXT: v5.w = vadd(v24.w,v25.w)
-; CHECK-NEXT: v3 = vmux(q2,v3,v2)
+; CHECK-NEXT: v4 = vmux(q2,v4,v2)
; CHECK-NEXT: v7.w = vsub(v14.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
-; CHECK-NEXT: v3.w = vsub(v3.w,v8.w)
+; CHECK-NEXT: v4.w = vsub(v4.w,v8.w)
; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0)
-; CHECK-NEXT: v3.w = vadd(v3.w,v13.w)
+; CHECK-NEXT: v3.uw = vlsr(v22.uw,r0)
+; CHECK-NEXT: v4.w = vadd(v4.w,v13.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
-; CHECK-NEXT: v4 = vmux(q3,v11,v4)
+; CHECK-NEXT: v3 = vmux(q3,v11,v3)
; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0)
; CHECK-NEXT: v28 = vmux(q3,v10,v2)
-; CHECK-NEXT: v4 = vor(v27,v4)
+; CHECK-NEXT: v3 = vor(v27,v3)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v3.w,r4)
+; CHECK-NEXT: v4.w = vasl(v4.w,r4)
; CHECK-NEXT: v5 = vor(v28,v5)
-; CHECK-NEXT: v29 = vor(v4,v7)
+; CHECK-NEXT: v29 = vor(v3,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3 = vor(v5,v3)
+; CHECK-NEXT: v4 = vor(v5,v4)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v30 = vmux(q2,v2,v3)
+; CHECK-NEXT: v30 = vmux(q2,v2,v4)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
-; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1 = vsplat(r6)
-; CHECK-NEXT: v29 = vsplat(r2)
-; CHECK-NEXT: r3 = #512
-; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: v2 = vsplat(r6)
+; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3 = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##159,#8)
-; CHECK-NEXT: v5:4.uw = vunpack(v0.uh)
+; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r2)
; CHECK-NEXT: v7 = vsplat(r5)
-; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.uw = vcl0(v4.uw)
+; CHECK-NEXT: v5.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vadd(v5.w,v1.w)
+; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v6.w = vasl(v4.w,v5.w)
+; CHECK-NEXT: v6.w = vasl(v0.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.w = vadd(v6.w,v29.w)
-; CHECK-NEXT: v3 = vand(v6,v3)
+; CHECK-NEXT: v1.w = vadd(v6.w,v1.w)
+; CHECK-NEXT: v4 = vand(v6,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
-; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v0.uw)
-; CHECK-NEXT: q1 = vcmp.eq(v3.w,v2.w)
+; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v1.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
-; CHECK-NEXT: v0.uw = vlsr(v0.uw,r4)
-; CHECK-NEXT: v3 = vmux(q1,v2,v1)
-; CHECK-NEXT: v1 = vmux(q0,v1,v2)
+; CHECK-NEXT: v1.uw = vlsr(v1.uw,r4)
+; CHECK-NEXT: v4 = vmux(q1,v3,v2)
+; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.w = vsub(v1.w,v5.w)
-; CHECK-NEXT: v3.w = vadd(v0.w,v3.w)
-; CHECK-NEXT: q2 = vcmp.eq(v6.w,v0.w)
+; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
+; CHECK-NEXT: v4.w = vadd(v1.w,v4.w)
+; CHECK-NEXT: q2 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v30.uw = vlsr(v0.uw,r6)
-; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: v29.uw = vlsr(v1.uw,r6)
+; CHECK-NEXT: v2.w = vadd(v2.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v31.uw = vlsr(v3.uw,r6)
+; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.w = vasl(v1.w,r4)
-; CHECK-NEXT: v0 = vmux(q2,v31,v30)
+; CHECK-NEXT: v2.w = vasl(v2.w,r4)
+; CHECK-NEXT: v1 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vor(v0,v1)
+; CHECK-NEXT: v31 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q3,v2,v0)
+; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }