From 705e77abed0b0fb7c6ab268866d3f841838eaf97 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 10 Oct 2022 15:52:38 -0700 Subject: [PATCH] [Hexagon] Lower funnel shifts for HVX

HVX v62+ has bidirectional shifts, which do not mask the shift amount to the bit width. Instead, the shift amount is sign-extended from the log(BW)+1 bit value, and a negative value causes a shift in the other direction. For the shift amount being -BW, this reversed shift will shift all bits out, inserting 0s or sign bits depending on the type and direction.

--- llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 29 +- llvm/lib/Target/Hexagon/HexagonISelLowering.h | 6 +- llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 37 +- llvm/lib/Target/Hexagon/HexagonPatterns.td | 4 + llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 127 +- llvm/test/CodeGen/Hexagon/autohvx/funnel-128b.ll | 1291 ++++++++++++++++++++ 6 files changed, 1465 insertions(+), 29 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/autohvx/funnel-128b.ll

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index a61c2c2..157026c 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1899,6 +1899,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; + case HexagonISD::MFSHL: return "HexagonISD::MFSHL"; + case HexagonISD::MFSHR: return "HexagonISD::MFSHR"; case HexagonISD::SSAT: return "HexagonISD::SSAT"; case HexagonISD::USAT: return "HexagonISD::USAT"; case HexagonISD::VEXTRACTW: return "HexagonISD::VEXTRACTW"; @@ -2323,6 +2325,19 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) return SDValue(); } +SDValue +HexagonTargetLowering::getSplatValue(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + case ISD::BUILD_VECTOR: + if (SDValue S = cast<BuildVectorSDNode>(Op)->getSplatValue()) + return S; + break; + case ISD::SPLAT_VECTOR: + return Op.getOperand(0); + } + return SDValue(); +} + // Create a Hexagon-specific node for shifting a vector by an integer. SDValue HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) @@ -2342,18 +2357,8 @@ HexagonTargetLowering::getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) llvm_unreachable("Unexpected shift opcode"); } - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - const SDLoc &dl(Op); - - switch (Op1.getOpcode()) { - case ISD::BUILD_VECTOR: - if (SDValue S = cast<BuildVectorSDNode>(Op1)->getSplatValue()) - return DAG.getNode(NewOpc, dl, ty(Op), Op0, S); - break; - case ISD::SPLAT_VECTOR: - return DAG.getNode(NewOpc, dl, ty(Op), Op0, Op1.getOperand(0)); - } + if (SDValue Sp = getSplatValue(Op.getOperand(1), DAG)) + return DAG.getNode(NewOpc, SDLoc(Op), ty(Op), Op.getOperand(0), Sp); return SDValue(); }

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index f3e961a..2c05699 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -53,9 +53,11 @@ enum NodeType : unsigned { CP, // Constant pool. COMBINE, - VASL, + VASL, // Vector shifts by a scalar value VASR, VLSR, + MFSHL, // Funnel shifts with the shift amount guaranteed to be + MFSHR, // within the range of the bit width of the element. SSAT, // Signed saturate. 
USAT, // Unsigned saturate. @@ -375,6 +377,7 @@ private: SelectionDAG &DAG) const; SDValue contractPredicate(SDValue Vec64, const SDLoc &dl, SelectionDAG &DAG) const; + SDValue getSplatValue(SDValue Op, SelectionDAG &DAG) const; SDValue getVectorShiftByInt(SDValue Op, SelectionDAG &DAG) const; SDValue appendUndef(SDValue Val, MVT ResTy, SelectionDAG &DAG) const; @@ -500,6 +503,7 @@ private: SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxSelect(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxFunnelShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 28bc499..0fb707c 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -225,6 +225,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::ANY_EXTEND, T, Custom); setOperationAction(ISD::SIGN_EXTEND, T, Custom); setOperationAction(ISD::ZERO_EXTEND, T, Custom); + setOperationAction(ISD::FSHL, T, Custom); + setOperationAction(ISD::FSHR, T, Custom); if (T != ByteV) { setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom); // HVX only has shifts of words and halfwords. @@ -299,12 +301,14 @@ HexagonTargetLowering::initializeHVXLowering() { // Promote all shuffles to operate on vectors of bytes. setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW); } + setOperationAction(ISD::FSHL, T, Custom); + setOperationAction(ISD::FSHR, T, Custom); setOperationAction(ISD::SMIN, T, Custom); setOperationAction(ISD::SMAX, T, Custom); if (T.getScalarType() != MVT::i32) { - setOperationAction(ISD::UMIN, T, Custom); - setOperationAction(ISD::UMAX, T, Custom); + setOperationAction(ISD::UMIN, T, Custom); + setOperationAction(ISD::UMAX, T, Custom); } if (Subtarget.useHVXFloatingPoint()) { @@ -2112,6 +2116,31 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const { } SDValue +HexagonTargetLowering::LowerHvxFunnelShift(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + assert(Opc == ISD::FSHL || Opc == ISD::FSHR); + + // Make sure the shift amount is within the range of the bitwidth + // of the element type. + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + SDValue S = Op.getOperand(2); + + MVT InpTy = ty(A); + MVT ElemTy = InpTy.getVectorElementType(); + + const SDLoc &dl(Op); + unsigned ElemWidth = ElemTy.getSizeInBits(); + SDValue Mask = DAG.getSplatBuildVector( + InpTy, dl, DAG.getConstant(ElemWidth - 1, dl, ElemTy)); + + unsigned MOpc = Opc == ISD::FSHL ? 
HexagonISD::MFSHL : HexagonISD::MFSHR; + return DAG.getNode(MOpc, dl, ty(Op), + {A, B, DAG.getNode(ISD::AND, dl, InpTy, {S, Mask})}); +} + +SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); MVT ResTy = ty(Op); @@ -2958,6 +2987,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRA: case ISD::SHL: case ISD::SRL: + case ISD::FSHL: + case ISD::FSHR: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -2996,6 +3027,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRA: case ISD::SHL: case ISD::SRL: return LowerHvxShift(Op, DAG); + case ISD::FSHL: + case ISD::FSHR: return LowerHvxFunnelShift(Op, DAG); case ISD::MULHS: case ISD::MULHU: return LowerHvxMulh(Op, DAG); case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index b00ed06..34e41a4 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -1366,6 +1366,10 @@ def HexagonVASL: SDNode<"HexagonISD::VASL", SDTHexagonVShift>; def HexagonVASR: SDNode<"HexagonISD::VASR", SDTHexagonVShift>; def HexagonVLSR: SDNode<"HexagonISD::VLSR", SDTHexagonVShift>; +// Funnel shifts with the shift amount modulo the element bit width. +def HexagonMFSHL: SDNode<"HexagonISD::MFSHL", SDTIntShiftDOp>; +def HexagonMFSHR: SDNode<"HexagonISD::MFSHR", SDTIntShiftDOp>; + def: OpR_RI_pat, v2i32, V2I32, u5_0ImmPred>; def: OpR_RI_pat, v4i16, V4I16, u4_0ImmPred>; def: OpR_RI_pat, v2i32, V2I32, u5_0ImmPred>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 949e2c7..4519c85 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -41,7 +41,7 @@ def HwLen2: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(ST.getVectorLength()/2, SDLoc(N), MVT::i32); }]>; -def Q2V: OutPatFrag<(ops node:$Qs), (V6_vandqrt $Qs, (A2_tfrsi -1))>; +def Q2V: OutPatFrag<(ops node:$Qs), (V6_vandqrt $Qs, (ToI32 -1))>; def Combinev: OutPatFrag<(ops node:$Vs, node:$Vt), (REG_SEQUENCE HvxWR, $Vs, vsub_hi, $Vt, vsub_lo)>; @@ -50,9 +50,9 @@ def Combineq: OutPatFrag<(ops node:$Qs, node:$Qt), (V6_vandvrt (V6_vor (V6_vror (V6_vpackeb (V6_vd0), (Q2V $Qs)), - (A2_tfrsi (HwLen2 (i32 0)))), // Half the vector length + (ToI32 (HwLen2 (i32 0)))), // Half the vector length (V6_vpackeb (V6_vd0), (Q2V $Qt))), - (A2_tfrsi -1))>; + (ToI32 -1))>; def LoVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_lo)>; def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>; @@ -74,6 +74,23 @@ def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>; def VZxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackub $Vs)>; def VZxth: OutPatFrag<(ops node:$Vs), (V6_vunpackuh $Vs)>; +class VSubi<InstHexagon VSub, InstHexagon VSplati>: + OutPatFrag<(ops node:$Imm, node:$Vs), (VSub (VSplati (i32 $Imm)), $Vs)>; + +def VSubib: VSubi<V6_vsubb, PS_vsplatib>; +def VSubih: VSubi<V6_vsubh, PS_vsplatih>; +def VSubiw: VSubi<V6_vsubw, PS_vsplatiw>; + +def VNegb: OutPatFrag<(ops node:$Vs), (VSubib 0, $Vs)>; +def VNegh: OutPatFrag<(ops node:$Vs), (VSubih 0, $Vs)>; +def VNegw: OutPatFrag<(ops node:$Vs), (VSubiw 0, $Vs)>; + +class pf3<SDNode Op>: PatFrag<(ops node:$a, node:$b, node:$c), + (Op node:$a, node:$b, node:$c)>; + +def Mfshl: pf3<HexagonMFSHL>; +def Mfshr: pf3<HexagonMFSHR>; + def IsVecOff : PatLeaf<(i32 imm), [{ int32_t V = N->getSExtValue(); int32_t VecSize = HRI->getSpillSize(Hexagon::HvxVRRegClass); @@ -126,7 +143,7 @@ multiclass HvxLdc_pat<InstHexagon MI, PatFrag Load, ValueType ResType> { // Calling "Select" on the 
resulting loads from CP fails without these // patterns. def: Pat<(ResType (Load (HexagonCP tconstpool:$Addr))), - (MI (A2_tfrsi imm:$Addr), 0)>; + (MI (ToI32 imm:$Addr), 0)>; def: Pat<(ResType (Load (HexagonAtPcrel tconstpool:$Addr))), (MI (C4_addipc imm:$Addr), 0)>; } @@ -560,11 +577,11 @@ let Predicates = [UseHVX] in { (V6_vdealb4w (IMPLICIT_DEF), HvxVR:$Vs)>; def: Pat<(VecQ8 (trunc HVI8:$Vs)), - (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>; + (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ16 (trunc HVI16:$Vs)), - (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>; + (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; def: Pat<(VecQ32 (trunc HVI32:$Vs)), - (V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>; + (V6_vandvrt HvxVR:$Vs, (ToI32 0x01010101))>; } let Predicates = [UseHVX] in { @@ -572,19 +589,19 @@ let Predicates = [UseHVX] in { // definitions for them, but they are length-specific. let Predicates = [UseHVX,UseHVX64B] in { def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v32i8)), - (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>; + (V6_vasrh (V6_vaslh HVI16:$Vs, (ToI32 8)), (ToI32 8))>; def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i8)), - (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>; + (V6_vasrw (V6_vaslw HVI32:$Vs, (ToI32 24)), (ToI32 24))>; def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v16i16)), - (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; + (V6_vasrw (V6_vaslw HVI32:$Vs, (ToI32 16)), (ToI32 16))>; } let Predicates = [UseHVX,UseHVX128B] in { def: Pat<(VecI16 (sext_inreg HVI16:$Vs, v64i8)), - (V6_vasrh (V6_vaslh HVI16:$Vs, (A2_tfrsi 8)), (A2_tfrsi 8))>; + (V6_vasrh (V6_vaslh HVI16:$Vs, (ToI32 8)), (ToI32 8))>; def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i8)), - (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 24)), (A2_tfrsi 24))>; + (V6_vasrw (V6_vaslw HVI32:$Vs, (ToI32 24)), (ToI32 24))>; def: Pat<(VecI32 (sext_inreg HVI32:$Vs, v32i16)), - (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; + (V6_vasrw (V6_vaslw HVI32:$Vs, (ToI32 16)), (ToI32 16))>; } // Take a pair of vectors Vt:Vs and shift them towards LSB by (Rt & HwLen). @@ -634,6 +651,88 @@ let Predicates = [UseHVX] in { def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>; def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>; + // Mfshl hi, lo, amt + def: Pat<(Mfshl HVI8:$Vu, HVI8:$Vv, HVI8:$Vs), + (V6_vshuffob (V6_vaslhv (HiVec (V6_vshufoeb $Vu, $Vv)), + (HiVec (V6_vzb $Vs))), + (V6_vaslhv (LoVec (V6_vshufoeb $Vu, $Vv)), + (LoVec (V6_vzb $Vs))))>; + let Predicates = [UseHVX,UseHVXV60] in { + // V60 doesn't produce 0 on shifts by bitwidth, e.g. Vv.h << 16-0 + def: Pat<(Mfshl HVI16:$Vu, HVI16:$Vv, HVI16:$Vs), + (V6_vmux (V6_veqh $Vs, (V6_vd0)), + $Vu, + (V6_vor (V6_vaslhv $Vu, $Vs), + (V6_vlsrhv $Vv, (VSubih 16, $Vs))))>; + def: Pat<(Mfshl HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (V6_vmux (V6_veqw (V6_vand $Vs, (PS_vsplatiw (i32 31))), (V6_vd0)), + $Vu, + (V6_vor (V6_vaslwv $Vu, $Vs), + (V6_vlsrwv $Vv, (VSubiw 32, $Vs))))>; + } + let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in { + // Do it as (Vu << Vs) | (Vv >> (BW-Vs)). + // For Vs == 0 becomes Vu | (Vv >> -BW), since the shift amount is + // sign-extended. Then this becomes Vu | (Vv << BW) == Vu. 
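+    // Concretely, for halfwords: BW-Vs with Vs == 0 is 16 = 0b10000, which
+    // sign-extends from the low 5 bits to -16, so the "right shift by 16"
+    // acts as a left shift by 16 and produces 0, leaving just Vu. 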
+ def: Pat<(Mfshl HVI16:$Vu, HVI16:$Vv, HVI16:$Vs), + (V6_vor (V6_vaslhv $Vu, $Vs), + (V6_vlsrhv $Vv, (VSubih 16, $Vs)))>; + def: Pat<(Mfshl HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (V6_vor (V6_vaslwv $Vu, $Vs), + (V6_vlsrwv $Vv, (VSubiw 32, $Vs)))>; + } + let Predicates = [UseHVX,UseHVXV66], AddedComplexity = 20 in { + // Assume Vs > 0 (and within bit width) + // Vx[1]:Vx[0] = V6_vasr_into Vx[0], Vv, Vs + // --> (Vx[0]:Vx[0] & (ffffffff << -Vs)) | (Vv:00000000 << -Vs) + // i.e. Vx[1] = insert ((Vv << -Vs) -> Vx[0]) + def: Pat<(Mfshl HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (HiVec (V6_vasr_into (Combinev (VecI32 (IMPLICIT_DEF)), + (V6_vlsrwv $Vv, (VSubiw 32, $Vs))), + $Vu, + (V6_vsubw (V6_vd0), $Vs)))>; + } + + // Mfshr hi, lo, amt + def: Pat<(Mfshr HVI8:$Vu, HVI8:$Vv, HVI8:$Vs), + (V6_vshuffeb (V6_vlsrhv (HiVec (V6_vshufoeb $Vu, $Vv)), + (HiVec (V6_vzb $Vs))), + (V6_vlsrhv (LoVec (V6_vshufoeb $Vu, $Vv)), + (LoVec (V6_vzb $Vs))))>; + let Predicates = [UseHVX,UseHVXV60] in { + def: Pat<(Mfshr HVI16:$Vu, HVI16:$Vv, HVI16:$Vs), + (V6_vmux (V6_veqh $Vs, (V6_vd0)), + $Vv, + (V6_vor (V6_vaslhv $Vu, (VSubih 16, $Vs)), + (V6_vlsrhv $Vv, $Vs)))>; + def: Pat<(Mfshr HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (V6_vmux (V6_veqw $Vs, (V6_vd0)), + $Vv, + (V6_vor (V6_vaslwv $Vu, (VSubiw 32, $Vs)), + (V6_vlsrwv $Vv, $Vs)))>; + } + let Predicates = [UseHVX,UseHVXV62], AddedComplexity = 10 in { + // Do it as (Vu >> -(BW-Vs)) | (Vv >> Vs). + // For Vs == 0 becomes (Vu << BW) | Vv == 0 | Vv + def: Pat<(Mfshr HVI16:$Vu, HVI16:$Vv, HVI16:$Vs), + (V6_vor (V6_vlsrhv $Vu, (V6_vsubh $Vs, (PS_vsplatih (i32 16)))), + (V6_vlsrhv $Vv, $Vs))>; + def: Pat<(Mfshr HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (V6_vor (V6_vlsrwv $Vu, (V6_vsubw $Vs, (PS_vsplatiw (i32 32)))), + (V6_vlsrwv $Vv, $Vs))>; + } + let Predicates = [UseHVX,UseHVXV66], AddedComplexity = 20 in { + // Assume Vs > 0 (and within bit width) + // Vx[1]:Vx[0] = V6_vasr_into Vx[0], Vv, Vs + // --> (Vx[0]:Vx[0] & (ffffffff >> Vs)) | (Vv:00000000 >> Vs) + // i.e. 
Vx[0] = insert ((Vv >> Vs) -> Vx[0]) + def: Pat<(Mfshr HVI32:$Vu, HVI32:$Vv, HVI32:$Vs), + (LoVec (V6_vasr_into (Combinev (VecI32 (IMPLICIT_DEF)), + (V6_vlsrwv $Vv, $Vs)), + $Vu, + $Vs))>; + } + def: Pat<(VecI16 (bswap HVI16:$Vs)), (V6_vdelta HvxVR:$Vs, (PS_vsplatib (i32 0x01)))>; def: Pat<(VecI32 (bswap HVI32:$Vs)), @@ -669,7 +768,7 @@ let Predicates = [UseHVX] in { def: HvxSel_pat; } -def V2Q: OutPatFrag<(ops node:$Vs), (V6_vandvrt $Vs, (A2_tfrsi -1))>; +def V2Q: OutPatFrag<(ops node:$Vs), (V6_vandvrt $Vs, (ToI32 -1))>; let Predicates = [UseHVX] in { def: Pat<(select I1:$Pu, VecQ8:$Qs, VecQ8:$Qt), diff --git a/llvm/test/CodeGen/Hexagon/autohvx/funnel-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/funnel-128b.ll new file mode 100644 index 0000000..4a9bfab --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/funnel-128b.ll @@ -0,0 +1,1291 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=hexagon -mattr=+hvxv60,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V60 %s +; RUN: llc -march=hexagon -mattr=+hvxv62,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V62 %s +; RUN: llc -march=hexagon -mattr=+hvxv66,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V66 %s + +define <128 x i8> @f0(<128 x i8> %a0, <128 x i8> %a1, i8 %a2) #0 { +; V60-LABEL: f0: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = vsplatb(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##117901063 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uh = vzxt(v2.ub) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vasl(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vasl(v1.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f0: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r2 = #7 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.b = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.b = vsplat(r2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2.uh = vzxt(v2.ub) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vasl(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vasl(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f0: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r2 = #7 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.b = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.b = vsplat(r2) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.uh = vzxt(v2.ub) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vasl(v0.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vasl(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <128 x i8> undef, i8 %a2, i32 0 + %v1 = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> zeroinitializer + %v2 = call <128 x i8> @llvm.fshl.v128i8(<128 
x i8> %a0, <128 x i8> %a1, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +define <64 x i16> @f1(<64 x i16> %a0, <64 x i16> %a1, i16 %a2) #0 { +; V60-LABEL: f1: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = combine(r0.l,r0.l) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##983055 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r2 = ##1048592 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v30 = vsplat(r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.h = vsub(v30.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v2.h,v4.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.h = vasl(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vlsr(v1.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vor(v31,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v0,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f1: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r3:2 = combine(#16,#15) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.h = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsplat(r2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4.h = vsplat(r3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsub(v4.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vasl(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f1: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r3:2 = combine(#16,#15) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.h = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsplat(r2) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.h = vsplat(r3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsub(v4.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vasl(v0.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = vor(v0,v1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <64 x i16> undef, i16 %a2, i32 0 + %v1 = shufflevector <64 x i16> %v0, <64 x i16> undef, <64 x i32> zeroinitializer + %v2 = call <64 x i16> @llvm.fshl.v64i16(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %v1) + ret <64 x i16> %v2 +} + +define <32 x i32> @f2(<32 x i32> %a0, <32 x i32> %a1, i32 %a2) #0 { +; V60-LABEL: f2: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r3:2 = combine(#31,#32) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r0 = and(r0,#31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vxor(v2,v2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4.w = vsub(v4.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31 = vand(v3,v5) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.w = vasl(v0.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.w = vlsr(v1.w,v4.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v31.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vor(v3,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 
= vmux(q0,v0,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f2: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r0 = and(r0,#31) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: r1 = #32 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3 = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.w = vsub(v3.w,v2.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.w = vasl(v0.w,v2.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.w = vlsr(v1.w,v3.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f2: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r0 = and(r0,#31) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: r1 = #32 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4 = vxor(v4,v4) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3 = vsplat(r1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.w = vsub(v3.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.w = vsub(v4.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.w = vlsr(v1.w,v3.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v5:4.w = vasrinto(v0.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = v5 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <32 x i32> undef, i32 %a2, i32 0 + %v1 = shufflevector <32 x i32> %v0, <32 x i32> undef, <32 x i32> zeroinitializer + %v2 = call <32 x i32> @llvm.fshl.v32i32(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %v1) + ret <32 x i32> %v2 +} + +define <128 x i8> @f3(<128 x i8> %a0, <128 x i8> %a1, i8 %a2) #0 { +; V60-LABEL: f3: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = vsplatb(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##117901063 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uh = vzxt(v2.ub) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vlsr(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vlsr(v1.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f3: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r2 = #7 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.b = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.b = vsplat(r2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2.uh = vzxt(v2.ub) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vlsr(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f3: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r2 = #7 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.b = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.b = vsplat(r2) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.uh = vzxt(v2.ub) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vlsr(v0.h,v2.h) +; V66-NEXT: } +; 
V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <128 x i8> undef, i8 %a2, i32 0 + %v1 = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> zeroinitializer + %v2 = call <128 x i8> @llvm.fshr.v128i8(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +define <64 x i16> @f4(<64 x i16> %a0, <64 x i16> %a1, i16 %a2) #0 { +; V60-LABEL: f4: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = combine(r0.l,r0.l) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##983055 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r2 = ##1048592 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v30 = vsplat(r2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3.h = vsub(v30.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v2.h,v4.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.h = vlsr(v1.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vasl(v0.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vor(v0,v31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v1,v0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f4: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r3:2 = combine(#16,#15) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2.h = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsplat(r2) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4.h = vsplat(r3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsub(v2.h,v4.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vlsr(v0.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f4: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r3:2 = combine(#16,#15) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.h = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsplat(r2) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.h = vsplat(r3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsub(v2.h,v4.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vlsr(v0.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = vor(v0,v1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <64 x i16> undef, i16 %a2, i32 0 + %v1 = shufflevector <64 x i16> %v0, <64 x i16> undef, <64 x i32> zeroinitializer + %v2 = call <64 x i16> @llvm.fshr.v64i16(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %v1) + ret <64 x i16> %v2 +} + +define <32 x i32> @f5(<32 x i32> %a0, <32 x i32> %a1, i32 %a2) #0 { +; V60-LABEL: f5: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = and(r0,#31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = #32 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vxor(v2,v2) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4.w = vsub(v4.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v3.w,v2.w) +; 
V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.w = vlsr(v1.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vasl(v0.w,v4.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vor(v0,v31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v1,v0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f5: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r0 = and(r0,#31) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: r1 = #32 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3 = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.w = vsub(v2.w,v3.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.w = vlsr(v1.w,v2.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.w = vlsr(v0.w,v3.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f5: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r0 = and(r0,#31) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3 = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.w = vlsr(v1.w,v3.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.w = vasrinto(v0.w,v3.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = v2 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = insertelement <32 x i32> undef, i32 %a2, i32 0 + %v1 = shufflevector <32 x i32> %v0, <32 x i32> undef, <32 x i32> zeroinitializer + %v2 = call <32 x i32> @llvm.fshr.v32i32(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %v1) + ret <32 x i32> %v2 +} + +define <128 x i8> @f6(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2) #0 { +; V60-LABEL: f6: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##117901063 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uh = vzxt(v2.ub) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vasl(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vasl(v1.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f6: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r0 = #7 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.b = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2.uh = vzxt(v2.ub) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vasl(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vasl(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f6: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r0 = #7 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.b = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.uh = vzxt(v2.ub) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vasl(v0.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vasl(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.b = vshuffo(v1.b,v0.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <128 x i8> @llvm.fshl.v128i8(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2) + ret <128 x i8> %v0 +} 
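 +
+; f6 above and f7-f11 below pass the shift amount as a full vector rather
+; than a splatted scalar, so they exercise the MFSHL/MFSHR patterns with a
+; genuinely per-element shift amount. 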
+ +define <64 x i16> @f7(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %a2) #0 { +; V60-LABEL: f7: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##983055 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##1048592 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vxor(v3,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4.h = vsub(v5.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v2.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.h = vasl(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vlsr(v1.h,v4.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vor(v31,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v0,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f7: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r1:0 = combine(#16,#15) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4.h = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsub(v4.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vasl(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f7: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r1:0 = combine(#16,#15) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.h = vsplat(r1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsub(v4.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vasl(v0.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = vor(v0,v1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <64 x i16> @llvm.fshl.v64i16(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %a2) + ret <64 x i16> %v0 +} + +define <32 x i32> @f8(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %a2) #0 { +; V60-LABEL: f8: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r1:0 = combine(#32,#31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vxor(v3,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5.w = vsub(v5.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vand(v2,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2.w = vasl(v0.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v4.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.w = vlsr(v1.w,v5.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1 = vor(v2,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v0,v1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f8: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r1:0 = combine(#32,#31) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3 = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4 = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.w = vsub(v4.w,v2.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.w = vasl(v0.w,v2.w) +; V62-NEXT: } 
+; V62-NEXT: { +; V62-NEXT: v1.w = vlsr(v1.w,v3.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f8: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r1:0 = combine(#32,#31) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v31 = vxor(v31,v31) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3 = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4 = vsplat(r1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.w = vsub(v4.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.w = vsub(v31.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.w = vlsr(v1.w,v4.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v5:4.w = vasrinto(v0.w,v2.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = v5 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <32 x i32> @llvm.fshl.v32i32(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %a2) + ret <32 x i32> %v0 +} + +define <128 x i8> @f9(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2) #0 { +; V60-LABEL: f9: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##117901063 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3:2.uh = vzxt(v2.ub) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vlsr(v0.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v1.h = vlsr(v1.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f9: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r0 = #7 +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.b = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3:2.uh = vzxt(v2.ub) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vlsr(v0.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f9: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r0 = #7 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1:0.b = vshuffoe(v0.b,v1.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.b = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.uh = vzxt(v2.ub) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vlsr(v0.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.b = vshuffe(v1.b,v0.b) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <128 x i8> @llvm.fshr.v128i8(<128 x i8> %a0, <128 x i8> %a1, <128 x i8> %a2) + ret <128 x i8> %v0 +} + +define <64 x i16> @f10(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %a2) #0 { +; V60-LABEL: f10: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r0 = ##983055 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: r1 = ##1048592 +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vxor(v3,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4.h = vsub(v5.h,v2.h) +; V60-NEXT: } +; 
V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v2.h,v3.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.h = vlsr(v1.h,v2.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.h = vasl(v0.h,v4.h) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vor(v0,v31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v1,v0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f10: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r1:0 = combine(#16,#15) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4.h = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.h = vsub(v2.h,v4.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.h = vlsr(v1.h,v2.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.h = vlsr(v0.h,v3.h) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f10: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r1:0 = combine(#16,#15) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v4.h = vsplat(r1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3.h = vsub(v2.h,v4.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v1.h = vlsr(v1.h,v2.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0.h = vlsr(v0.h,v3.h) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = vor(v0,v1) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <64 x i16> @llvm.fshr.v64i16(<64 x i16> %a0, <64 x i16> %a1, <64 x i16> %a2) + ret <64 x i16> %v0 +} + +define <32 x i32> @f11(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %a2) #0 { +; V60-LABEL: f11: +; V60: // %bb.0: +; V60-NEXT: { +; V60-NEXT: r1:0 = combine(#32,#31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v3 = vxor(v3,v3) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4 = vsplat(r0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v5 = vsplat(r1) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v2 = vand(v2,v4) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v4.w = vsub(v5.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: q0 = vcmp.eq(v2.w,v3.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v31.w = vlsr(v1.w,v2.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0.w = vasl(v0.w,v4.w) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vor(v0,v31) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: v0 = vmux(q0,v1,v0) +; V60-NEXT: } +; V60-NEXT: { +; V60-NEXT: jumpr r31 +; V60-NEXT: } +; +; V62-LABEL: f11: +; V62: // %bb.0: +; V62-NEXT: { +; V62-NEXT: r1:0 = combine(#32,#31) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3 = vsplat(r0) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v4 = vsplat(r1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v2 = vand(v2,v3) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v3.w = vsub(v2.w,v4.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v1.w = vlsr(v1.w,v2.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0.w = vlsr(v0.w,v3.w) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: v0 = vor(v0,v1) +; V62-NEXT: } +; V62-NEXT: { +; V62-NEXT: jumpr r31 +; V62-NEXT: } +; +; V66-LABEL: f11: +; V66: // %bb.0: +; V66-NEXT: { +; V66-NEXT: r0 = #31 +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3 = vsplat(r0) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3 = vand(v2,v3) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v2.w = vlsr(v1.w,v3.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v3:2.w = vasrinto(v0.w,v3.w) +; V66-NEXT: } +; V66-NEXT: { +; V66-NEXT: v0 = v2 +; V66-NEXT: } +; 
V66-NEXT: { +; V66-NEXT: jumpr r31 +; V66-NEXT: } + %v0 = call <32 x i32> @llvm.fshr.v32i32(<32 x i32> %a0, <32 x i32> %a1, <32 x i32> %a2) + ret <32 x i32> %v0 +} + +declare <128 x i8> @llvm.fshl.v128i8(<128 x i8>, <128 x i8>, <128 x i8>) +declare <128 x i8> @llvm.fshr.v128i8(<128 x i8>, <128 x i8>, <128 x i8>) +declare <64 x i16> @llvm.fshl.v64i16(<64 x i16>, <64 x i16>, <64 x i16>) +declare <64 x i16> @llvm.fshr.v64i16(<64 x i16>, <64 x i16>, <64 x i16>) +declare <32 x i32> @llvm.fshl.v32i32(<32 x i32>, <32 x i32>, <32 x i32>) +declare <32 x i32> @llvm.fshr.v32i32(<32 x i32>, <32 x i32>, <32 x i32>) + +attributes #0 = { nounwind } -- 2.7.4
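
Postscript: a minimal scalar sketch (illustrative C++, not part of the patch) of the halfword case the V62 patterns rely on. The helper names vlsrh and fshl16 are invented here, and a 32-bit int is assumed; the point is why LowerHvxFunnelShift only has to mask the funnel-shift amount to [0, BW-1]: for amount 0, the complementary shift by BW is read by the bidirectional shifter as -BW and shifts all bits out.

#include <cassert>
#include <cstdint>

// Bidirectional logical right shift on one halfword: the shift amount is
// sign-extended from its low 5 bits, and a negative amount shifts left.
static uint16_t vlsrh(uint16_t v, int s) {
  int a = ((s & 31) ^ 16) - 16;           // sign-extend the low 5 bits
  if (a >= 0)
    return v >> a;                        // a in [0, 15]
  return (uint16_t)((uint32_t)v << -a);   // -a in [1, 16]; 16 clears all bits
}

// One element of fshl on i16, lowered as in the V62 pattern:
// (u << s) | (v >> (16 - s)), with s pre-masked to [0, 15].
static uint16_t fshl16(uint16_t u, uint16_t v, int s) {
  s &= 15;
  return (uint16_t)(u << s) | vlsrh(v, 16 - s);
}

int main() {
  assert(fshl16(0x1234, 0xABCD, 4) == 0x234A); // ordinary funnel shift
  assert(fshl16(0x1234, 0xABCD, 0) == 0x1234); // s == 0: the v term vanishes
  return 0;
}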