From: Yeting Kuo
Date: Mon, 9 Jan 2023 13:54:22 +0000 (+0800)
Subject: [RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion of i32/i64 vectors to f32...
X-Git-Tag: upstream/17.0.6~21265
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5280d3e7384835bb6ee797def32c98f30afaee98;p=platform%2Fupstream%2Fllvm.git

[RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion of i32/i64 vectors to f32 vectors.

Previously, lowerCTLZ_CTTZ_ZERO_UNDEF converted the source to a floating-point
value with ISD::UINT_TO_FP. ISD::UINT_TO_FP uses the dynamic rounding mode, so
rounding could make the exponent of the result differ from the expected value
when converting i32/i64 to f32. This is why lowerCTLZ_CTTZ_ZERO_UNDEF was
constrained to handle only an i32 source, and only when the f64 vector type
with the same element count as the source is legal.

This patch teaches lowerCTLZ_CTTZ_ZERO_UNDEF to convert i32/i64 vectors to f32
vectors with vfcvt.f.xu.v using the RTZ rounding mode. RTZ ensures the exponent
of the result is correct, even though f32 cannot represent every i32/i64 value
exactly.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D140782
---
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 169ff9d..5c8bd22 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -677,16 +677,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Splice setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); - // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point - // type that can represent the value exactly. - if (VT.getVectorElementType() != MVT::i64) { - MVT FloatEltVT = - VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32; - EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount()); - if (isTypeLegal(FloatVT)) { - setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, - Custom); - } + // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the range + // of f32. + EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + if (isTypeLegal(FloatVT)) { + setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + Custom); } } @@ -912,17 +908,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(IntegerVPOps, VT, Custom); - // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point - // type that can represent the value exactly. - if (VT.getVectorElementType() != MVT::i64) { - MVT FloatEltVT = - VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32; - EVT FloatVT = - MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount()); - if (isTypeLegal(FloatVT)) - setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, - Custom); - } + // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the + // range of f32. + EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + if (isTypeLegal(FloatVT)) + setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT, + Custom); } for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) { @@ -3535,15 +3526,20 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting // the exponent.
-static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +SDValue +RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, + SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); SDValue Src = Op.getOperand(0); SDLoc DL(Op); - // We need a FP type that can represent the value. + // We choose FP type that can represent the value if possible. Otherwise, we + // use rounding to zero conversion for correct exponent of the result. // TODO: Use f16 for i8 when possible? - MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32; + MVT FloatEltVT = (EltSize >= 32) ? MVT::f64 : MVT::f32; + if (!isTypeLegal(MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount()))) + FloatEltVT = MVT::f32; MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount()); // Legal types should have been checked in the RISCVTargetLowering @@ -3560,27 +3556,50 @@ static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { } // We have a legal FP type, convert to it. - SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src); + SDValue FloatVal; + if (FloatVT.bitsGT(VT)) { + FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src); + } else { + // Use RTZ to avoid rounding influencing exponent of FloatVal. + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget); + } + + auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + SDValue RTZRM = + DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT()); + MVT ContainerFloatVT = + MVT::getVectorVT(FloatEltVT, ContainerVT.getVectorElementCount()); + FloatVal = DAG.getNode(RISCVISD::VFCVT_RM_F_XU_VL, DL, ContainerFloatVT, + Src, Mask, RTZRM, VL); + if (VT.isFixedLengthVector()) + FloatVal = convertFromScalableVector(FloatVT, FloatVal, DAG, Subtarget); + } // Bitcast to integer and shift the exponent to the LSB. EVT IntVT = FloatVT.changeVectorElementTypeToInteger(); SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal); unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23; - SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast, - DAG.getConstant(ShiftAmt, DL, IntVT)); - // Truncate back to original type to allow vnsrl. - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift); + SDValue Exp = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast, + DAG.getConstant(ShiftAmt, DL, IntVT)); + // Restore back to original type. Truncation after SRL is to generate vnsrl. + if (IntVT.bitsLT(VT)) + Exp = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Exp); + else if (IntVT.bitsGT(VT)) + Exp = DAG.getNode(ISD::TRUNCATE, DL, VT, Exp); // The exponent contains log2 of the value in biased form. unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127; // For trailing zeros, we just need to subtract the bias. if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) - return DAG.getNode(ISD::SUB, DL, VT, Trunc, + return DAG.getNode(ISD::SUB, DL, VT, Exp, DAG.getConstant(ExponentBias, DL, VT)); // For leading zeros, we need to remove the bias and convert from log2 to // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)). 
unsigned Adjust = ExponentBias + (EltSize - 1); - return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp); } // While RVV has alignment restrictions, we should always be able to load as a @@ -11571,6 +11590,28 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF2_MASK); case RISCV::PseudoVFCVT_RM_X_F_V_MF4_MASK: return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF4_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_M1_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M1_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_M2_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M2_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_M4_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M4_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_M8_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M8_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_MF2_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF2_MASK); + case RISCV::PseudoVFCVT_RM_F_XU_V_MF4_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF4_MASK); + case RISCV::PseudoVFNCVT_RM_F_XU_W_M1_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M1_MASK); + case RISCV::PseudoVFNCVT_RM_F_XU_W_M2_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M2_MASK); + case RISCV::PseudoVFNCVT_RM_F_XU_W_M4_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M4_MASK); + case RISCV::PseudoVFNCVT_RM_F_XU_W_MF2_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF2_MASK); + case RISCV::PseudoVFNCVT_RM_F_XU_W_MF4_MASK: + return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF4_MASK); case RISCV::PseudoVFROUND_NOEXCEPT_V_M1_MASK: return emitVFROUND_NOEXCEPT_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_M1_MASK, RISCV::PseudoVFCVT_F_X_V_M1_MASK); @@ -13167,6 +13208,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VFROUND_NOEXCEPT_VL) NODE_NAME_CASE(SINT_TO_FP_VL) NODE_NAME_CASE(UINT_TO_FP_VL) + NODE_NAME_CASE(VFCVT_RM_F_XU_VL) NODE_NAME_CASE(FP_EXTEND_VL) NODE_NAME_CASE(FP_ROUND_VL) NODE_NAME_CASE(VWMUL_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 883715a..3de2e4d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -243,6 +243,7 @@ enum NodeType : unsigned { VFCVT_RM_X_F_VL, // Has a rounding mode operand. SINT_TO_FP_VL, UINT_TO_FP_VL, + VFCVT_RM_F_XU_VL, // Has a rounding mode operand. 
FP_ROUND_VL, FP_EXTEND_VL, @@ -704,6 +705,7 @@ private: SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const; SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 291fdd9..f2d2204 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3406,6 +3406,17 @@ multiclass VPseudoVCVTF_V { } } +multiclass VPseudoVCVTF_RM_V { + foreach m = MxListF in { + defvar mx = m.MX; + defvar WriteVFCvtIToFV_MX = !cast("WriteVFCvtIToFV_" # mx); + defvar ReadVFCvtIToFV_MX = !cast("ReadVFCvtIToFV_" # mx); + + defm _V : VPseudoConversionRM, + Sched<[WriteVFCvtIToFV_MX, ReadVFCvtIToFV_MX, ReadVMask]>; + } +} + multiclass VPseudoConversionW_V { defvar constraint = "@earlyclobber $rd"; foreach m = MxListW in @@ -3472,6 +3483,18 @@ multiclass VPseudoVNCVTF_W { } } +multiclass VPseudoVNCVTF_RM_W { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxListFW in { + defvar mx = m.MX; + defvar WriteVFNCvtIToFV_MX = !cast("WriteVFNCvtIToFV_" # mx); + defvar ReadVFNCvtIToFV_MX = !cast("ReadVFNCvtIToFV_" # mx); + + defm _W : VPseudoConversionRM, + Sched<[WriteVFNCvtIToFV_MX, ReadVFNCvtIToFV_MX, ReadVMask]>; + } +} + multiclass VPseudoVNCVTD_W { defvar constraint = "@earlyclobber $rd"; foreach m = MxListFW in { @@ -5495,6 +5518,7 @@ let Uses = [FRM] in { defm PseudoVFCVT_F_XU : VPseudoVCVTF_V; defm PseudoVFCVT_F_X : VPseudoVCVTF_V; } +defm PseudoVFCVT_RM_F_XU : VPseudoVCVTF_RM_V; } // mayRaiseFPException = true //===----------------------------------------------------------------------===// @@ -5528,6 +5552,7 @@ defm PseudoVFNCVT_F_X : VPseudoVNCVTF_W; defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W; } defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W; +defm PseudoVFNCVT_RM_F_XU : VPseudoVNCVTF_RM_W; } // mayRaiseFPException = true } // Predicates = [HasVInstructionsAnyF] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 09b94b2..bbb55f8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -140,11 +140,17 @@ def SDT_RISCVI2FPOp_VL : SDTypeProfile<1, 3, [ SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT> ]>; +def SDT_RISCVI2FPOp_RM_VL : SDTypeProfile<1, 4, [ + SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>, + SDTCisVT<4, XLenVT> +]>; def riscv_vfcvt_rtz_x_f_vl : SDNode<"RISCVISD::VFCVT_RTZ_X_F_VL", SDT_RISCVFP2IOp_VL>; def riscv_vfcvt_rtz_xu_f_vl : SDNode<"RISCVISD::VFCVT_RTZ_XU_F_VL", SDT_RISCVFP2IOp_VL>; def riscv_sint_to_fp_vl : SDNode<"RISCVISD::SINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>; def riscv_uint_to_fp_vl : SDNode<"RISCVISD::UINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>; +def riscv_vfcvt_rm_f_xu_vl : SDNode<"RISCVISD::VFCVT_RM_F_XU_VL", SDT_RISCVI2FPOp_RM_VL>; def SDT_RISCVVecCvtF2XOp_VL : SDTypeProfile<1, 4, [ SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>, @@ -796,6 +802,18 @@ multiclass VPatConvertI2FPVL_V { } } +multiclass VPatConvertI2FP_RM_VL_V { + foreach fvti = AllFloatVectors in { + defvar ivti = GetIntVTypeInfo.Vti; + def : 
Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1), + (ivti.Mask V0), (XLenVT timm:$frm), + VLOpFrag)), + (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, + (ivti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; + } +} + multiclass VPatWConvertFP2IVL_V { foreach fvtiToFWti = AllWidenableFloatVectors in { defvar fvti = fvtiToFWti.Vti; @@ -848,6 +866,19 @@ multiclass VPatNConvertI2FPVL_V { } } +multiclass VPatNConvertI2FP_RM_VL_V { + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar fvti = fvtiToFWti.Vti; + defvar iwti = GetIntVTypeInfo.Vti; + def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1), + (iwti.Mask V0), (XLenVT timm:$frm), + VLOpFrag)), + (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") + (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1, + (iwti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; + } +} + multiclass VPatReductionVL { foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in { defvar vti_m1 = !cast(!if(is_float, "VF", "VI") # vti.SEW # "M1"); @@ -1713,6 +1744,7 @@ foreach fvti = AllFloatVectors in { defm : VPatConvertFP2IVL_V; defm : VPatConvertI2FPVL_V; defm : VPatConvertI2FPVL_V; + defm : VPatConvertI2FP_RM_VL_V; // 13.18. Widening Floating-Point/Integer Type-Convert Instructions defm : VPatWConvertFP2IVL_V; @@ -1735,6 +1767,8 @@ foreach fvti = AllFloatVectors in { defm : VPatNConvertFP2IVL_V; defm : VPatNConvertI2FPVL_V; defm : VPatNConvertI2FPVL_V; + defm : + VPatNConvertI2FP_RM_VL_V; foreach fvtiToFWti = AllWidenableFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index 27250b4..634d085 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32 ; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64 @@ -29,6 +31,20 @@ define @ctlz_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv1i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vrsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -72,6 +88,20 @@ define @ctlz_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv2i8: +; 
CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vrsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -115,6 +145,20 @@ define @ctlz_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv4i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vrsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -158,6 +202,20 @@ define @ctlz_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv8i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vrsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -201,6 +259,20 @@ define @ctlz_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv16i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 +; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v10, v12, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vrsub.vx v8, v10, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -344,6 +416,18 @@ define @ctlz_nxv1i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv1i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v9, v9, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -427,6 +511,18 @@ define @ctlz_nxv2i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv2i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v9, v9, a0 +; CHECK-F-NEXT: 
vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -510,6 +606,18 @@ define @ctlz_nxv4i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv4i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v9, v9, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -593,6 +701,18 @@ define @ctlz_nxv8i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv8i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 +; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v10, v10, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -676,6 +796,18 @@ define @ctlz_nxv16i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv16i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 +; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v12, v12, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -836,6 +968,21 @@ define @ctlz_nxv1i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv1i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v9, v9, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma @@ -929,6 +1076,21 @@ define @ctlz_nxv2i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv2i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v9, v9, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv2i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma @@ -1022,6 +1184,21 @@ define @ctlz_nxv4i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv4i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi 
a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v10, v10, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma @@ -1115,6 +1292,21 @@ define @ctlz_nxv8i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_nxv8i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v12, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v12, v12, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v12, v12, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma @@ -1136,481 +1328,643 @@ define @ctlz_nxv8i32( %va) { declare @llvm.ctlz.nxv8i32(, i1) define @ctlz_nxv16i32( %va) { -; RV32-LABEL: ctlz_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_nxv16i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v16, v16, a0 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v16, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m8, 
ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_nxv16i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_nxv16i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v16, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v16, v16, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v16, v16, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_nxv16i32: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v8, v0.t +; CHECK-D-NEXT: vsrl.vi v16, v16, 23 +; CHECK-D-NEXT: li a1, 158 +; CHECK-D-NEXT: vrsub.vx v16, v16, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 32 +; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv16i32( %va, i1 false) ret %a } declare @llvm.ctlz.nxv16i32(, i1) define @ctlz_nxv1i64( %va) { -; RV32-LABEL: ctlz_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; 
RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 1 -; RV32-NEXT: vand.vv v9, v11, v9 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: vand.vv v9, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v11 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vmul.vv v8, v8, v10 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_nxv1i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v9, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 1 +; RV32I-NEXT: vand.vv v9, v11, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmul.vv v8, v8, v10 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; 
RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI18_0) -; RV64-NEXT: ld a0, %lo(.LCPI18_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI18_1) -; RV64-NEXT: ld a1, %lo(.LCPI18_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: lui a0, %hi(.LCPI18_2) -; RV64-NEXT: ld a0, %lo(.LCPI18_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI18_3) -; RV64-NEXT: ld a1, %lo(.LCPI18_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_nxv1i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v9, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI18_0) +; RV64I-NEXT: ld a0, %lo(.LCPI18_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI18_1) +; RV64I-NEXT: ld a1, %lo(.LCPI18_1)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI18_2) +; RV64I-NEXT: ld a0, %lo(.LCPI18_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI18_3) +; RV64I-NEXT: ld a1, %lo(.LCPI18_3)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v9 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v9, v10, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_nxv1i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v9, v9, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v9, v9, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv1i64( %va, i1 false) ret %a } declare @llvm.ctlz.nxv1i64(, i1) define @ctlz_nxv2i64( %va) { -; RV32-LABEL: ctlz_nxv2i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw 
a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 1 -; RV32-NEXT: vand.vv v10, v14, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v14 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vmul.vv v8, v8, v12 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_nxv2i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v10, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 1 +; RV32I-NEXT: vand.vv v10, v14, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmul.vv v8, v8, v12 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, 
v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-NEXT: ld a0, %lo(.LCPI19_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI19_1) -; RV64-NEXT: ld a1, %lo(.LCPI19_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: lui a0, %hi(.LCPI19_2) -; RV64-NEXT: ld a0, %lo(.LCPI19_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI19_3) -; RV64-NEXT: ld a1, %lo(.LCPI19_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_nxv2i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v10, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI19_0) +; RV64I-NEXT: ld a0, %lo(.LCPI19_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI19_1) +; RV64I-NEXT: ld a1, %lo(.LCPI19_1)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI19_2) +; RV64I-NEXT: ld a0, %lo(.LCPI19_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI19_3) +; RV64I-NEXT: ld a1, %lo(.LCPI19_3)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v10 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v10, v12, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_nxv2i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v10, v10, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v10, v10, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv2i64( %va, i1 false) ret %a } declare @llvm.ctlz.nxv2i64(, i1) 
define @ctlz_nxv4i64( %va) { -; RV32-LABEL: ctlz_nxv4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 1 -; RV32-NEXT: vand.vv v12, v20, v12 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: vand.vv v12, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v20 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI20_0) -; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI20_1) -; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: lui a0, %hi(.LCPI20_2) -; RV64-NEXT: ld a0, %lo(.LCPI20_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI20_3) -; RV64-NEXT: ld a1, %lo(.LCPI20_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv4i64( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv4i64(, i1) - -define @ctlz_nxv8i64( %va) { -; RV32-LABEL: ctlz_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui 
a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_nxv4i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v12, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 1 +; RV32I-NEXT: vand.vv v12, v20, v12 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v20 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; 
RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI21_0) -; RV64-NEXT: ld a0, %lo(.LCPI21_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI21_1) -; RV64-NEXT: ld a1, %lo(.LCPI21_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: lui a0, %hi(.LCPI21_2) -; RV64-NEXT: ld a0, %lo(.LCPI21_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI21_3) -; RV64-NEXT: ld a1, %lo(.LCPI21_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_nxv4i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v12, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI20_0) +; RV64I-NEXT: ld a0, %lo(.LCPI20_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI20_1) +; RV64I-NEXT: ld a1, %lo(.LCPI20_1)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v12, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI20_2) +; RV64I-NEXT: ld a0, %lo(.LCPI20_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI20_3) +; RV64I-NEXT: ld a1, %lo(.LCPI20_3)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v12, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v16, v12 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v12, v16, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_nxv4i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v12, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v12, v12, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v12, v12, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret + %a = call @llvm.ctlz.nxv4i64( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv4i64(, i1) + +define @ctlz_nxv8i64( %va) { +; RV32I-LABEL: ctlz_nxv8i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 
349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v16, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: vand.vv v16, v0, v16 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v16, v8, v24 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv8i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v16, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI21_0) +; RV64I-NEXT: ld a0, %lo(.LCPI21_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI21_1) +; RV64I-NEXT: ld a1, %lo(.LCPI21_1)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v16, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI21_2) +; RV64I-NEXT: ld a0, %lo(.LCPI21_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI21_3) +; RV64I-NEXT: ld a1, %lo(.LCPI21_3)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v16, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vzext.vf2 v24, v16 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v16, v24, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; 
CHECK-D-LABEL: ctlz_nxv8i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v16, v16, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v16, v16, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv8i64( %va, i1 false) ret %a } @@ -1641,6 +1995,18 @@ define @ctlz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv1i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -1681,6 +2047,18 @@ define @ctlz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv2i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -1721,6 +2099,18 @@ define @ctlz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv4i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -1761,6 +2151,18 @@ define @ctlz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv8i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vrsub.vx v8, v10, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -1801,6 +2203,18 @@ define @ctlz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv16i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vnsrl.wi 
v12, v8, 0 +; CHECK-F-NEXT: li a0, 134 +; CHECK-F-NEXT: vrsub.vx v8, v12, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -1939,6 +2353,15 @@ define @ctlz_zero_undef_nxv1i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv1i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -2018,6 +2441,15 @@ define @ctlz_zero_undef_nxv2i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv2i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -2097,6 +2529,15 @@ define @ctlz_zero_undef_nxv4i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv4i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -2176,6 +2617,15 @@ define @ctlz_zero_undef_nxv8i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv8i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -2255,6 +2705,15 @@ define @ctlz_zero_undef_nxv16i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv16i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-F-NEXT: li a0, 142 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -2410,6 +2869,18 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv1i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma @@ -2499,6 +2970,18 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv2i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: 
vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma @@ -2588,6 +3071,18 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv4i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma @@ -2677,6 +3172,18 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: ctlz_zero_undef_nxv8i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma @@ -2694,477 +3201,609 @@ define @ctlz_zero_undef_nxv8i32( %va) { } define @ctlz_zero_undef_nxv16i32( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv16i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v16, v16, a0 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v16, 
v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv16i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_zero_undef_nxv16i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: li a1, 158 +; CHECK-D-NEXT: vrsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv16i32( %va, i1 true) ret %a } define @ctlz_zero_undef_nxv1i64( %va) { -; RV32-LABEL: 
ctlz_zero_undef_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 1 -; RV32-NEXT: vand.vv v9, v11, v9 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: vand.vv v9, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v11 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vmul.vv v8, v8, v10 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv1i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v9, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 1 +; RV32I-NEXT: vand.vv v9, v11, v9 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmul.vv v8, v8, v10 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv1i64: -; RV64: # %bb.0: -; 
RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI40_0) -; RV64-NEXT: ld a0, %lo(.LCPI40_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI40_1) -; RV64-NEXT: ld a1, %lo(.LCPI40_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: lui a0, %hi(.LCPI40_2) -; RV64-NEXT: ld a0, %lo(.LCPI40_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI40_3) -; RV64-NEXT: ld a1, %lo(.LCPI40_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv1i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v9, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI40_0) +; RV64I-NEXT: ld a0, %lo(.LCPI40_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI40_1) +; RV64I-NEXT: ld a1, %lo(.LCPI40_1)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI40_2) +; RV64I-NEXT: ld a0, %lo(.LCPI40_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI40_3) +; RV64I-NEXT: ld a1, %lo(.LCPI40_3)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v9, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv1i64( %va, i1 true) ret %a } define @ctlz_zero_undef_nxv2i64( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv2i64: -; RV32: # %bb.0: -; 
RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 1 -; RV32-NEXT: vand.vv v10, v14, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v14 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vmul.vv v8, v8, v12 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv2i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v10, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 1 +; RV32I-NEXT: vand.vv v10, v14, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmul.vv v8, v8, v12 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv2i64: -; RV64: # %bb.0: -; 
RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI41_0) -; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI41_1) -; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: lui a0, %hi(.LCPI41_2) -; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI41_3) -; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv2i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v10, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI41_0) +; RV64I-NEXT: ld a0, %lo(.LCPI41_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI41_1) +; RV64I-NEXT: ld a1, %lo(.LCPI41_1)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI41_2) +; RV64I-NEXT: ld a0, %lo(.LCPI41_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI41_3) +; RV64I-NEXT: ld a1, %lo(.LCPI41_3)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v10, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv2i64( %va, i1 true) ret %a } define @ctlz_zero_undef_nxv4i64( %va) { -; RV32-LABEL: 
ctlz_zero_undef_nxv4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 1 -; RV32-NEXT: vand.vv v12, v20, v12 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: vand.vv v12, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v20 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv4i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v12, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 1 +; RV32I-NEXT: vand.vv v12, v20, v12 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v20 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: 
ctlz_zero_undef_nxv4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI42_0) -; RV64-NEXT: ld a0, %lo(.LCPI42_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI42_1) -; RV64-NEXT: ld a1, %lo(.LCPI42_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: lui a0, %hi(.LCPI42_2) -; RV64-NEXT: ld a0, %lo(.LCPI42_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI42_3) -; RV64-NEXT: ld a1, %lo(.LCPI42_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv4i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v12, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI42_0) +; RV64I-NEXT: ld a0, %lo(.LCPI42_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI42_1) +; RV64I-NEXT: ld a1, %lo(.LCPI42_1)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v12, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI42_2) +; RV64I-NEXT: ld a0, %lo(.LCPI42_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI42_3) +; RV64I-NEXT: ld a1, %lo(.LCPI42_3)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v12, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv4i64( %va, i1 true) ret %a } define 
@ctlz_zero_undef_nxv8i64( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv8i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: vsrl.vx v16, v8, a0 +; RV32I-NEXT: vor.vv v8, v8, v16 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: vand.vv v16, v0, v16 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v16, v8, v24 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 
+; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: lui a0, %hi(.LCPI43_0) -; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI43_1) -; RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: lui a0, %hi(.LCPI43_2) -; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI43_3) -; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv8i64: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: vsrl.vx v16, v8, a0 +; RV64I-NEXT: vor.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI43_0) +; RV64I-NEXT: ld a0, %lo(.LCPI43_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI43_1) +; RV64I-NEXT: ld a1, %lo(.LCPI43_1)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v16, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI43_2) +; RV64I-NEXT: ld a0, %lo(.LCPI43_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI43_3) +; RV64I-NEXT: ld a1, %lo(.LCPI43_3)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vzext.vf2 v16, v8 +; CHECK-F-NEXT: li a1, 190 +; CHECK-F-NEXT: vrsub.vx v8, v16, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.ctlz.nxv8i64( 
%va, i1 true) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll index fd02061..439e63a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I -; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32,RV32F +; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64,RV64F +; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32,RV32D +; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64,RV64D define @cttz_nxv1i8( %va) { ; CHECK-ZVE64X-LABEL: cttz_nxv1i8: @@ -26,6 +28,23 @@ define @cttz_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv1i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v10 +; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma @@ -69,6 +88,23 @@ define @cttz_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv2i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v10 +; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma @@ -112,6 +148,23 @@ define @cttz_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv4i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v9, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, 
ma +; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma @@ -155,6 +208,23 @@ define @cttz_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv8i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsub.vx v8, v9, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma @@ -198,6 +268,23 @@ define @cttz_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv16i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v10, v8, v10 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v10 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 +; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v10, v12, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsub.vx v8, v10, a0 +; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -328,6 +415,20 @@ define @cttz_nxv1i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv1i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -403,6 +504,20 @@ define @cttz_nxv2i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv2i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -478,6 +593,20 @@ define @cttz_nxv4i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv4i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, 
v9 +; CHECK-F-NEXT: vnsrl.wi v9, v10, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -553,6 +682,20 @@ define @cttz_nxv8i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv8i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v10, v8, v10 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v10, v10, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -628,6 +771,20 @@ define @cttz_nxv16i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv16i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v12, v8, v12 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 +; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v12, v12, a0 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a0, 16 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -766,6 +923,23 @@ define @cttz_nxv1i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv1i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v9, v9, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma @@ -847,6 +1021,23 @@ define @cttz_nxv2i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv2i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v9, v9, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv2i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma @@ -928,6 +1119,23 @@ define @cttz_nxv4i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv4i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v10, v8, v10 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; 
CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v10, v10, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma @@ -1009,6 +1217,23 @@ define @cttz_nxv8i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_nxv8i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v12, v8, v12 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v12, v12, v0.t +; CHECK-F-NEXT: vsrl.vi v12, v12, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v12, v12, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma @@ -1032,387 +1257,733 @@ define @cttz_nxv8i32( %va) { declare @llvm.cttz.nxv8i32(, i1) define @cttz_nxv16i32( %va) { -; RV32-LABEL: cttz_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_nxv16i32: +; RV32I: # %bb.0: +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v16, v16, a0 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v16, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: 
vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_nxv16i32: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_nxv16i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-F-NEXT: vrsub.vi v16, v8, 0 +; CHECK-F-NEXT: vand.vv v16, v8, v16 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v16, v16, v0.t +; CHECK-F-NEXT: vsrl.vi v16, v16, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v16, v16, a1 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: li a1, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_nxv16i32: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-D-NEXT: vrsub.vi v16, v8, 0 +; CHECK-D-NEXT: vand.vv v16, v8, v16 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16, v0.t +; CHECK-D-NEXT: vsrl.vi v16, v16, 23 +; CHECK-D-NEXT: li a1, 127 +; CHECK-D-NEXT: vsub.vx v16, v16, a1 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: li a1, 32 +; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv16i32( %va, i1 false) ret %a } declare @llvm.cttz.nxv16i32(, i1) define @cttz_nxv1i64( %va) { -; RV32-LABEL: cttz_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 1 -; RV32-NEXT: vand.vv v10, v11, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vlse64.v v10, (a0), zero 
-; RV32-NEXT: vsrl.vi v11, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v11 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vmul.vv v8, v8, v10 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_nxv1i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 1 +; RV32I-NEXT: vand.vv v10, v11, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmul.vv v8, v8, v10 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_nxv1i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: lui a0, %hi(.LCPI18_0) -; RV64-NEXT: ld a0, %lo(.LCPI18_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI18_1) -; RV64-NEXT: ld a1, %lo(.LCPI18_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: lui a0, %hi(.LCPI18_2) -; RV64-NEXT: ld a0, %lo(.LCPI18_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI18_3) -; RV64-NEXT: ld a1, %lo(.LCPI18_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_nxv1i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: lui a0, %hi(.LCPI18_0) +; RV64I-NEXT: ld a0, %lo(.LCPI18_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI18_1) +; RV64I-NEXT: ld a1, %lo(.LCPI18_1)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI18_2) +; RV64I-NEXT: ld a0, %lo(.LCPI18_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI18_3) +; RV64I-NEXT: ld a1, %lo(.LCPI18_3)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; 
RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; RV32F-LABEL: cttz_nxv1i64:
+; RV32F: # %bb.0:
+; RV32F-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32F-NEXT: vmseq.vx v9, v8, zero
+; RV32F-NEXT: vrsub.vi v10, v8, 0
+; RV32F-NEXT: vand.vv v8, v8, v10
+; RV32F-NEXT: vmset.m v0
+; RV32F-NEXT: fsrmi a0, 1
+; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; RV32F-NEXT: vsrl.vi v8, v10, 23
+; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV32F-NEXT: vzext.vf2 v10, v8
+; RV32F-NEXT: li a1, 127
+; RV32F-NEXT: vsub.vx v8, v10, a1
+; RV32F-NEXT: li a1, 64
+; RV32F-NEXT: vmv.v.v v0, v9
+; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT: fsrm a0
+; RV32F-NEXT: ret
+;
+; RV64F-LABEL: cttz_nxv1i64:
+; RV64F: # %bb.0:
+; RV64F-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64F-NEXT: vrsub.vi v9, v8, 0
+; RV64F-NEXT: vand.vv v9, v8, v9
+; RV64F-NEXT: vmset.m v0
+; RV64F-NEXT: fsrmi a0, 1
+; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV64F-NEXT: vfncvt.f.xu.w v10, v9, v0.t
+; RV64F-NEXT: vsrl.vi v9, v10, 23
+; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV64F-NEXT: vzext.vf2 v10, v9
+; RV64F-NEXT: li a1, 127
+; RV64F-NEXT: vsub.vx v9, v10, a1
+; RV64F-NEXT: vmseq.vi v0, v8, 0
+; RV64F-NEXT: li a1, 64
+; RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; RV64F-NEXT: fsrm a0
+; RV64F-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv1i64:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32D-NEXT: vmseq.vx v9, v8, zero
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vmset.m v0
+; RV32D-NEXT: fsrmi a0, 1
+; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT: li a1, 52
+; RV32D-NEXT: vsrl.vx v8, v8, a1
+; RV32D-NEXT: li a1, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a1
+; RV32D-NEXT: li a1, 64
+; RV32D-NEXT: vmv.v.v v0, v9
+; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT: fsrm a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv1i64:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v9, v8, v9
+; RV64D-NEXT: vmset.m v0
+; RV64D-NEXT: fsrmi a0, 1
+; RV64D-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; RV64D-NEXT: li a1, 52
+; RV64D-NEXT: vsrl.vx v9, v9, a1
+; RV64D-NEXT: li a1, 1023
+; RV64D-NEXT: vsub.vx v9, v9, a1
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: li a1, 64
+; RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; RV64D-NEXT: fsrm a0
+; RV64D-NEXT: ret
 %a = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
 ret <vscale x 1 x i64> %a
}
declare <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64>, i1)

define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: cttz_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v12, v14, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT:
vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v14 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vmul.vv v8, v8, v12 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_nxv2i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 1 +; RV32I-NEXT: vand.vv v12, v14, v12 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmul.vv v8, v8, v12 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_nxv2i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-NEXT: ld a0, %lo(.LCPI19_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI19_1) -; RV64-NEXT: ld a1, %lo(.LCPI19_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: lui a0, %hi(.LCPI19_2) -; RV64-NEXT: ld a0, %lo(.LCPI19_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI19_3) -; RV64-NEXT: ld a1, %lo(.LCPI19_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_nxv2i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: lui a0, %hi(.LCPI19_0) +; RV64I-NEXT: ld a0, %lo(.LCPI19_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI19_1) +; RV64I-NEXT: ld a1, %lo(.LCPI19_1)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI19_2) +; RV64I-NEXT: ld a0, %lo(.LCPI19_2)(a0) 
+; RV64I-NEXT: lui a1, %hi(.LCPI19_3) +; RV64I-NEXT: ld a1, %lo(.LCPI19_3)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; RV32F-LABEL: cttz_nxv2i64: +; RV32F: # %bb.0: +; RV32F-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32F-NEXT: vmseq.vx v10, v8, zero +; RV32F-NEXT: vrsub.vi v12, v8, 0 +; RV32F-NEXT: vand.vv v8, v8, v12 +; RV32F-NEXT: vmset.m v0 +; RV32F-NEXT: fsrmi a0, 1 +; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32F-NEXT: vfncvt.f.xu.w v11, v8, v0.t +; RV32F-NEXT: vsrl.vi v8, v11, 23 +; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32F-NEXT: vzext.vf2 v12, v8 +; RV32F-NEXT: li a1, 127 +; RV32F-NEXT: vsub.vx v8, v12, a1 +; RV32F-NEXT: li a1, 64 +; RV32F-NEXT: vmv1r.v v0, v10 +; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: fsrm a0 +; RV32F-NEXT: ret +; +; RV64F-LABEL: cttz_nxv2i64: +; RV64F: # %bb.0: +; RV64F-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64F-NEXT: vrsub.vi v10, v8, 0 +; RV64F-NEXT: vand.vv v10, v8, v10 +; RV64F-NEXT: vmset.m v0 +; RV64F-NEXT: fsrmi a0, 1 +; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64F-NEXT: vfncvt.f.xu.w v12, v10, v0.t +; RV64F-NEXT: vsrl.vi v10, v12, 23 +; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64F-NEXT: vzext.vf2 v12, v10 +; RV64F-NEXT: li a1, 127 +; RV64F-NEXT: vsub.vx v10, v12, a1 +; RV64F-NEXT: vmseq.vi v0, v8, 0 +; RV64F-NEXT: li a1, 64 +; RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 +; RV64F-NEXT: fsrm a0 +; RV64F-NEXT: ret +; +; RV32D-LABEL: cttz_nxv2i64: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32D-NEXT: vmseq.vx v10, v8, zero +; RV32D-NEXT: vrsub.vi v12, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v12 +; RV32D-NEXT: vmset.m v0 +; RV32D-NEXT: fsrmi a0, 1 +; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; RV32D-NEXT: li a1, 52 +; RV32D-NEXT: vsrl.vx v8, v8, a1 +; RV32D-NEXT: li a1, 1023 +; RV32D-NEXT: vsub.vx v8, v8, a1 +; RV32D-NEXT: li a1, 64 +; RV32D-NEXT: vmv1r.v v0, v10 +; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32D-NEXT: fsrm a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv2i64: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v10, v8, v10 +; RV64D-NEXT: vmset.m v0 +; RV64D-NEXT: fsrmi a0, 1 +; RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; RV64D-NEXT: li a1, 52 +; RV64D-NEXT: vsrl.vx v10, v10, a1 +; RV64D-NEXT: li a1, 1023 +; RV64D-NEXT: vsub.vx v10, v10, a1 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: li a1, 64 +; RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; RV64D-NEXT: fsrm a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv2i64( %va, i1 false) ret %a } declare @llvm.cttz.nxv2i64(, i1) define @cttz_nxv4i64( %va) { -; RV32-LABEL: cttz_nxv4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi 
a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 1 -; RV32-NEXT: vand.vv v16, v20, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v20 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_nxv4i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 1 +; RV32I-NEXT: vand.vv v16, v20, v16 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v16, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v20 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_nxv4i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: lui a0, %hi(.LCPI20_0) -; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI20_1) -; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: lui a0, %hi(.LCPI20_2) -; RV64-NEXT: ld a0, %lo(.LCPI20_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI20_3) -; RV64-NEXT: ld a1, %lo(.LCPI20_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_nxv4i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: lui a0, %hi(.LCPI20_0) +; RV64I-NEXT: ld a0, %lo(.LCPI20_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI20_1) +; RV64I-NEXT: ld a1, %lo(.LCPI20_1)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vx v12, v12, a0 +; 
RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v12, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI20_2) +; RV64I-NEXT: ld a0, %lo(.LCPI20_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI20_3) +; RV64I-NEXT: ld a1, %lo(.LCPI20_3)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; RV32F-LABEL: cttz_nxv4i64: +; RV32F: # %bb.0: +; RV32F-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32F-NEXT: vmseq.vx v12, v8, zero +; RV32F-NEXT: vrsub.vi v16, v8, 0 +; RV32F-NEXT: vand.vv v8, v8, v16 +; RV32F-NEXT: vmset.m v0 +; RV32F-NEXT: fsrmi a0, 1 +; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32F-NEXT: vfncvt.f.xu.w v14, v8, v0.t +; RV32F-NEXT: vsrl.vi v8, v14, 23 +; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32F-NEXT: vzext.vf2 v16, v8 +; RV32F-NEXT: li a1, 127 +; RV32F-NEXT: vsub.vx v8, v16, a1 +; RV32F-NEXT: li a1, 64 +; RV32F-NEXT: vmv1r.v v0, v12 +; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: fsrm a0 +; RV32F-NEXT: ret +; +; RV64F-LABEL: cttz_nxv4i64: +; RV64F: # %bb.0: +; RV64F-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64F-NEXT: vrsub.vi v12, v8, 0 +; RV64F-NEXT: vand.vv v12, v8, v12 +; RV64F-NEXT: vmset.m v0 +; RV64F-NEXT: fsrmi a0, 1 +; RV64F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64F-NEXT: vfncvt.f.xu.w v16, v12, v0.t +; RV64F-NEXT: vsrl.vi v12, v16, 23 +; RV64F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64F-NEXT: vzext.vf2 v16, v12 +; RV64F-NEXT: li a1, 127 +; RV64F-NEXT: vsub.vx v12, v16, a1 +; RV64F-NEXT: vmseq.vi v0, v8, 0 +; RV64F-NEXT: li a1, 64 +; RV64F-NEXT: vmerge.vxm v8, v12, a1, v0 +; RV64F-NEXT: fsrm a0 +; RV64F-NEXT: ret +; +; RV32D-LABEL: cttz_nxv4i64: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32D-NEXT: vmseq.vx v12, v8, zero +; RV32D-NEXT: vrsub.vi v16, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v16 +; RV32D-NEXT: vmset.m v0 +; RV32D-NEXT: fsrmi a0, 1 +; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; RV32D-NEXT: li a1, 52 +; RV32D-NEXT: vsrl.vx v8, v8, a1 +; RV32D-NEXT: li a1, 1023 +; RV32D-NEXT: vsub.vx v8, v8, a1 +; RV32D-NEXT: li a1, 64 +; RV32D-NEXT: vmv1r.v v0, v12 +; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32D-NEXT: fsrm a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv4i64: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64D-NEXT: vrsub.vi v12, v8, 0 +; RV64D-NEXT: vand.vv v12, v8, v12 +; RV64D-NEXT: vmset.m v0 +; RV64D-NEXT: fsrmi a0, 1 +; RV64D-NEXT: vfcvt.f.xu.v v12, v12, v0.t +; RV64D-NEXT: li a1, 52 +; RV64D-NEXT: vsrl.vx v12, v12, a1 +; RV64D-NEXT: li a1, 1023 +; RV64D-NEXT: vsub.vx v12, v12, a1 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: li a1, 64 +; RV64D-NEXT: vmerge.vxm v8, v12, a1, v0 +; RV64D-NEXT: fsrm a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv4i64( %va, i1 false) ret %a } declare @llvm.cttz.nxv4i64(, i1) define @cttz_nxv8i64( %va) { -; RV32-LABEL: cttz_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui 
a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v24, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_nxv8i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: vand.vv v24, v0, v24 +; RV32I-NEXT: vsub.vv v8, v8, v24 +; RV32I-NEXT: vand.vv v24, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: lui a0, %hi(.LCPI21_0) -; RV64-NEXT: ld a0, %lo(.LCPI21_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI21_1) -; RV64-NEXT: ld a1, %lo(.LCPI21_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: lui a0, %hi(.LCPI21_2) -; RV64-NEXT: ld a0, %lo(.LCPI21_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI21_3) -; RV64-NEXT: ld a1, %lo(.LCPI21_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_nxv8i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 
+; RV64I-NEXT: lui a0, %hi(.LCPI21_0) +; RV64I-NEXT: ld a0, %lo(.LCPI21_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI21_1) +; RV64I-NEXT: ld a1, %lo(.LCPI21_1)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v16, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI21_2) +; RV64I-NEXT: ld a0, %lo(.LCPI21_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI21_3) +; RV64I-NEXT: ld a1, %lo(.LCPI21_3)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; RV32F-LABEL: cttz_nxv8i64: +; RV32F: # %bb.0: +; RV32F-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32F-NEXT: vmseq.vx v16, v8, zero +; RV32F-NEXT: vrsub.vi v24, v8, 0 +; RV32F-NEXT: vand.vv v8, v8, v24 +; RV32F-NEXT: vmset.m v0 +; RV32F-NEXT: fsrmi a0, 1 +; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32F-NEXT: vfncvt.f.xu.w v20, v8, v0.t +; RV32F-NEXT: vsrl.vi v8, v20, 23 +; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32F-NEXT: vzext.vf2 v24, v8 +; RV32F-NEXT: li a1, 127 +; RV32F-NEXT: vsub.vx v8, v24, a1 +; RV32F-NEXT: li a1, 64 +; RV32F-NEXT: vmv1r.v v0, v16 +; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32F-NEXT: fsrm a0 +; RV32F-NEXT: ret +; +; RV64F-LABEL: cttz_nxv8i64: +; RV64F: # %bb.0: +; RV64F-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64F-NEXT: vrsub.vi v16, v8, 0 +; RV64F-NEXT: vand.vv v16, v8, v16 +; RV64F-NEXT: vmset.m v0 +; RV64F-NEXT: fsrmi a0, 1 +; RV64F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV64F-NEXT: vfncvt.f.xu.w v24, v16, v0.t +; RV64F-NEXT: vsrl.vi v16, v24, 23 +; RV64F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64F-NEXT: vzext.vf2 v24, v16 +; RV64F-NEXT: li a1, 127 +; RV64F-NEXT: vsub.vx v16, v24, a1 +; RV64F-NEXT: vmseq.vi v0, v8, 0 +; RV64F-NEXT: li a1, 64 +; RV64F-NEXT: vmerge.vxm v8, v16, a1, v0 +; RV64F-NEXT: fsrm a0 +; RV64F-NEXT: ret +; +; RV32D-LABEL: cttz_nxv8i64: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32D-NEXT: vmseq.vx v16, v8, zero +; RV32D-NEXT: vrsub.vi v24, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v24 +; RV32D-NEXT: vmset.m v0 +; RV32D-NEXT: fsrmi a0, 1 +; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; RV32D-NEXT: li a1, 52 +; RV32D-NEXT: vsrl.vx v8, v8, a1 +; RV32D-NEXT: li a1, 1023 +; RV32D-NEXT: vsub.vx v8, v8, a1 +; RV32D-NEXT: li a1, 64 +; RV32D-NEXT: vmv1r.v v0, v16 +; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 +; RV32D-NEXT: fsrm a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv8i64: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64D-NEXT: vrsub.vi v16, v8, 0 +; RV64D-NEXT: vand.vv v16, v8, v16 +; RV64D-NEXT: vmset.m v0 +; RV64D-NEXT: fsrmi a0, 1 +; RV64D-NEXT: vfcvt.f.xu.v v16, v16, v0.t +; RV64D-NEXT: li a1, 52 +; RV64D-NEXT: vsrl.vx v16, v16, a1 +; RV64D-NEXT: li a1, 1023 +; RV64D-NEXT: vsub.vx v16, v16, a1 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: li a1, 64 +; RV64D-NEXT: vmerge.vxm v8, v16, a1, v0 +; RV64D-NEXT: fsrm a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv8i64( %va, i1 false) ret %a } @@ -1440,6 +2011,21 @@ define @cttz_zero_undef_nxv1i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv1i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, 
v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma @@ -1480,6 +2066,21 @@ define @cttz_zero_undef_nxv2i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv2i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v8, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma @@ -1520,6 +2121,21 @@ define @cttz_zero_undef_nxv4i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv4i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma @@ -1560,6 +2176,21 @@ define @cttz_zero_undef_nxv8i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv8i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-F-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v10, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma @@ -1600,6 +2231,21 @@ define @cttz_zero_undef_nxv16i8( %va) { ; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15 ; CHECK-ZVE64X-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv16i8: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-F-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v12, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv16i8: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -1725,6 +2371,17 @@ 
define @cttz_zero_undef_nxv1i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv1i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -1796,6 +2453,17 @@ define @cttz_zero_undef_nxv2i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv2i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v9, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -1867,6 +2535,17 @@ define @cttz_zero_undef_nxv4i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv4i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v10, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma @@ -1938,6 +2617,17 @@ define @cttz_zero_undef_nxv8i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv8i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v12, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma @@ -2009,6 +2699,17 @@ define @cttz_zero_undef_nxv16i16( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 8 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv16i16: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v12 +; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8 +; CHECK-F-NEXT: vnsrl.wi v8, v16, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv16i16: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma @@ -2142,6 +2843,20 @@ define @cttz_zero_undef_nxv1i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv1i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma @@ 
-2219,6 +2934,20 @@ define @cttz_zero_undef_nxv2i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv2i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma @@ -2296,6 +3025,20 @@ define @cttz_zero_undef_nxv4i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv4i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma @@ -2373,6 +3116,20 @@ define @cttz_zero_undef_nxv8i32( %va) { ; RV64I-NEXT: vsrl.vi v8, v8, 24 ; RV64I-NEXT: ret ; +; CHECK-F-LABEL: cttz_zero_undef_nxv8i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v12 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i32: ; CHECK-D: # %bb.0: ; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma @@ -2392,383 +3149,539 @@ define @cttz_zero_undef_nxv8i32( %va) { } define @cttz_zero_undef_nxv16i32( %va) { -; RV32-LABEL: cttz_zero_undef_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv16i32: +; RV32I: # %bb.0: +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v16, v16, a0 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v16, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; 
RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv16i32: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_zero_undef_nxv16i32: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-F-NEXT: vrsub.vi v16, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v16 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_zero_undef_nxv16i32: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-D-NEXT: vrsub.vi v16, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v16 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: li a1, 127 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv16i32( %va, i1 true) ret %a } define @cttz_zero_undef_nxv1i64( %va) { -; RV32-LABEL: cttz_zero_undef_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: 
sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 1 -; RV32-NEXT: vand.vv v10, v11, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v11 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vmul.vv v8, v8, v10 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv1i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 1 +; RV32I-NEXT: vand.vv v10, v11, v10 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vlse64.v v9, (a0), zero +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v11, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vmul.vv v8, v8, v10 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv1i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: lui a0, %hi(.LCPI40_0) -; RV64-NEXT: ld a0, %lo(.LCPI40_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI40_1) -; RV64-NEXT: ld a1, %lo(.LCPI40_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: lui a0, %hi(.LCPI40_2) -; RV64-NEXT: ld a0, %lo(.LCPI40_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI40_3) -; RV64-NEXT: ld a1, %lo(.LCPI40_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv1i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: vsub.vx 
v9, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: lui a0, %hi(.LCPI40_0) +; RV64I-NEXT: ld a0, %lo(.LCPI40_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI40_1) +; RV64I-NEXT: ld a1, %lo(.LCPI40_1)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI40_2) +; RV64I-NEXT: ld a0, %lo(.LCPI40_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI40_3) +; RV64I-NEXT: ld a1, %lo(.LCPI40_3)(a1) +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vrsub.vi v9, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v9 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-F-NEXT: vzext.vf2 v9, v8 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v9, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_zero_undef_nxv1i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-D-NEXT: vrsub.vi v9, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v9 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv1i64( %va, i1 true) ret %a } define @cttz_zero_undef_nxv2i64( %va) { -; RV32-LABEL: cttz_zero_undef_nxv2i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 1 -; RV32-NEXT: vand.vv v12, v14, v12 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v14 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vmul.vv v8, v8, v12 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv2i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; 
RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 1 +; RV32I-NEXT: vand.vv v12, v14, v12 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vlse64.v v10, (a0), zero +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v14, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vmul.vv v8, v8, v12 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv2i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: lui a0, %hi(.LCPI41_0) -; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI41_1) -; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: lui a0, %hi(.LCPI41_2) -; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI41_3) -; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv2i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: lui a0, %hi(.LCPI41_0) +; RV64I-NEXT: ld a0, %lo(.LCPI41_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI41_1) +; RV64I-NEXT: ld a1, %lo(.LCPI41_1)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI41_2) +; RV64I-NEXT: ld a0, %lo(.LCPI41_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI41_3) +; RV64I-NEXT: ld a1, %lo(.LCPI41_3)(a1) +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vrsub.vi v10, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v10 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli 
zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-F-NEXT: vzext.vf2 v10, v8 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v10, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_zero_undef_nxv2i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-D-NEXT: vrsub.vi v10, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v10 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv2i64( %va, i1 true) ret %a } define @cttz_zero_undef_nxv4i64( %va) { -; RV32-LABEL: cttz_zero_undef_nxv4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 1 -; RV32-NEXT: vand.vv v16, v20, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v20 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv4i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 1 +; RV32I-NEXT: vand.vv v16, v20, v16 +; RV32I-NEXT: vsub.vv v8, v8, v16 +; RV32I-NEXT: vand.vv v16, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vlse64.v v12, (a0), zero +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v20, v8, 4 +; RV32I-NEXT: vadd.vv 
v8, v8, v20 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv4i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: lui a0, %hi(.LCPI42_0) -; RV64-NEXT: ld a0, %lo(.LCPI42_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI42_1) -; RV64-NEXT: ld a1, %lo(.LCPI42_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: vand.vx v12, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: lui a0, %hi(.LCPI42_2) -; RV64-NEXT: ld a0, %lo(.LCPI42_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI42_3) -; RV64-NEXT: ld a1, %lo(.LCPI42_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv4i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: lui a0, %hi(.LCPI42_0) +; RV64I-NEXT: ld a0, %lo(.LCPI42_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI42_1) +; RV64I-NEXT: ld a1, %lo(.LCPI42_1)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v12, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI42_2) +; RV64I-NEXT: ld a0, %lo(.LCPI42_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI42_3) +; RV64I-NEXT: ld a1, %lo(.LCPI42_3)(a1) +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vrsub.vi v12, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v12 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-F-NEXT: vzext.vf2 v12, v8 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v12, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_zero_undef_nxv4i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-D-NEXT: vrsub.vi v12, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v12 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv4i64( %va, i1 true) ret %a } define @cttz_zero_undef_nxv8i64( %va) { -; RV32-LABEL: cttz_zero_undef_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 
8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v24, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv8i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: vand.vv v24, v0, v24 +; RV32I-NEXT: vsub.vv v8, v8, v24 +; RV32I-NEXT: vand.vv v24, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v24, v8 +; RV32I-NEXT: vlse64.v v16, (a0), zero +; RV32I-NEXT: vlse64.v v24, (a0), zero +; RV32I-NEXT: vsrl.vi v0, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vmul.vv v8, v8, v24 +; RV32I-NEXT: li a0, 56 +; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: lui a0, %hi(.LCPI43_0) -; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI43_1) -; RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: vand.vx v16, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: lui a0, %hi(.LCPI43_2) -; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI43_3) -; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vmul.vx v8, v8, a1 -; RV64-NEXT: li a0, 
56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv8i64: +; RV64I: # %bb.0: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: lui a0, %hi(.LCPI43_0) +; RV64I-NEXT: ld a0, %lo(.LCPI43_0)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI43_1) +; RV64I-NEXT: ld a1, %lo(.LCPI43_1)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vsub.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v16, v8, a1 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vadd.vv v8, v16, v8 +; RV64I-NEXT: lui a0, %hi(.LCPI43_2) +; RV64I-NEXT: ld a0, %lo(.LCPI43_2)(a0) +; RV64I-NEXT: lui a1, %hi(.LCPI43_3) +; RV64I-NEXT: ld a1, %lo(.LCPI43_3)(a1) +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: li a0, 56 +; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: ret +; +; CHECK-F-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-F: # %bb.0: +; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vrsub.vi v16, v8, 0 +; CHECK-F-NEXT: vand.vv v8, v8, v16 +; CHECK-F-NEXT: vmset.m v0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-F-NEXT: vzext.vf2 v16, v8 +; CHECK-F-NEXT: li a1, 127 +; CHECK-F-NEXT: vsub.vx v8, v16, a1 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: ret +; +; CHECK-D-LABEL: cttz_zero_undef_nxv8i64: +; CHECK-D: # %bb.0: +; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-D-NEXT: vrsub.vi v16, v8, 0 +; CHECK-D-NEXT: vand.vv v8, v8, v16 +; CHECK-D-NEXT: vmset.m v0 +; CHECK-D-NEXT: fsrmi a0, 1 +; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-D-NEXT: li a1, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a1 +; CHECK-D-NEXT: li a1, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a1 +; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: ret %a = call @llvm.cttz.nxv8i64( %va, i1 true) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index cc08aeb..abc68c4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I ; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F +; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d 
-riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32 @@ -201,6 +203,34 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; LMULMAX1-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret ; +; LMULMAX2-RV32F-LABEL: ctlz_v8i16: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV32F-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32F-NEXT: li a1, 142 +; LMULMAX2-RV32F-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 16 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32F-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret +; +; LMULMAX2-RV64F-LABEL: ctlz_v8i16: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV64F-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 142 +; LMULMAX2-RV64F-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 16 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64F-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; ; LMULMAX2-RV32D-LABEL: ctlz_v8i16: ; LMULMAX2-RV32D: # %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, ma @@ -327,81 +357,39 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) ; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vnot.v v8, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; 
LMULMAX2-RV32F-LABEL: ctlz_v4i32: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV32F-NEXT: li a1, 158 +; LMULMAX2-RV32F-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 32 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 349525 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 209715 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 61681 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: ctlz_v4i32: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV64F-NEXT: li a1, 158 +; LMULMAX2-RV64F-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 32 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret ; ; LMULMAX2-RV32D-LABEL: ctlz_v4i32: ; LMULMAX2-RV32D: # %bb.0: @@ -456,258 +444,204 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v2i64: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV32-NEXT: 
vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: li a1, 32 -; LMULMAX2-RV32-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v9, -1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: li a1, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v2i64: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: li a1, 32 +; LMULMAX2-RV32I-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.i v9, -1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: 
vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: li a1, 56 +; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v2i64: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: li a1, 32 -; LMULMAX2-RV64-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: li a1, 56 -; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v2i64: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: li a1, 32 +; LMULMAX2-RV64I-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_0) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_1) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_1)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vx 
v9, v8, a2 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_2) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_2)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_3) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_3)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: li a1, 56 +; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: li a1, 32 -; LMULMAX1-RV32-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v9, -1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: li a1, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32F-LABEL: ctlz_v2i64: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v9, v8, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV32F-NEXT: li a1, 190 +; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vmv.v.x v10, a1 +; 
LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; LMULMAX2-RV32F-NEXT: vwsubu.wv v10, v10, v9 +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vmv.v.i v9, 0 +; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v9 +; LMULMAX2-RV32F-NEXT: li a1, 64 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: li a1, 32 -; LMULMAX1-RV64-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: li a1, 56 -; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: ctlz_v2i64: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v9, v8, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV64F-NEXT: li a1, 190 +; LMULMAX2-RV64F-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV64F-NEXT: vwsubu.vv v11, v10, v9 +; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 64 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v11, a1, v0 +; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v2i64: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vmset.m v0 +; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX2-RV32D-NEXT: fsrm a1 +; LMULMAX2-RV32D-NEXT: li a1, 52 +; LMULMAX2-RV32D-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: li a1, 1086 +; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32D-NEXT: 
vmseq.vv v0, v8, v10 +; LMULMAX2-RV32D-NEXT: li a1, 64 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v2i64: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vmset.m v0 +; LMULMAX2-RV64D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX2-RV64D-NEXT: fsrm a1 +; LMULMAX2-RV64D-NEXT: li a1, 52 +; LMULMAX2-RV64D-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: li a1, 1086 +; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: li a1, 64 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret ; ; LMULMAX8-RV32-LABEL: ctlz_v2i64: ; LMULMAX8-RV32: # %bb.0: ; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: li a1, 32 -; LMULMAX8-RV32-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v9, -1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV32-NEXT: lui a1, 349525 -; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 209715 -; LMULMAX8-RV32-NEXT: addi a1, a1, 819 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 61681 -; LMULMAX8-RV32-NEXT: addi a1, a1, -241 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 4112 -; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vmset.m v0 +; LMULMAX8-RV32-NEXT: fsrmi a1, 1 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX8-RV32-NEXT: fsrm a1 +; LMULMAX8-RV32-NEXT: li a1, 52 +; LMULMAX8-RV32-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: li a1, 1086 +; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1 ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0 ; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: li a1, 56 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vmseq.vv 
v0, v8, v10 +; LMULMAX8-RV32-NEXT: li a1, 64 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV32-NEXT: ret ; @@ -715,41 +649,17 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX8-RV64: # %bb.0: ; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: li a1, 32 -; LMULMAX8-RV64-NEXT: vsrl.vx v9, v8, a1 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vnot.v v8, v8 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: li a1, 56 -; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vmset.m v0 +; LMULMAX8-RV64-NEXT: fsrmi a1, 1 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v9, v8, v0.t +; LMULMAX8-RV64-NEXT: fsrm a1 +; LMULMAX8-RV64-NEXT: li a1, 52 +; LMULMAX8-RV64-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: li a1, 1086 +; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: li a1, 64 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV64-NEXT: ret %a = load <2 x i64>, ptr %x @@ -1069,209 +979,149 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1) define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v8i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vnot.v v8, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret -; -; LMULMAX2-RV64-LABEL: ctlz_v8i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v8i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: 
ctlz_v8i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vnot.v v8, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a2, 349525 -; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a3, 209715 -; LMULMAX1-RV32-NEXT: addi a3, a3, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 61681 -; LMULMAX1-RV32-NEXT: addi a4, a4, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: lui a5, 4112 -; LMULMAX1-RV32-NEXT: addi a5, a5, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 8 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 16 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vnot.v v9, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v8i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; 
LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v8i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a2, 349525 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a3, 209715 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 61681 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: lui a5, 4112 -; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 16 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vnot.v v9, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV32F-LABEL: ctlz_v8i32: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV32F-NEXT: 
li a1, 158 +; LMULMAX2-RV32F-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 32 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret +; +; LMULMAX2-RV64F-LABEL: ctlz_v8i32: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 158 +; LMULMAX2-RV64F-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 32 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v8i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vmset.m v0 +; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV32D-NEXT: fsrm a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV32D-NEXT: li a1, 158 +; LMULMAX2-RV32D-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: li a1, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v8i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vmset.m v0 +; LMULMAX2-RV64D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV64D-NEXT: fsrm a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV64D-NEXT: li a1, 158 +; LMULMAX2-RV64D-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: li a1, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret ; ; LMULMAX8-LABEL: ctlz_v8i32: ; LMULMAX8: # %bb.0: @@ -1296,314 +1146,204 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v4i64: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: li a1, 32 -; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32-NEXT: 
vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: li a1, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v4i64: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: li a1, 32 +; LMULMAX2-RV32I-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; 
LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: li a1, 56 +; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v4i64: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: li a1, 32 -; LMULMAX2-RV64-NEXT: vsrl.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_0) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_1) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a2 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_2) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_3) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: li a1, 56 -; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v4i64: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: li a1, 32 +; LMULMAX2-RV64I-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_0) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_1) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_1)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a2 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_2) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_2)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_3) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_3)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; 
LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: li a1, 56 +; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v4i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: li a2, 32 -; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 1 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v12, a3 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: lui a3, 209715 -; LMULMAX1-RV32-NEXT: addi a3, a3, 819 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v11, a3 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v13, v8, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v13, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: lui a3, 61681 -; LMULMAX1-RV32-NEXT: addi a3, a3, -241 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v13, a3 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: lui a3, 4112 -; LMULMAX1-RV32-NEXT: addi a3, a3, 257 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v14, a3 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v14 -; LMULMAX1-RV32-NEXT: li a3, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 1 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 4 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 8 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 16 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vsrl.vx v15, v9, a2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15 -; LMULMAX1-RV32-NEXT: vxor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vv v10, v9, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv 
v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13 -; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v14 -; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a3 -; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32F-LABEL: ctlz_v4i64: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v10, v8, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV32F-NEXT: li a1, 190 +; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; LMULMAX2-RV32F-NEXT: vmv.v.x v12, a1 +; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vwsubu.wv v12, v12, v10 +; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v10 +; LMULMAX2-RV32F-NEXT: li a1, 64 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v12, a1, v0 +; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v4i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle64.v v9, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: li a2, 32 -; LMULMAX1-RV64-NEXT: vsrl.vx v10, v8, a2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI7_0) -; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI7_0)(a3) -; LMULMAX1-RV64-NEXT: lui a4, %hi(.LCPI7_1) -; LMULMAX1-RV64-NEXT: ld a4, %lo(.LCPI7_1)(a4) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: lui a5, %hi(.LCPI7_2) -; LMULMAX1-RV64-NEXT: ld a5, %lo(.LCPI7_2)(a5) -; LMULMAX1-RV64-NEXT: lui a6, %hi(.LCPI7_3) -; LMULMAX1-RV64-NEXT: ld a6, %lo(.LCPI7_3)(a6) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6 -; LMULMAX1-RV64-NEXT: li a7, 56 -; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a7 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 16 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vx v10, v9, a2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vnot.v v9, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx 
v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6 -; LMULMAX1-RV64-NEXT: vsrl.vx v9, v9, a7 -; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: ctlz_v4i64: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v10, v8, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 190 +; LMULMAX2-RV64F-NEXT: vmv.v.x v11, a1 +; LMULMAX2-RV64F-NEXT: vwsubu.vv v12, v11, v10 +; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 64 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v12, a1, v0 +; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v4i64: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vmset.m v0 +; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV32D-NEXT: fsrm a1 +; LMULMAX2-RV32D-NEXT: li a1, 52 +; LMULMAX2-RV32D-NEXT: vsrl.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: li a1, 1086 +; LMULMAX2-RV32D-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32D-NEXT: vmv.v.i v12, 0 +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32D-NEXT: vmseq.vv v0, v8, v12 +; LMULMAX2-RV32D-NEXT: li a1, 64 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v4i64: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vmset.m v0 +; LMULMAX2-RV64D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX2-RV64D-NEXT: fsrm a1 +; LMULMAX2-RV64D-NEXT: li a1, 52 +; LMULMAX2-RV64D-NEXT: vsrl.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: li a1, 1086 +; LMULMAX2-RV64D-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: li a1, 64 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret ; ; LMULMAX8-RV32-LABEL: ctlz_v4i64: ; LMULMAX8-RV32: # %bb.0: ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: li a1, 32 -; LMULMAX8-RV32-NEXT: vsrl.vx v10, v8, a1 -; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, 
ma -; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX8-RV32-NEXT: lui a1, 349525 -; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: lui a1, 209715 -; LMULMAX8-RV32-NEXT: addi a1, a1, 819 -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10 -; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8 -; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: lui a1, 61681 -; LMULMAX8-RV32-NEXT: addi a1, a1, -241 -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: lui a1, 4112 -; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vmset.m v0 +; LMULMAX8-RV32-NEXT: fsrmi a1, 1 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX8-RV32-NEXT: fsrm a1 +; LMULMAX8-RV32-NEXT: li a1, 52 +; LMULMAX8-RV32-NEXT: vsrl.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: li a1, 1086 +; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1 ; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vmv.v.i v12, 0 ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: li a1, 56 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vmseq.vv v0, v8, v12 +; LMULMAX8-RV32-NEXT: li a1, 64 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 ; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV32-NEXT: ret ; @@ -1611,41 +1351,17 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; LMULMAX8-RV64: # %bb.0: ; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: li a1, 32 -; LMULMAX8-RV64-NEXT: vsrl.vx v10, v8, a1 -; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vnot.v v8, v8 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_0) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_1) -; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a2 -; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_2) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_3) -; 
LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: li a1, 56 -; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vmset.m v0 +; LMULMAX8-RV64-NEXT: fsrmi a1, 1 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v10, v8, v0.t +; LMULMAX8-RV64-NEXT: fsrm a1 +; LMULMAX8-RV64-NEXT: li a1, 52 +; LMULMAX8-RV64-NEXT: vsrl.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: li a1, 1086 +; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: li a1, 64 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 ; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV64-NEXT: ret %a = load <4 x i64>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 144f469..4d2db34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I ; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F +; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32 @@ -181,6 +183,38 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; LMULMAX1-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret ; +; LMULMAX2-RV32F-LABEL: cttz_v8i16: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32F-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32F-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV32F-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32F-NEXT: li a1, 127 +; LMULMAX2-RV32F-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 16 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32F-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret +; +; 
LMULMAX2-RV64F-LABEL: cttz_v8i16: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64F-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV64F-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 127 +; LMULMAX2-RV64F-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 16 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64F-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; ; LMULMAX2-RV32D-LABEL: cttz_v8i16: ; LMULMAX2-RV32D: # %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, ma @@ -299,67 +333,43 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) ; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV32-NEXT: li a1, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vnot.v v8, v8 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32F-LABEL: cttz_v4i32: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32F-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV32F-NEXT: li a1, 127 +; LMULMAX2-RV32F-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 32 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV64-NEXT: li a1, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 349525 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 209715 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 61681 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: cttz_v4i32: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23 +; LMULMAX2-RV64F-NEXT: li a1, 127 +; LMULMAX2-RV64F-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 32 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret ; ; LMULMAX2-RV32D-LABEL: cttz_v4i32: ; LMULMAX2-RV32D: # %bb.0: @@ -420,208 +430,197 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) define void @cttz_v2i64(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v2i64: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: li a1, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: li a1, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v2i64: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) 
+; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: li a1, 56 +; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v2i64: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV64-NEXT: li a1, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: li a1, 56 -; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v2i64: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: li a1, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_0) +; 
LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_1) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_1)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a2 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_2) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_2)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_3) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_3)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: li a1, 56 +; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV32-NEXT: li a1, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: li a1, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32F-LABEL: cttz_v2i64: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vle64.v v9, (a0) +; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vmseq.vv v8, v9, v10 +; LMULMAX2-RV32F-NEXT: vsub.vv v10, v10, v9 +; LMULMAX2-RV32F-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; LMULMAX2-RV32F-NEXT: 
vfncvt.f.xu.w v10, v9, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v10, 23 +; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v9 +; LMULMAX2-RV32F-NEXT: li a1, 127 +; LMULMAX2-RV32F-NEXT: vsub.vx v9, v10, a1 +; LMULMAX2-RV32F-NEXT: li a1, 64 +; LMULMAX2-RV32F-NEXT: vmv.v.v v0, v8 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV64-NEXT: li a1, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: li a1, 56 -; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: cttz_v2i64: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v10, v9, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 127 +; LMULMAX2-RV64F-NEXT: vwsubu.vx v10, v9, a1 +; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 64 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: cttz_v2i64: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32D-NEXT: vle64.v v9, (a0) +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV32D-NEXT: vmseq.vv v8, v9, v10 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v9 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: vmset.m v0 +; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX2-RV32D-NEXT: fsrm a1 +; LMULMAX2-RV32D-NEXT: li a1, 52 +; LMULMAX2-RV32D-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: li a1, 1023 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: li a1, 64 +; LMULMAX2-RV32D-NEXT: vmv.v.v v0, v8 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0) +; 
LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v2i64: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vmset.m v0 +; LMULMAX2-RV64D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX2-RV64D-NEXT: fsrm a1 +; LMULMAX2-RV64D-NEXT: li a1, 52 +; LMULMAX2-RV64D-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: li a1, 1023 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: li a1, 64 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret ; ; LMULMAX8-RV32-LABEL: cttz_v2i64: ; LMULMAX8-RV32: # %bb.0: ; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: li a1, 1 -; LMULMAX8-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV32-NEXT: lui a1, 349525 -; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vle64.v v9, (a0) ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0 ; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX8-RV32-NEXT: vmseq.vv v8, v9, v10 +; LMULMAX8-RV32-NEXT: vsub.vv v10, v10, v9 ; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 209715 -; LMULMAX8-RV32-NEXT: addi a1, a1, 819 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 -; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 61681 -; LMULMAX8-RV32-NEXT: addi a1, a1, -241 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: lui a1, 4112 -; LMULMAX8-RV32-NEXT: addi a1, a1, 257 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: li a1, 56 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vmset.m v0 +; LMULMAX8-RV32-NEXT: fsrmi a1, 1 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX8-RV32-NEXT: fsrm a1 +; LMULMAX8-RV32-NEXT: li a1, 52 +; LMULMAX8-RV32-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: li a1, 1023 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: li a1, 64 +; LMULMAX8-RV32-NEXT: vmv.v.v v0, v8 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV32-NEXT: ret ; @@ -629,31 +628,19 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX8-RV64: # %bb.0: ; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX8-RV64-NEXT: vle64.v v8, 
(a0) -; LMULMAX8-RV64-NEXT: li a1, 1 -; LMULMAX8-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX8-RV64-NEXT: vnot.v v8, v8 -; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_0) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_1) -; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a2 -; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_2) -; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1) -; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_3) -; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2) -; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX8-RV64-NEXT: li a1, 56 -; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vmset.m v0 +; LMULMAX8-RV64-NEXT: fsrmi a1, 1 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v9, v9, v0.t +; LMULMAX8-RV64-NEXT: fsrm a1 +; LMULMAX8-RV64-NEXT: li a1, 52 +; LMULMAX8-RV64-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: li a1, 1023 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: li a1, 64 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX8-RV64-NEXT: ret %a = load <2 x i64>, ptr %x @@ -936,165 +923,143 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1) define void @cttz_v8i32(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v8i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: li a1, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vnot.v v8, v8 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v8i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; 
LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v8i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: li a1, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v8i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: li a1, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v8i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV32-NEXT: li a2, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a2 -; LMULMAX1-RV32-NEXT: vnot.v v8, v8 -; 
LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 209715 -; LMULMAX1-RV32-NEXT: addi a4, a4, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a5, 61681 -; LMULMAX1-RV32-NEXT: addi a5, a5, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: lui a6, 4112 -; LMULMAX1-RV32-NEXT: addi a6, a6, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a6 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v9, a2 -; LMULMAX1-RV32-NEXT: vnot.v v9, v9 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a6 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32F-LABEL: cttz_v8i32: +; LMULMAX2-RV32F: # %bb.0: +; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV32F-NEXT: vand.vv v10, v8, v10 +; LMULMAX2-RV32F-NEXT: vmset.m v0 +; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; LMULMAX2-RV32F-NEXT: fsrm a1 +; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV32F-NEXT: li a1, 127 +; LMULMAX2-RV32F-NEXT: vsub.vx v10, v10, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32F-NEXT: li a1, 32 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32F-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v8i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV64-NEXT: li a2, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a2 -; LMULMAX1-RV64-NEXT: vnot.v v8, v8 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a3, 349525 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 209715 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a5, 61681 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: lui a6, 4112 -; LMULMAX1-RV64-NEXT: addiw a6, a6, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; 
LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a2 -; LMULMAX1-RV64-NEXT: vnot.v v9, v9 -; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64F-LABEL: cttz_v8i32: +; LMULMAX2-RV64F: # %bb.0: +; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV64F-NEXT: vand.vv v10, v8, v10 +; LMULMAX2-RV64F-NEXT: vmset.m v0 +; LMULMAX2-RV64F-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; LMULMAX2-RV64F-NEXT: fsrm a1 +; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV64F-NEXT: li a1, 127 +; LMULMAX2-RV64F-NEXT: vsub.vx v10, v10, a1 +; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64F-NEXT: li a1, 32 +; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64F-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: cttz_v8i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v10, v8, v10 +; LMULMAX2-RV32D-NEXT: vmset.m v0 +; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; LMULMAX2-RV32D-NEXT: fsrm a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV32D-NEXT: li a1, 127 +; LMULMAX2-RV32D-NEXT: vsub.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: li a1, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v8i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v10, v8, v10 +; LMULMAX2-RV64D-NEXT: vmset.m v0 +; LMULMAX2-RV64D-NEXT: fsrmi a1, 1 +; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t +; LMULMAX2-RV64D-NEXT: fsrm a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 23 +; LMULMAX2-RV64D-NEXT: li a1, 127 +; LMULMAX2-RV64D-NEXT: vsub.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: li a1, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret ; ; LMULMAX8-LABEL: cttz_v8i32: ; LMULMAX8: # %bb.0: @@ -1121,244 +1086,197 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind { declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1) define void @cttz_v4i64(ptr %x, ptr %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v4i64: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: li a1, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, -1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; 
LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v12 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: li a1, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v4i64: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.i v12, -1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v12 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: 
li a1, 56 +; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v4i64: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV64-NEXT: li a1, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vnot.v v8, v8 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_0) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_1) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a2 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_2) -; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1) -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_3) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX2-RV64-NEXT: li a1, 56 -; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v4i64: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: li a1, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_0) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_1) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_1)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a2 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_2) +; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_2)(a1) +; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_3) +; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_3)(a2) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2 +; LMULMAX2-RV64I-NEXT: li a1, 56 +; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v4i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) -; LMULMAX1-RV32-NEXT: li a2, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a2 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v11, -1 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v12, a3 -; LMULMAX1-RV32-NEXT: vsetivli 
zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a3, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a3, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 61681
-; LMULMAX1-RV32-NEXT: addi a3, a3, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 4112
-; LMULMAX1-RV32-NEXT: addi a3, a3, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v14, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v14
-; LMULMAX1-RV32-NEXT: li a3, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT: vsub.vx v15, v9, a2
-; LMULMAX1-RV32-NEXT: vxor.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: cttz_v4i64:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vle64.v v10, (a0)
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX2-RV32F-NEXT: vsub.vv v12, v12, v10
+; LMULMAX2-RV32F-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v9, v10, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v9
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT: li a1, 64
+; LMULMAX2-RV32F-NEXT: vmv1r.v v0, v8
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
 ;
-; LMULMAX1-RV64-LABEL: cttz_v4i64:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT: addi a1, a0, 16
-; LMULMAX1-RV64-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: li a2, 1
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI7_0)
-; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI7_0)(a3)
-; LMULMAX1-RV64-NEXT: lui a4, %hi(.LCPI7_1)
-; LMULMAX1-RV64-NEXT: ld a4, %lo(.LCPI7_1)(a4)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT: lui a5, %hi(.LCPI7_2)
-; LMULMAX1-RV64-NEXT: ld a5, %lo(.LCPI7_2)(a5)
-; LMULMAX1-RV64-NEXT: lui a6, %hi(.LCPI7_3)
-; LMULMAX1-RV64-NEXT: ld a6, %lo(.LCPI7_3)(a6)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT: li a7, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a7
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT: vnot.v v9, v9
-; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT: vsrl.vx v9, v9, a7
-; LMULMAX1-RV64-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: cttz_v4i64:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v12, v10, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v12, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vwsubu.vx v12, v10, a1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 64
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v4i64:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vle64.v v10, (a0)
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX2-RV32D-NEXT: vsub.vv v12, v12, v10
+; LMULMAX2-RV32D-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: li a1, 52
+; LMULMAX2-RV32D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: li a1, 1023
+; LMULMAX2-RV32D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: li a1, 64
+; LMULMAX2-RV32D-NEXT: vmv1r.v v0, v8
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v4i64:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: li a1, 52
+; LMULMAX2-RV64D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: li a1, 1023
+; LMULMAX2-RV64D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 64
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
 ;
 ; LMULMAX8-RV32-LABEL: cttz_v4i64:
 ; LMULMAX8-RV32: # %bb.0:
 ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT: li a1, 1
-; LMULMAX8-RV32-NEXT: vsub.vx v10, v8, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.i v12, -1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v12
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT: lui a1, 349525
-; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vle64.v v10, (a0)
 ; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1
+; LMULMAX8-RV32-NEXT: vmv.v.i v12, 0
 ; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX8-RV32-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX8-RV32-NEXT: vsub.vv v12, v12, v10
 ; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 209715
-; LMULMAX8-RV32-NEXT: addi a1, a1, 819
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 61681
-; LMULMAX8-RV32-NEXT: addi a1, a1, -241
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 4112
-; LMULMAX8-RV32-NEXT: addi a1, a1, 257
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: li a1, 56
-; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vmset.m v0
+; LMULMAX8-RV32-NEXT: fsrmi a1, 1
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV32-NEXT: fsrm a1
+; LMULMAX8-RV32-NEXT: li a1, 52
+; LMULMAX8-RV32-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: li a1, 1023
+; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: li a1, 64
+; LMULMAX8-RV32-NEXT: vmv1r.v v0, v8
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
 ; LMULMAX8-RV32-NEXT: ret
 ;
@@ -1366,31 +1284,19 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX8-RV64: # %bb.0:
 ; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT: li a1, 1
-; LMULMAX8-RV64-NEXT: vsub.vx v10, v8, a1
-; LMULMAX8-RV64-NEXT: vnot.v v8, v8
-; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_0)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_1)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a2
-; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_2)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_3)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: li a1, 56
-; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT: vmset.m v0
+; LMULMAX8-RV64-NEXT: fsrmi a1, 1
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV64-NEXT: fsrm a1
+; LMULMAX8-RV64-NEXT: li a1, 52
+; LMULMAX8-RV64-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: li a1, 1023
+; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: li a1, 64
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
 ; LMULMAX8-RV64-NEXT: ret
 %a = load <4 x i64>, ptr %x