if (IsBoolTy)
EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
- Address Alloca = CreateTempAlloca(llvm::ArrayType::get(EltTy, NumOpnds),
- CharUnits::fromQuantity(16));
+ SmallVector<llvm::Value *, 16> VecOps;
for (unsigned I = 0; I < NumOpnds; ++I)
- Builder.CreateDefaultAlignedStore(
- IsBoolTy ? Builder.CreateZExt(Ops[I], EltTy) : Ops[I],
- Builder.CreateGEP(Alloca.getElementType(), Alloca.getPointer(),
- {Builder.getInt64(0), Builder.getInt64(I)}));
+ VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
+ Value *Vec = BuildVector(VecOps);
SVETypeFlags TypeFlags(Builtin->TypeModifier);
Value *Pred = EmitSVEAllTruePred(TypeFlags);
llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
- Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_ld1rq, OverloadedTy);
- Value *Alloca0 = Builder.CreateGEP(
- Alloca.getElementType(), Alloca.getPointer(),
- {Builder.getInt64(0), Builder.getInt64(0)});
- Value *LD1RQ = Builder.CreateCall(F, {Pred, Alloca0});
+ Value *InsertSubVec = Builder.CreateInsertVector(
+ OverloadedTy, UndefValue::get(OverloadedTy), Vec, Builder.getInt64(0));
+
+ Function *F =
+ CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
+ Value *DupQLane =
+ Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
if (!IsBoolTy)
- return LD1RQ;
+ return DupQLane;
// For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
: Intrinsic::aarch64_sve_cmpne_wide,
OverloadedTy);
- Value *Call =
- Builder.CreateCall(F, {Pred, LD1RQ, EmitSVEDupX(Builder.getInt64(0))});
+ Value *Call = Builder.CreateCall(
+ F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
}
svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3,
                                bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7) {
  // CHECK-LABEL: test_svdupq_n_bf16
-  // CHECK: %[[ALLOCA:.*]] = alloca [8 x bfloat], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store bfloat %x0, bfloat* %[[BASE]], align 16
-  // <assume other stores>
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 7
-  // CHECK: store bfloat %x7, bfloat* %[[GEP]], align 2
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %{{.*}}, bfloat* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // NOTE: codegen now builds the 128-bit chunk with insertelement, widens it via
+  // llvm.experimental.vector.insert, and splats it with aarch64.sve.dupq.lane
+  // (previously: element stores to a 16-byte alloca followed by ld1rq).
+  // CHECK: insertelement <8 x bfloat> undef, bfloat %x0, i32 0
+  // <assume other insertelement>
+  // CHECK: %[[VEC:.*]] = insertelement <8 x bfloat> %[[X:.*]], bfloat %x7, i32 7
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 8 x bfloat> %[[DUPQ]]
  // expected-warning@+1 {{implicit declaration of function 'svdupq_n_bf16'}}
  return SVE_ACLE_FUNC(svdupq, _n, _bf16, )(x0, x1, x2, x3, x4, x5, x6, x7);
}
int8_t x12, int8_t x13, int8_t x14, int8_t x15)
{
// CHECK-LABEL: test_svdupq_n_s8
- // CHECK: %[[ALLOCA:.*]] = alloca [16 x i8], align 16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i8 %x0, i8* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15
- // CHECK: store i8 %x15, i8* %[[GEP]], align 1
- // CHECK-NOT: store
- // CHECK: %[[PTRUE:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %[[PTRUE]], i8* nonnull %[[BASE]])
- // CHECK: ret <vscale x 16 x i8> %[[LOAD]]
+ // CHECK: insertelement <16 x i8> undef, i8 %x0, i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %x15, i32 15
+ // CHECK-NOT: insertelement
+ // CHECK: %[[INS:.*]] = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %[[INS]], i64 0)
+ // CHECK: ret <vscale x 16 x i8> %[[DUPQ]]
return SVE_ACLE_FUNC(svdupq,_n,_s8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15);
}
int16_t x4, int16_t x5, int16_t x6, int16_t x7)
{
// CHECK-LABEL: test_svdupq_n_s16
- // CHECK-DAG: %[[ALLOCA:.*]] = alloca [8 x i16], align 16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i16 %x0, i16* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7
- // CHECK: store i16 %x7, i16* %[[GEP]], align 2
- // CHECK-NOT: store
- // CHECK: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %{{.*}}, i16* nonnull %[[BASE]])
- // CHECK: ret <vscale x 8 x i16> %[[LOAD]]
+ // CHECK: insertelement <8 x i16> undef, i16 %x0, i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %x7, i32 7
+ // CHECK-NOT: insertelement
+ // CHECK: %[[INS:.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %[[INS]], i64 0)
+ // CHECK: ret <vscale x 8 x i16> %[[DUPQ]]
return SVE_ACLE_FUNC(svdupq,_n,_s16,)(x0, x1, x2, x3, x4, x5, x6, x7);
}
svint32_t test_svdupq_n_s32(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
  // CHECK-LABEL: test_svdupq_n_s32
-  // CHECK: %[[ALLOCA:.*]] = alloca [4 x i32], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i32 %x0, i32* %[[BASE]], align 16
-  // <assume other stores>
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3
-  // CHECK: store i32 %x3, i32* %[[GEP]], align 4
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %{{.*}}, i32* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 4 x i32> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: insertelement <4 x i32> undef, i32 %x0, i32 0
+  // <assume other insertelement>
+  // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %x3, i32 3
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 4 x i32> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_s32,)(x0, x1, x2, x3);
}
svint64_t test_svdupq_n_s64(int64_t x0, int64_t x1)
{
  // CHECK-LABEL: test_svdupq_n_s64
-  // CHECK: %[[ALLOCA:.*]] = alloca [2 x i64], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1
-  // CHECK: store i64 %x1, i64* %[[GEP]], align 8
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %{{.*}}, i64* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 2 x i64> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %x0, i32 0
+  // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %x1, i32 1
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 2 x i64> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_s64,)(x0, x1);
}
uint8_t x12, uint8_t x13, uint8_t x14, uint8_t x15)
{
// CHECK-LABEL: test_svdupq_n_u8
- // CHECK: %[[ALLOCA:.*]] = alloca [16 x i8], align 16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i8 %x0, i8* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15
- // CHECK: store i8 %x15, i8* %[[GEP]], align 1
- // CHECK-NOT: store
- // CHECK: %[[PTRUE:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %[[PTRUE]], i8* nonnull %[[BASE]])
- // CHECK: ret <vscale x 16 x i8> %[[LOAD]]
+ // CHECK: insertelement <16 x i8> undef, i8 %x0, i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %x15, i32 15
+ // CHECK-NOT: insertelement
+ // CHECK: %[[INS:.*]] = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %[[INS]], i64 0)
+ // CHECK: ret <vscale x 16 x i8> %[[DUPQ]]
return SVE_ACLE_FUNC(svdupq,_n,_u8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15);
}
uint16_t x4, uint16_t x5, uint16_t x6, uint16_t x7)
{
// CHECK-LABEL: test_svdupq_n_u16
- // CHECK: %[[ALLOCA:.*]] = alloca [8 x i16], align 16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i16 %x0, i16* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7
- // CHECK: store i16 %x7, i16* %[[GEP]], align 2
- // CHECK-NOT: store
- // CHECK: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %{{.*}}, i16* nonnull %[[BASE]])
- // CHECK: ret <vscale x 8 x i16> %[[LOAD]]
+ // CHECK: insertelement <8 x i16> undef, i16 %x0, i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %x7, i32 7
+ // CHECK-NOT: insertelement
+ // CHECK: %[[INS:.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %[[INS]], i64 0)
+ // CHECK: ret <vscale x 8 x i16> %[[DUPQ]]
return SVE_ACLE_FUNC(svdupq,_n,_u16,)(x0, x1, x2, x3, x4, x5, x6, x7);
}
svuint32_t test_svdupq_n_u32(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3)
{
  // CHECK-LABEL: test_svdupq_n_u32
-  // CHECK: %[[ALLOCA:.*]] = alloca [4 x i32], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i32 %x0, i32* %[[BASE]], align 16
-  // <assume other stores>
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3
-  // CHECK: store i32 %x3, i32* %[[GEP]], align 4
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %{{.*}}, i32* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 4 x i32> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: insertelement <4 x i32> undef, i32 %x0, i32 0
+  // <assume other insertelement>
+  // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %x3, i32 3
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 4 x i32> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_u32,)(x0, x1, x2, x3);
}
svuint64_t test_svdupq_n_u64(uint64_t x0, uint64_t x1)
{
  // CHECK-LABEL: test_svdupq_n_u64
-  // CHECK: %[[ALLOCA:.*]] = alloca [2 x i64], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1
-  // CHECK: store i64 %x1, i64* %[[GEP]], align 8
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %{{.*}}, i64* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 2 x i64> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %x0, i32 0
+  // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %x1, i32 1
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 2 x i64> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_u64,)(x0, x1);
}
float16_t x4, float16_t x5, float16_t x6, float16_t x7)
{
// CHECK-LABEL: test_svdupq_n_f16
- // CHECK: %[[ALLOCA:.*]] = alloca [8 x half], align 16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x half], [8 x half]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store half %x0, half* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x half], [8 x half]* %[[ALLOCA]], i64 0, i64 7
- // CHECK: store half %x7, half* %[[GEP]], align 2
- // CHECK-NOT: store
- // CHECK: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %{{.*}}, half* nonnull %[[BASE]])
- // CHECK: ret <vscale x 8 x half> %[[LOAD]]
+ // CHECK: insertelement <8 x half> undef, half %x0, i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <8 x half> %[[X:.*]], half %x7, i32 7
+ // CHECK-NOT: insertelement
+ // CHECK: %[[INS:.*]] = call <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %[[INS]], i64 0)
+ // CHECK: ret <vscale x 8 x half> %[[DUPQ]]
return SVE_ACLE_FUNC(svdupq,_n,_f16,)(x0, x1, x2, x3, x4, x5, x6, x7);
}
svfloat32_t test_svdupq_n_f32(float32_t x0, float32_t x1, float32_t x2, float32_t x3)
{
  // CHECK-LABEL: test_svdupq_n_f32
-  // CHECK: %[[ALLOCA:.*]] = alloca [4 x float], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x float], [4 x float]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store float %x0, float* %[[BASE]], align 16
-  // <assume other stores>
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x float], [4 x float]* %[[ALLOCA]], i64 0, i64 3
-  // CHECK: store float %x3, float* %[[GEP]], align 4
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %{{.*}}, float* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 4 x float> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: insertelement <4 x float> undef, float %x0, i32 0
+  // <assume other insertelement>
+  // CHECK: %[[VEC:.*]] = insertelement <4 x float> %[[X:.*]], float %x3, i32 3
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 4 x float> @llvm.experimental.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_f32,)(x0, x1, x2, x3);
}
svfloat64_t test_svdupq_n_f64(float64_t x0, float64_t x1)
{
  // CHECK-LABEL: test_svdupq_n_f64
-  // CHECK: %[[ALLOCA:.*]] = alloca [2 x double], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x double], [2 x double]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store double %x0, double* %[[BASE]], align 16
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x double], [2 x double]* %[[ALLOCA]], i64 0, i64 1
-  // CHECK: store double %x1, double* %[[GEP]], align 8
-  // CHECK-NOT: store
-  // CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %{{.*}}, double* nonnull %[[BASE]])
-  // CHECK: ret <vscale x 2 x double> %[[LOAD]]
+  // NOTE: lanes are now assembled with insertelement + vector.insert and
+  // splatted via aarch64.sve.dupq.lane (was: stores to an alloca + ld1rq).
+  // CHECK: %[[SVEC:.*]] = insertelement <2 x double> undef, double %x0, i32 0
+  // CHECK: %[[VEC:.*]] = insertelement <2 x double> %[[SVEC]], double %x1, i32 1
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %[[INS]], i64 0)
+  // CHECK: ret <vscale x 2 x double> %[[DUPQ]]
  return SVE_ACLE_FUNC(svdupq,_n,_f64,)(x0, x1);
}
bool x12, bool x13, bool x14, bool x15)
{
// CHECK-LABEL: test_svdupq_n_b8
- // CHECK-DAG: %[[ALLOCA:.*]] = alloca [16 x i8], align 16
// CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i8
// CHECK-DAG: %[[X15:.*]] = zext i1 %x15 to i8
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i8 %[[X0]], i8* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15
- // CHECK: store i8 %[[X15]], i8* %[[GEP]], align 1
- // CHECK-NOT: store
+ // CHECK: insertelement <16 x i8> undef, i8 %[[X0]], i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %[[X15]], i32 15
+ // CHECK-NOT: insertelement
// CHECK: %[[PTRUE:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %[[PTRUE]], i8* nonnull %[[BASE]])
+ // CHECK: %[[INS:.*]] = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %[[INS]], i64 0)
// CHECK: %[[ZERO:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
- // CHECK: %[[CMP:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %[[PTRUE]], <vscale x 16 x i8> %[[LOAD]], <vscale x 2 x i64> %[[ZERO]])
+ // CHECK: %[[CMP:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %[[PTRUE]], <vscale x 16 x i8> %[[DUPQ]], <vscale x 2 x i64> %[[ZERO]])
// CHECK: ret <vscale x 16 x i1> %[[CMP]]
return SVE_ACLE_FUNC(svdupq,_n,_b8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15);
}
bool x4, bool x5, bool x6, bool x7)
{
// CHECK-LABEL: test_svdupq_n_b16
- // CHECK-DAG: %[[ALLOCA:.*]] = alloca [8 x i16], align 16
// CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i16
// CHECK-DAG: %[[X7:.*]] = zext i1 %x7 to i16
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i16 %[[X0]], i16* %[[BASE]], align 16
- // <assume other stores>
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7
- // CHECK: store i16 %[[X7]], i16* %[[GEP]], align 2
- // CHECK-NOT: store
+ // CHECK: insertelement <8 x i16> undef, i16 %[[X0]], i32 0
+ // <assume other insertelement>
+ // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %[[X7]], i32 7
+ // CHECK-NOT: insertelement
// CHECK: %[[PTRUE:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %{{.*}}, i16* nonnull %[[BASE]])
+ // CHECK: %[[INS:.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %[[VEC]], i64 0)
+ // CHECK: %[[DUPQ:.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %[[INS]], i64 0)
// CHECK: %[[ZERO:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
- // CHECK: %[[CMP:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %{{.*}}, <vscale x 8 x i16> %[[LOAD]], <vscale x 2 x i64> %[[ZERO]])
+ // CHECK: %[[CMP:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %[[PTRUE]], <vscale x 8 x i16> %[[DUPQ]], <vscale x 2 x i64> %[[ZERO]])
// CHECK: %[[CAST:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %[[CMP]])
// CHECK: ret <vscale x 16 x i1> %[[CAST]]
return SVE_ACLE_FUNC(svdupq,_n,_b16,)(x0, x1, x2, x3, x4, x5, x6, x7);
svbool_t test_svdupq_n_b32(bool x0, bool x1, bool x2, bool x3)
{
  // CHECK-LABEL: test_svdupq_n_b32
-  // CHECK-DAG: %[[ALLOCA:.*]] = alloca [4 x i32], align 16
  // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i32
  // CHECK-DAG: %[[X3:.*]] = zext i1 %x3 to i32
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i32 %[[X0]], i32* %[[BASE]], align 16
-  // <assume other stores>
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3
-  // CHECK: store i32 %[[X3]], i32* %[[GEP]], align 4
-  // CHECK-NOT: store
+  // NOTE: zero-extended bools are now assembled with insertelement and
+  // splatted via dupq.lane; the trailing cmpne-with-zero re-derives the
+  // predicate (was: stores to an alloca + ld1rq).
+  // CHECK: insertelement <4 x i32> undef, i32 %[[X0]], i32 0
+  // <assume other insertelement>
+  // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %[[X3]], i32 3
+  // CHECK-NOT: insertelement
  // CHECK: %[[PTRUE:.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %{{.*}}, i32* nonnull %[[BASE]])
+  // CHECK: %[[INS:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %[[INS]], i64 0)
  // CHECK: %[[ZERO:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
-  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %{{.*}}, <vscale x 4 x i32> %[[LOAD]], <vscale x 2 x i64> %[[ZERO]])
-  // CHECK: %[[CAST:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %[[INTRINSIC]])
+  // CHECK: %[[CMP:.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %[[PTRUE]], <vscale x 4 x i32> %[[DUPQ]], <vscale x 2 x i64> %[[ZERO]])
+  // CHECK: %[[CAST:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %[[CMP]])
  // CHECK: ret <vscale x 16 x i1> %[[CAST]]
  return SVE_ACLE_FUNC(svdupq,_n,_b32,)(x0, x1, x2, x3);
}
svbool_t test_svdupq_n_b64(bool x0, bool x1)
{
  // CHECK-LABEL: test_svdupq_n_b64
-  // CHECK-DAG: %[[ALLOCA:.*]] = alloca [2 x i64], align 16
  // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i64
  // CHECK-DAG: %[[X1:.*]] = zext i1 %x1 to i64
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i64 %[[X0]], i64* %[[BASE]], align 16
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1
-  // CHECK: store i64 %[[X1]], i64* %[[GEP]], align 8
-  // CHECK-NOT: store
+  // NOTE: zero-extended bools are now assembled with insertelement and
+  // splatted via dupq.lane; the trailing cmpne-with-zero re-derives the
+  // predicate (was: stores to an alloca + ld1rq). The i64 element size uses
+  // the plain (non-wide) cmpne variant.
+  // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %[[X0]], i32 0
+  // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %[[X1]], i32 1
+  // CHECK-NOT: insertelement
  // CHECK: %[[PTRUE:.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %{{.*}}, i64* nonnull %[[BASE]])
+  // CHECK: %[[INS:.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %[[INS]], i64 0)
  // CHECK: %[[ZERO:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
-  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %{{.*}}, <vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i64> %[[ZERO]])
-  // CHECK: %[[CAST:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %[[INTRINSIC]])
+  // CHECK: %[[CMP:.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %[[PTRUE]], <vscale x 2 x i64> %[[DUPQ]], <vscale x 2 x i64> %[[ZERO]])
+  // CHECK: %[[CAST:.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %[[CMP]])
  // CHECK: ret <vscale x 16 x i1> %[[CAST]]
  return SVE_ACLE_FUNC(svdupq,_n,_b64,)(x0, x1);
}
-
-// This test checks that the `alloca` is added to the entry-block.
-svint64_t test_svdupq_control_flow(int64_t x0, int64_t x1, svint64_t Default, bool P)
-{
- // CHECK-LABEL: test_svdupq_control_flow
- // CHECK: entry:
- // CHECK-DAG: %[[ALLOCA:.*]] = alloca [2 x i64], align 16
- // CHECK-DAG: [[BR:.*]]:
- // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0
- // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16
- // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1
- // CHECK: store i64 %x1, i64* %[[GEP]], align 8
- // CHECK-NOT: store
- // CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
- // CHECK: %[[LOAD:.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %{{.*}}, i64* nonnull %[[BASE]])
- // CHECK: [[END:.*]]:
- // CHECK: %[[RETVAL:.*]] = phi <vscale x 2 x i64> [ %[[LOAD]], %if.end ], [ %Default, %entry ]
- // CHECK: ret <vscale x 2 x i64> %[[RETVAL]]
- if (P)
- return Default;
- return SVE_ACLE_FUNC(svdupq,_n,_s64,)(x0, x1);
-}
--- /dev/null
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s | FileCheck %s
+
+// These tests check that predicate forms of svdupq_n called with constant
+// arguments are folded down to a single ptrue of the appropriate element
+// size, with no vector materialisation or compare left in the output.
+
+#include <arm_sve.h>
+
+svbool_t test_svdupq_n_b8_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b8_const
+  // The alternating 1,0 byte pattern is exactly an all-true .h predicate.
+  // CHECK: ptrue p0.h
+  // CHECK-NEXT: ret
+  return svdupq_n_b8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);
+}
+
+svbool_t test_svdupq_n_b16_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b16_const
+  // CHECK: ptrue p0.h
+  // CHECK-NEXT: ret
+  return svdupq_n_b16(1, 1, 1, 1, 1, 1, 1, 1);
+}
+
+svbool_t test_svdupq_n_b32_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b32_const
+  // CHECK: ptrue p0.s
+  // CHECK-NEXT: ret
+  return svdupq_n_b32(1, 1, 1, 1);
+}
+
+svbool_t test_svdupq_n_b64_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b64_const
+  // CHECK: ptrue p0.d
+  // CHECK-NEXT: ret
+  return svdupq_n_b64(1, 1);
+}
def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic;
def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic;
-
def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic;
//
return IC.replaceInstUsesWith(II, Insert);
}
+// Fold aarch64.sve.cmpne{_wide}(ptrue(all), dupq_lane(vector.insert(undef, C, 0), 0), dup_x(0))
+// -- i.e. an all-active compare of a dupq-splatted constant vector against
+// zero, which is the pattern emitted for constant svdupq_n_b* -- into either
+// an all-false predicate (all elements zero) or a ptrue of the widest
+// predicate element size that reproduces the constant's bit pattern,
+// converted through svbool to the original result type.
+// Returns None when the operands do not match this pattern.
+static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
+                                                   IntrinsicInst &II) {
+  LLVMContext &Ctx = II.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&II);
+
+  // Check that the predicate is all active
+  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return None;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+  if (PTruePattern != AArch64SVEPredPattern::all)
+    return None;
+
+  // Check that we have a compare of zero..
+  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
+  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+    return None;
+
+  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
+  if (!DupXArg || !DupXArg->isZero())
+    return None;
+
+  // ..against a dupq
+  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+  if (!DupQLane ||
+      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
+    return None;
+
+  // Where the dupq is a lane 0 replicate of a vector insert
+  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
+    return None;
+
+  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
+  if (!VecIns ||
+      VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+    return None;
+
+  // Where the vector insert is a fixed constant vector insert into undef at
+  // index zero
+  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
+    return None;
+
+  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
+    return None;
+
+  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
+  if (!ConstVec)
+    return None;
+
+  // The fixed vector must have one element per lane of the scalable result
+  // (NumElts is then 2, 4, 8 or 16 for a 128-bit chunk).
+  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
+  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
+  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
+    return None;
+
+  unsigned NumElts = VecTy->getNumElements();
+  unsigned PredicateBits = 0;
+
+  // Expand intrinsic operands to a 16-bit byte level predicate
+  for (unsigned I = 0; I < NumElts; ++I) {
+    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
+    if (!Arg)
+      return None;
+    if (!Arg->isZero())
+      PredicateBits |= 1 << (I * (16 / NumElts));
+  }
+
+  // If all bits are zero bail early with an empty predicate
+  if (PredicateBits == 0) {
+    auto *PFalse = Constant::getNullValue(II.getType());
+    PFalse->takeName(&II);
+    return IC.replaceInstUsesWith(II, PFalse);
+  }
+
+  // Calculate largest predicate type used (where byte predicate is largest)
+  // Mask accumulates the within-byte offsets of all set bits; its lowest set
+  // bit (with 8 as a ceiling) is the coarsest element stride, in bytes, that
+  // can still represent every active position.
+  unsigned Mask = 8;
+  for (unsigned I = 0; I < 16; ++I)
+    if ((PredicateBits & (1 << I)) != 0)
+      Mask |= (I % 8);
+
+  unsigned PredSize = Mask & -Mask;
+  auto *PredType = ScalableVectorType::get(
+      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
+
+  // Ensure all relevant bits are set
+  // (an all-true predicate at this element size must have every stride-th
+  // bit active; otherwise the pattern is not a plain ptrue and we bail).
+  for (unsigned I = 0; I < 16; I += PredSize)
+    if ((PredicateBits & (1 << I)) == 0)
+      return None;
+
+  // Emit ptrue at the computed element size, then round-trip through svbool
+  // to match the original intrinsic's predicate type.
+  auto *PTruePat =
+      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
+  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
+                                        {PredType}, {PTruePat});
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
+  auto *ConvertFromSVBool =
+      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                              {II.getType()}, {ConvertToSVBool});
+
+  ConvertFromSVBool->takeName(&II);
+  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
+}
+
static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
IntrinsicInst &II) {
Value *Pg = II.getArgOperand(0);
return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
return instCombineSVEDup(IC, II);
+ case Intrinsic::aarch64_sve_cmpne:
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ return instCombineSVECmpNE(IC, II);
case Intrinsic::aarch64_sve_rdffr:
return instCombineRDFFR(IC, II);
case Intrinsic::aarch64_sve_lasta:
--- /dev/null
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; DUPQ b8
+
+define <vscale x 16 x i1> @dupq_b_0() #0 {
+; All-zero byte pattern: the dupq/cmpne sequence folds to an all-false predicate.
+; CHECK-LABEL: @dupq_b_0(
+; CHECK: ret <vscale x 16 x i1> zeroinitializer
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef,
+                <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+                           i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 16 x i1> %5
+}
+
+define <vscale x 16 x i1> @dupq_b_d() #0 {
+; One active byte per 64-bit granule: rebuilt as ptrue.nxv2i1 widened to an svbool.
+; CHECK-LABEL: @dupq_b_d(
+; CHECK: %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef,
+                <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+                           i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 16 x i1> %5
+}
+
+define <vscale x 16 x i1> @dupq_b_w() #0 {
+; One active byte per 32-bit granule: rebuilt as ptrue.nxv4i1 widened to an svbool.
+; CHECK-LABEL: @dupq_b_w(
+; CHECK: %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef,
+                <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0,
+                           i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 16 x i1> %5
+}
+
+define <vscale x 16 x i1> @dupq_b_h() #0 {
+; One active byte per 16-bit granule: rebuilt as ptrue.nxv8i1 widened to an svbool.
+; CHECK-LABEL: @dupq_b_h(
+; CHECK: %1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+; CHECK-NEXT: ret <vscale x 16 x i1> %2
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef,
+                <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0,
+                           i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0>, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 16 x i1> %5
+}
+
+define <vscale x 16 x i1> @dupq_b_b() #0 {
+; Every byte active: folds directly to ptrue.nxv16i1 with no svbool conversions.
+; CHECK-LABEL: @dupq_b_b(
+; CHECK: %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+; CHECK-NEXT: ret <vscale x 16 x i1> %1
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %2 = tail call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef,
+                <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+                           i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, i64 0)
+  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 16 x i1> %5
+}
+
+; DUPQ b16
+
+define <vscale x 8 x i1> @dupq_h_0() #0 {
+; All-zero i16 pattern: folds to an all-false predicate.
+; CHECK-LABEL: @dupq_h_0(
+; CHECK: ret <vscale x 8 x i1> zeroinitializer
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef,
+                <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i64 0)
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 8 x i1> %5
+}
+
+define <vscale x 8 x i1> @dupq_h_d() #0 {
+; One active i16 per 64-bit granule: ptrue.nxv2i1 routed through svbool back to nxv8i1.
+; CHECK-LABEL: @dupq_h_d(
+; CHECK: %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; CHECK-NEXT: %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+; CHECK-NEXT: ret <vscale x 8 x i1> %3
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef,
+                <8 x i16> <i16 1, i16 0, i16 0, i16 0, i16 1, i16 0, i16 0, i16 0>, i64 0)
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 8 x i1> %5
+}
+
+define <vscale x 8 x i1> @dupq_h_w() #0 {
+; One active i16 per 32-bit granule: ptrue.nxv4i1 routed through svbool back to nxv8i1.
+; CHECK-LABEL: @dupq_h_w(
+; CHECK: %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+; CHECK-NEXT: %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+; CHECK-NEXT: ret <vscale x 8 x i1> %3
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef,
+                <8 x i16> <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>, i64 0)
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 8 x i1> %5
+}
+
+define <vscale x 8 x i1> @dupq_h_h() #0 {
+; Every i16 active: folds directly to ptrue.nxv8i1 with no svbool conversions.
+; CHECK-LABEL: @dupq_h_h(
+; CHECK: %1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: ret <vscale x 8 x i1> %1
+  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %2 = tail call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef,
+                <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i64 0)
+  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 8 x i1> %5
+}
+
+; DUPQ b32
+
+define <vscale x 4 x i1> @dupq_w_0() #0 {
+; All-zero i32 pattern: folds to an all-false predicate.
+; CHECK-LABEL: @dupq_w_0(
+; CHECK: ret <vscale x 4 x i1> zeroinitializer
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_w_d() #0 {
+; One active i32 per 64-bit granule: ptrue.nxv2i1 routed through svbool back to nxv4i1.
+; CHECK-LABEL: @dupq_w_d(
+; CHECK: %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+; CHECK-NEXT: %3 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %2)
+; CHECK-NEXT: ret <vscale x 4 x i1> %3
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 1, i32 0, i32 1, i32 0>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_w_w() #0 {
+; Every i32 active: folds directly to ptrue.nxv4i1 with no svbool conversions.
+; CHECK-LABEL: @dupq_w_w(
+; CHECK: %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: ret <vscale x 4 x i1> %1
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+; DUPQ b64
+
+define <vscale x 2 x i1> @dupq_d_0() #0 {
+; All-zero i64 pattern (non-wide cmpne): folds to an all-false predicate.
+; CHECK-LABEL: @dupq_d_0(
+; CHECK: ret <vscale x 2 x i1> zeroinitializer
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 0, i64 0>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_d_d() #0 {
+; Every i64 active (non-wide cmpne): folds directly to ptrue.nxv2i1.
+; CHECK-LABEL: @dupq_d_d(
+; CHECK: %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: ret <vscale x 2 x i1> %1
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+; Cases that cannot be converted
+
+define <vscale x 2 x i1> @dupq_neg1() #0 {
+; <1,0> is not a repetition of any legal element-size pattern; the cmpne stays.
+; CHECK-LABEL: @dupq_neg1(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 0>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_neg2() #0 {
+; <1,0,0,1> is irregular (not periodic at any element size); no fold.
+; CHECK-LABEL: @dupq_neg2(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 1, i32 0, i32 0, i32 1>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_neg3() #0 {
+; <0,1,0,1> has the low lane of each granule inactive; no legal ptrue matches.
+; CHECK-LABEL: @dupq_neg3(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 0, i32 1, i32 0, i32 1>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_neg4() #0 {
+; <1,1,0,0> is not a valid per-element predicate repetition; no fold.
+; CHECK-LABEL: @dupq_neg4(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 1, i32 1, i32 0, i32 0>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_neg5() #0 {
+; <0,0,0,1> activates only the top lane; no legal ptrue pattern matches.
+; CHECK-LABEL: @dupq_neg5(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef,
+                <4 x i32> <i32 0, i32 0, i32 0, i32 1>, i64 0)
+  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 4 x i1> %5
+}
+
+define <vscale x 4 x i1> @dupq_neg6(i1 %a) #0 {
+; Inserted vector has a run-time element, so it is not a Constant; the fold
+; requires a fully constant pattern and must not fire.
+; CHECK-LABEL: @dupq_neg6(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = zext i1 %a to i32
+  %3 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 poison>, i32 %2, i32 3
+  %4 = tail call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %3, i64 0)
+  %5 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %4 , i64 0)
+  %6 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %7 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %5, <vscale x 2 x i64> %6)
+  ret <vscale x 4 x i1> %7
+}
+
+define <vscale x 2 x i1> @dupq_neg7() #0 {
+; vector.insert at index 1 (fold requires insertion at index 0); no fold.
+; CHECK-LABEL: @dupq_neg7(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 1)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_neg8() #0 {
+; Same as dupq_d_d except the dupq lane index is 1, not 0; no fold.
+; CHECK-LABEL: @dupq_neg8(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 1)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_neg9(<vscale x 2 x i64> %x) #0 {
+; Insertion base is %x rather than undef; no fold.
+; CHECK-LABEL: @dupq_neg9(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %x,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_neg10() #0 {
+; Comparison RHS is a splat of 1, not 0; the cmpne-with-zero fold does not apply.
+; CHECK-LABEL: @dupq_neg10(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_neg11(<vscale x 2 x i1> %pg) #0 {
+; Governing predicate is a function argument, not ptrue(all); no fold.
+; CHECK-LABEL: @dupq_neg11(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1 , i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %4 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %2, <vscale x 2 x i64> %3)
+  ret <vscale x 2 x i1> %4
+}
+
+define <vscale x 2 x i1> @dupq_neg12() #0 {
+; ptrue uses pattern 15 (VL16), not 31 (all); the fold requires an all-true
+; governing predicate and must not fire.
+; CHECK-LABEL: @dupq_neg12(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 15)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 0)
+  %5 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %4)
+  ret <vscale x 2 x i1> %5
+}
+
+define <vscale x 2 x i1> @dupq_neg13(<vscale x 2 x i64> %x) #0 {
+; Comparison RHS is an arbitrary vector %x, not a dup.x zero splat; no fold.
+; CHECK-LABEL: @dupq_neg13(
+; CHECK: cmpne
+; CHECK-NEXT: ret
+  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %2 = tail call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef,
+                <2 x i64> <i64 1, i64 1>, i64 0)
+  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2 , i64 0)
+  %4 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %3, <vscale x 2 x i64> %x)
+  ret <vscale x 2 x i1> %4
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpne.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.cmpne.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+attributes #0 = { "target-features"="+sve" }