return R;
}
-// Emit code to clear the padding bits when returning or passing as an argument
-// a 16-bit floating-point value.
-llvm::Value *CodeGenFunction::EmitCMSEClearFP16(llvm::Value *Src) {
- llvm::Type *RetTy = Src->getType();
- assert(RetTy->isFloatTy() ||
- (RetTy->isIntegerTy() && RetTy->getIntegerBitWidth() == 32));
- if (RetTy->isFloatTy()) {
- llvm::Value *T0 = Builder.CreateBitCast(Src, Builder.getIntNTy(32));
- llvm::Value *T1 = Builder.CreateAnd(T0, 0xffff, "cmse.clear");
- return Builder.CreateBitCast(T1, RetTy);
- }
- return Builder.CreateAnd(Src, 0xffff, "cmse.clear");
-}
-
void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI,
bool EmitRetDbgLoc,
SourceLocation EndLoc) {
if (CurFuncDecl && CurFuncDecl->hasAttr<CmseNSEntryAttr>()) {
// For certain return types, clear padding bits, as they may reveal
// sensitive information.
- const Type *RTy = RetTy.getCanonicalType().getTypePtr();
- if (RTy->isFloat16Type() || RTy->isHalfType()) {
- // 16-bit floating-point types are passed in a 32-bit integer or float,
- // with unspecified upper bits.
- RV = EmitCMSEClearFP16(RV);
- } else {
- // Small struct/union types are passed as integers.
- auto *ITy = dyn_cast<llvm::IntegerType>(RV->getType());
- if (ITy != nullptr && isa<RecordType>(RetTy.getCanonicalType()))
- RV = EmitCMSEClearRecord(RV, ITy, RetTy);
- }
+ // Small struct/union types are passed as integers.
+ auto *ITy = dyn_cast<llvm::IntegerType>(RV->getType());
+ if (ITy != nullptr && isa<RecordType>(RetTy.getCanonicalType()))
+ RV = EmitCMSEClearRecord(RV, ITy, RetTy);
}
EmitReturnValueCheck(RV);
Ret = Builder.CreateRet(RV);
if (CallInfo.isCmseNSCall()) {
// For certain parameter types, clear padding bits, as they may reveal
// sensitive information.
- const Type *PTy = I->Ty.getCanonicalType().getTypePtr();
- // 16-bit floating-point types are passed in a 32-bit integer or
- // float, with unspecified upper bits.
- if (PTy->isFloat16Type() || PTy->isHalfType()) {
- Load = EmitCMSEClearFP16(Load);
- } else {
- // Small struct/union types are passed as integer arrays.
- auto *ATy = dyn_cast<llvm::ArrayType>(Load->getType());
- if (ATy != nullptr && isa<RecordType>(I->Ty.getCanonicalType()))
- Load = EmitCMSEClearRecord(Load, ATy, I->Ty);
- }
+ // Small struct/union types are passed as integer arrays.
+ auto *ATy = dyn_cast<llvm::ArrayType>(Load->getType());
+ if (ATy != nullptr && isa<RecordType>(I->Ty.getCanonicalType()))
+ Load = EmitCMSEClearRecord(Load, ATy, I->Ty);
}
IRCallArgs[FirstIRArg] = Load;
}
QualType RTy);
llvm::Value *EmitCMSEClearRecord(llvm::Value *V, llvm::ArrayType *ATy,
QualType RTy);
- llvm::Value *EmitCMSEClearFP16(llvm::Value *V);
llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
unsigned LLVMIntrinsic,
if (isIllegalVectorType(Ty))
return coerceIllegalVector(Ty);
- // _Float16 and __fp16 get passed as if it were an int or float, but
- // with the top 16 bits unspecified. This is not done for OpenCL as it handles
- // the half type natively, and does not need to interwork with AAPCS code.
- if ((Ty->isFloat16Type() || Ty->isHalfType()) &&
- !getContext().getLangOpts().NativeHalfArgsAndReturns) {
- llvm::Type *ResType = IsAAPCS_VFP ?
- llvm::Type::getFloatTy(getVMContext()) :
- llvm::Type::getInt32Ty(getVMContext());
- return ABIArgInfo::getDirect(ResType);
- }
-
// __bf16 gets passed using the bfloat IR type, or using i32 but
// with the top 16 bits unspecified.
if (Ty->isBFloat16Type() && IsFloatABISoftFP) {
return coerceIllegalVector(RetTy);
}
- // _Float16 and __fp16 get returned as if it were an int or float, but with
- // the top 16 bits unspecified. This is not done for OpenCL as it handles the
- // half type natively, and does not need to interwork with AAPCS code.
- if ((RetTy->isFloat16Type() || RetTy->isHalfType()) &&
- !getContext().getLangOpts().NativeHalfArgsAndReturns) {
- llvm::Type *ResType = IsAAPCS_VFP ?
- llvm::Type::getFloatTy(getVMContext()) :
- llvm::Type::getInt32Ty(getVMContext());
- return ABIArgInfo::getDirect(ResType);
- }
-
// if we're using the softfp float abi, __bf16 get returned as if it were an
// int but with the top 16 bits unspecified.
if (RetTy->isBFloat16Type()) {
// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=NATIVE
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=NATIVE
__fp16 g;
void t1(__fp16 a) { g = a; }
-// SOFT: define void @t1(i32 [[PARAM:%.*]])
-// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
-// HARD: define arm_aapcs_vfpcc void @t1(float [[PARAM:%.*]])
-// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
-// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
-// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @g to i16*)
+// SOFT: define void @t1(half [[PARAM:%.*]])
+// HARD: define arm_aapcs_vfpcc void @t1(half [[PARAM:%.*]])
// NATIVE: define void @t1(half [[PARAM:%.*]])
-// NATIVE: store half [[PARAM]], half* @g
+// CHECK: store half [[PARAM]], half* @g
__fp16 t2() { return g; }
-// SOFT: define i32 @t2()
-// HARD: define arm_aapcs_vfpcc float @t2()
+// SOFT: define half @t2()
+// HARD: define arm_aapcs_vfpcc half @t2()
// NATIVE: define half @t2()
-// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @g to i16*)
-// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
-// SOFT: ret i32 [[ZEXT]]
-// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
-// HARD: ret float [[BITCAST]]
-// NATIVE: [[LOAD:%.*]] = load half, half* @g
-// NATIVE: ret half [[LOAD]]
+// CHECK: [[LOAD:%.*]] = load half, half* @g
+// CHECK: ret half [[LOAD]]
_Float16 h;
void t3(_Float16 a) { h = a; }
-// SOFT: define void @t3(i32 [[PARAM:%.*]])
-// SOFT: [[TRUNC:%.*]] = trunc i32 [[PARAM]] to i16
-// HARD: define arm_aapcs_vfpcc void @t3(float [[PARAM:%.*]])
-// HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
-// HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
-// CHECK: store i16 [[TRUNC]], i16* bitcast (half* @h to i16*)
+// SOFT: define void @t3(half [[PARAM:%.*]])
+// HARD: define arm_aapcs_vfpcc void @t3(half [[PARAM:%.*]])
// NATIVE: define void @t3(half [[PARAM:%.*]])
-// NATIVE: store half [[PARAM]], half* @h
+// CHECK: store half [[PARAM]], half* @h
_Float16 t4() { return h; }
-// SOFT: define i32 @t4()
-// HARD: define arm_aapcs_vfpcc float @t4()
+// SOFT: define half @t4()
+// HARD: define arm_aapcs_vfpcc half @t4()
// NATIVE: define half @t4()
-// CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @h to i16*)
-// CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
-// SOFT: ret i32 [[ZEXT]]
-// HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
-// HARD: ret float [[BITCAST]]
-// NATIVE: [[LOAD:%.*]] = load half, half* @h
-// NATIVE: ret half [[LOAD]]
+// CHECK: [[LOAD:%.*]] = load half, half* @h
+// CHECK: ret half [[LOAD]]
// CHECK-LABEL: @test_vcmpeqq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpeqq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpeqq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpeqq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vcmpneq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpneq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpneq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp une <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpneq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vcmpgeq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpgeq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpgeq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp oge <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpgeq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vcmpgtq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpgtq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpgtq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpgtq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vcmpleq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpleq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpleq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpleq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vcmpltq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpltq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vcmpltq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP5:%.*]] = and <8 x i1> [[TMP3]], [[TMP4]]
-// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP5]])
-// CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-// CHECK-NEXT: ret i16 [[TMP7]]
+// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT: ret i16 [[TMP5]]
//
mve_pred16_t test_vcmpltq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @_Z18test_vcmpeqq_n_f1619__simd128_float16_tDh(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[B:%.*]] = alloca half, align 2
-// CHECK-NEXT: [[TMP:%.*]] = alloca float, align 4
-// CHECK-NEXT: store float [[B_COERCE:%.*]], float* [[TMP]], align 4
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[B]] to i8*
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[TMP0]], i8* align 4 [[TMP1]], i32 2, i1 false)
-// CHECK-NEXT: [[B1:%.*]] = load half, half* [[B]], align 2
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-// CHECK-NEXT: ret i16 [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT: ret i16 [[TMP2]]
//
mve_pred16_t test_vcmpeqq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vdupq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[DOTSPLAT]]
//
// CHECK-LABEL: @test_vdupq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x half> [[DOTSPLAT]], <8 x half> [[INACTIVE:%.*]]
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[DOTSPLAT]], <8 x half> [[INACTIVE:%.*]]
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vdupq_m_n_f16(float16x8_t inactive, float16_t a, mve_pred16_t p)
{
// CHECK-LABEL: @test_vdupq_x_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
// CHECK-NEXT: ret <8 x half> [[DOTSPLAT]]
//
// CHECK-LABEL: @test_vgetq_lane_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[A:%.*]], i32 2
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[TMP0]] to i16
-// CHECK-NEXT: [[TMP_0_INSERT_EXT:%.*]] = zext i16 [[TMP1]] to i32
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP_0_INSERT_EXT]] to float
-// CHECK-NEXT: ret float [[TMP2]]
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vgetq_lane_f16(float16x8_t a)
{
// CHECK-LABEL: @test_vsetq_lane_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[B:%.*]], half [[TMP1]], i32 4
-// CHECK-NEXT: ret <8 x half> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x half> [[B:%.*]], half [[A:%.*]], i32 4
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vsetq_lane_f16(float16_t a, float16x8_t b)
{
// CHECK-LABEL: @test_vfmaq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
-// CHECK-NEXT: ret <8 x half> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vfmasq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
-// CHECK-NEXT: ret <8 x half> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmasq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vqdmlahq_n_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
-// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]])
// CHECK-NEXT: ret <16 x i8> [[TMP1]]
//
int8x16_t test_vqdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
// CHECK-LABEL: @test_vqdmlahq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
-// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]])
// CHECK-NEXT: ret <8 x i16> [[TMP1]]
//
int16x8_t test_vqdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqdmlahq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vqdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vqrdmlahq_n_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
-// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]])
// CHECK-NEXT: ret <16 x i8> [[TMP1]]
//
int8x16_t test_vqrdmlahq_n_s8(int8x16_t a, int8x16_t b, int8_t c) {
// CHECK-LABEL: @test_vqrdmlahq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
-// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]])
// CHECK-NEXT: ret <8 x i16> [[TMP1]]
//
int16x8_t test_vqrdmlahq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
// CHECK-LABEL: @test_vqrdmlahq_n_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
int32x4_t test_vqrdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
// CHECK-LABEL: @test_vfmaq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x half> [[A:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vfmaq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vfmasq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[C_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[C:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vfmasq_m_n_f16(float16x8_t a, float16x8_t b, float16_t c, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
int8x16_t test_vmlaq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
int16x8_t test_vmlaq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vmlaq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
uint8x16_t test_vmlaq_m_n_u8(uint8x16_t a, uint8x16_t b, uint8_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
uint16x8_t test_vmlaq_m_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c, mve_pred16_t p) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
uint32x4_t test_vmlaq_m_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
int8x16_t test_vqdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
int16x8_t test_vqdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]])
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
int8x16_t test_vqrdmlahq_m_n_s8(int8x16_t a, int8x16_t b, int8_t c, mve_pred16_t p) {
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[C:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], <8 x i16> [[A:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
int16x8_t test_vqrdmlahq_m_n_s16(int16x8_t a, int16x8_t b, int16_t c, mve_pred16_t p) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[C:%.*]], <4 x i1> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqrdmlahq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred16_t p) {
// CHECK-LABEL: @test_vaddq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: ret <8 x half> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vaddq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vaddq_x_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.add.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vaddq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vminnmvq_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vminnmvq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vminnmavq_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vminnmavq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vmaxnmvq_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vmaxnmvq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vmaxnmavq_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: ret half [[TMP0]]
//
float16_t test_vmaxnmavq_f16(float16_t a, float16x8_t b) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vminnmvq_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP7]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret half [[TMP2]]
//
float16_t test_vminnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vminnmavq_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP7]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret half [[TMP2]]
//
float16_t test_vminnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vmaxnmvq_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP7]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret half [[TMP2]]
//
float16_t test_vmaxnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vmaxnmavq_p_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
-// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
-// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
-// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
-// CHECK-NEXT: ret float [[TMP7]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]])
+// CHECK-NEXT: ret half [[TMP2]]
//
float16_t test_vmaxnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
#ifdef POLYMORPHIC
// CHECK-LABEL: @test_vmulq_m_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> [[INACTIVE:%.*]])
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vmulq_m_n_f16(float16x8_t inactive, float16x8_t a, float16_t b, mve_pred16_t p)
{
// CHECK-LABEL: @test_vsubq_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
-// CHECK-NEXT: ret <8 x half> [[TMP2]]
+// CHECK-NEXT: [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vsubq_n_f16(float16x8_t a, float16_t b)
{
// CHECK-LABEL: @test_vsubq_x_n_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[B_COERCE:%.*]] to i32
-// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[B:%.*]], i32 0
// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
-// CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP3]], <8 x half> undef)
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.sub.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[DOTSPLAT]], <8 x i1> [[TMP1]], <8 x half> undef)
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
//
float16x8_t test_vsubq_x_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
{
+++ /dev/null
-// RUN: %clang_cc1 -triple thumbv8m.main -O0 -mcmse -S -emit-llvm \
-// RUN: -fallow-half-arguments-and-returns %s -o - | \
-// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOPT-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main -O2 -mcmse -S -emit-llvm \
-// RUN: -fallow-half-arguments-and-returns %s -o - | \
-// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-OPT-SOFT
-// RUN: %clang_cc1 -triple thumbv8m.main -O0 -mcmse -S -emit-llvm \
-// RUN: -fallow-half-arguments-and-returns -mfloat-abi hard %s -o - | \
-// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOPT-HARD
-// RUN: %clang_cc1 -triple thumbv8m.main -O2 -mcmse -S -emit-llvm \
-// RUN: -fallow-half-arguments-and-returns -mfloat-abi hard %s -o - | \
-// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-OPT-HARD
-
-__fp16 g0();
-__attribute__((cmse_nonsecure_entry)) __fp16 f0() {
- return g0();
-}
-// CHECK: define {{.*}}@f0()
-
-// CHECK-NOPT-SOFT: %[[V0:.*]] = load i32
-// CHECK-NOPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-SOFT: ret i32 %[[V1]]
-
-// CHECK-OPT-SOFT: %[[V0:.*]] = tail call {{.*}} @g0
-// CHECK-OPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-OPT-SOFT: ret i32 %[[V1]]
-
-// CHECK-NOPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-NOPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-NOPT-HARD: ret float %[[V2]]
-
-// CHECK-OPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-OPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-OPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-OPT-HARD: ret float %[[V2]]
-
-void __attribute__((cmse_nonsecure_call)) (*g1)(__fp16);
-__fp16 x;
-void f1() {
- g1(x);
-}
-// CHECK: define {{.*}}@f1()
-
-// CHECK-NOPT-SOFT: %[[V0:.*]] = load i32
-// CHECK-NOPT-SOFT: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-SOFT: call {{.*}} void {{.*}}(i32 %[[V1]])
-
-// CHECK-OPT-SOFT: %[[V1:.*]] = zext i16 {{.*}} to i32
-// CHECK-OPT-SOFT: call {{.*}} void {{.*}}(i32 %[[V1]])
-
-// CHECK-NOPT-HARD: %[[V0:.*]] = bitcast float {{.*}} to i32
-// CHECK-NOPT-HARD: %[[V1:.*]] = and i32 %[[V0]], 65535
-// CHECK-NOPT-HARD: %[[V2:.*]] = bitcast i32 %[[V1]] to float
-// CHECK-NOPT-HARD: call {{.*}}(float %[[V2]])
-
-// CHECK-OPT-HARD: %[[V0:.*]] = zext i16 {{.*}} to i32
-// CHECK-OPT-HARD: %[[V1:.*]] = bitcast i32 %[[V0]] to float
-// CHECK-OPT-HARD: call {{.*}}(float %[[V1]])