class CDEIRInt<string name, list<Type> params = [], bit appendKind = 0>
: IRIntBase<"arm_cde_" # name, params, appendKind>;
+// Class for generating function macros in arm_cde.h:
+// "#define __arm_<name>(<params>) <definition>"
+class FunctionMacro<list<string> params_, string definition_> {
+ list<string> params = params_;
+ string definition = definition_;
+}
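+
+// For example, an illustrative record (not one defined in this file) such as
+//   def vfoo : FunctionMacro<["a", "b"], "__arm_vfoo_impl((a), (b))">;
+// is emitted into arm_cde.h as
+//   #define __arm_vfoo(a, b) __arm_vfoo_impl((a), (b))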
+
// Coprocessor immediate
def imm_coproc : Immediate<sint, IB_ConstRange<0, 7>>;
defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm),
(args u32:$n, u32:$m), (args u64:$n, u64:$m),
(? (bitcast $n, FScalar), (bitcast $m, FScalar))>;
+
+// VCX* instructions operating on Q vector registers
+
+def v16u8 : VecOf<u8>;
+
+let pnt = PNT_None, params = [u8] in
+def vcx1q : CDEIntrinsic<Vector, (args imm_coproc:$cp, imm_12b:$imm),
+ (CDEIRInt<"vcx1q"> $cp, $imm)>;
+
+let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in {
+ def vcx1qa :
+ CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$acc, imm_12b:$imm),
+ (bitcast (CDEIRInt<"vcx1qa"> $cp, (bitcast $acc, v16u8), $imm),
+ Vector)>;
+
+ def vcx2q :
+ CDEIntrinsic<Vector, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
+ (bitcast (CDEIRInt<"vcx2q"> $cp, (bitcast $n, VecOf<u8>), $imm),
+ Vector)>;
+ def vcx2q_u8 :
+ CDEIntrinsic<v16u8, (args imm_coproc:$cp, Vector:$n, imm_7b:$imm),
+ (CDEIRInt<"vcx2q"> $cp, (bitcast $n, VecOf<u8>), $imm)>;
+
+ def vcx2qa_impl :
+ CDEIntrinsic<Vector,
+ (args imm_coproc:$cp, Vector:$acc, v16u8:$n, imm_7b:$imm),
+ (bitcast (CDEIRInt<"vcx2qa"> $cp, (bitcast $acc, v16u8), $n, $imm),
+ Vector)>;
+
+ def vcx3q_impl :
+ CDEIntrinsic<Vector,
+ (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
+ (bitcast (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm),
+ Vector)>;
+ def vcx3q_u8_impl :
+ CDEIntrinsic<v16u8,
+ (args imm_coproc:$cp, Vector:$n, v16u8:$m, imm_4b:$imm),
+ (CDEIRInt<"vcx3q"> $cp, (bitcast $n, v16u8), $m, $imm)>;
+ def vcx3qa_impl :
+ CDEIntrinsic<Vector,
+ (args imm_coproc:$cp, Vector:$acc, v16u8:$n, v16u8:$m, imm_4b:$imm),
+ (bitcast (CDEIRInt<"vcx3qa"> $cp, (bitcast $acc, v16u8), $n, $m,
+ $imm),
+ Vector)>;
+}
+
+// Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3
+// polymorphic parameters.
+let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32],
+ headerOnly = 1, polymorphicOnly = 1 in
+def vreinterpretq_u8 :
+ Intrinsic<v16u8, (args Vector:$x), (vreinterpret $x, v16u8)>;
+
+// We also need vreinterpretq_u8_u8 (an identity operation) so that the macros
+// below can apply __arm_vreinterpretq_u8 to uint8x16_t arguments as well.
+let params = [u8], polymorphicOnly = 1 in
+def vreinterpretq_u8_cde :
+ CDEIntrinsic<v16u8, (args Vector:$x), (id $x)>,
+ NameOverride<"vreinterpretq_u8">;
+
+
+def vcx2qa : FunctionMacro<
+ ["cp", "acc", "n", "imm"],
+ "__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">;
+
+def vcx3q : FunctionMacro<
+ ["cp", "n", "m", "imm"],
+ "__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
+def vcx3q_u8 : FunctionMacro<
+ ["cp", "n", "m", "imm"],
+ "__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">;
+def vcx3qa : FunctionMacro<
+ ["cp", "acc", "n", "m", "imm"],
+ "__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), "
+ "__arm_vreinterpretq_u8(m), (imm))">;
--- /dev/null
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \
+// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \
+// RUN: -target-feature +mve.fp \
+// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \
+// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_cde.h>
+
+// CHECK-LABEL: @test_vcx1q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vcx1q_u8(void) {
+ return __arm_vcx1q_u8(0, 1111);
+}
+
+// CHECK-LABEL: @test_vcx1qa_1(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112)
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test_vcx1qa_1(uint8x16_t acc) {
+ return __arm_vcx1qa(1, acc, 1112);
+}
+
+// CHECK-LABEL: @test_vcx1qa_2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vcx1qa_2(int32x4_t acc) {
+ return __arm_vcx1qa(0, acc, 1113);
+}
+
+// CHECK-LABEL: @test_vcx2q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111)
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vcx2q_u8(float16x8_t n) {
+ return __arm_vcx2q_u8(1, n, 111);
+}
+
+// CHECK-LABEL: @test_vcx2q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vcx2q(float32x4_t n) {
+ return __arm_vcx2q(1, n, 112);
+}
+
+// CHECK-LABEL: @test_vcx2qa(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) {
+ return __arm_vcx2qa(0, acc, n, 113);
+}
+
+// CHECK-LABEL: @test_vcx3q_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) {
+ return __arm_vcx3q_u8(0, n, m, 11);
+}
+
+// CHECK-LABEL: @test_vcx3q(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
+uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) {
+ return __arm_vcx3q(1, n, m, 12);
+}
+
+// CHECK-LABEL: @test_vcx3qa(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13)
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) {
+ return __arm_vcx3qa(1, acc, n, m, 13);
+}
__arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}}
__arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
+
+void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) {
+ (void)__arm_vcx1q_u8(0, 0);
+ __arm_vcx1q_u8(0, a); // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}}
+ __arm_vcx1q_u8(0, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
+ __arm_vcx1qa(0, acc, a); // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}}
+ __arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}}
+
+ (void)__arm_vcx2q_u8(0, n, 0);
+ __arm_vcx2q_u8(0, n, a); // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}}
+ __arm_vcx2q_u8(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
+ __arm_vcx2q(0, n, a); // expected-error {{argument to '__arm_vcx2q' must be a constant integer}}
+ __arm_vcx2q(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
+ __arm_vcx2qa(0, n, acc, a); // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}}
+ __arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}}
+
+ (void)__arm_vcx3q_u8(0, n, m, 0);
+ __arm_vcx3q_u8(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}}
+ __arm_vcx3q_u8(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ __arm_vcx3q(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}}
+ __arm_vcx3q(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ __arm_vcx3qa(0, n, m, acc, a); // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}}
+ __arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+}
}
// -----------------------------------------------------------------------------
+// Class that describes an ACLE intrinsic implemented as a macro.
+//
+// This class is used when an intrinsic is polymorphic in 2 or 3 types: rather
+// than emitting every combination of argument types, the macro reinterprets
+// all but one argument to a fixed type (uint8x16_t) and forwards to a
+// singly-polymorphic _impl intrinsic.
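+//
+// For example, the vcx3q record in arm_cde.td carries
+//   Params     = {"cp", "n", "m", "imm"}
+//   Definition = "__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))"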
+
+class FunctionMacro {
+ std::vector<StringRef> Params;
+ StringRef Definition;
+
+public:
+ FunctionMacro(const Record &R);
+
+ const std::vector<StringRef> &getParams() const { return Params; }
+ StringRef getDefinition() const { return Definition; }
+};
+
+FunctionMacro::FunctionMacro(const Record &R) {
+ Params = R.getValueAsListOfStrings("params");
+ Definition = R.getValueAsString("definition");
+}
+
+// -----------------------------------------------------------------------------
// The class used for generating arm_cde.h and related Clang bits
//
class CdeEmitter : public EmitterBase {
+ std::map<StringRef, FunctionMacro> FunctionMacros;
+
public:
- CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){};
+ CdeEmitter(RecordKeeper &Records);
void EmitHeader(raw_ostream &OS) override;
void EmitBuiltinDef(raw_ostream &OS) override;
void EmitBuiltinSema(raw_ostream &OS) override;
};
+CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) {
+ for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro"))
+ FunctionMacros.emplace(R->getName(), FunctionMacro(*R));
+}
+
void CdeEmitter::EmitHeader(raw_ostream &OS) {
// Accumulate pieces of the header file that will be enabled under various
// different combinations of #ifdef. The index into parts[] is one of the
}
}
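+ // Emit the function-like macros gathered from FunctionMacro records into the
+ // MVE section of the header. For example, the vcx3q record from arm_cde.td
+ // is emitted (shown wrapped here) as
+ //   #define __arm_vcx3q(cp, n, m, imm)
+ //       __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))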
+ for (const auto &kv : FunctionMacros) {
+ StringRef Name = kv.first;
+ const FunctionMacro &FM = kv.second;
+
+ raw_ostream &OS = parts[MVE];
+ OS << "#define "
+ << "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") "
+ << FM.getDefinition() << "\n";
+ }
+
for (auto &part : parts)
part << "\n";
defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;
+multiclass CDEVCXVecIntrinsics<list<LLVMType> args> {
+ def "" : Intrinsic<
+ [llvm_v16i8_ty],
+ !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>;
+ def a : Intrinsic<
+ [llvm_v16i8_ty],
+ !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
+ args, [llvm_i32_ty /* imm */]),
+ [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>;
+}
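+
+// Each instantiation produces a base intrinsic plus an accumulating "a"
+// variant that takes an extra <16 x i8> accumulator operand. As a signature
+// sketch, the int_arm_cde_vcx2q instantiation below yields
+//   <16 x i8> @llvm.arm.cde.vcx2q (i32 immarg, <16 x i8>, i32 immarg)
+//   <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)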
+
+defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>;
+defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>;
+defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>;
+
} // end TargetPrefix
(f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
imm_3b:$imm))>;
}
+
+let Predicates = [HasCDE, HasMVEInt] in {
+ def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
+ (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
+ timm:$imm)),
+ (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
+ imm_7b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
+ (v16i8 MQPR:$m), timm:$imm)),
+ (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ timm:$imm)),
+ (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
+declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
+declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg)
+declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
+declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg)
+declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() {
+; CHECK-LABEL: test_vcx1q_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx1 p0, q0, #1111
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) {
+; CHECK-LABEL: test_vcx1qa_1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx1a p1, q0, #1112
+; CHECK-NEXT: bx lr
+entry:
+ %0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112)
+ ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) {
+; CHECK-LABEL: test_vcx1qa_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx1a p0, q0, #1113
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x i32> %acc to <16 x i8>
+ %1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113)
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) {
+; CHECK-LABEL: test_vcx2q_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx2 p1, q0, q0, #111
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x half> %n to <16 x i8>
+ %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111)
+ ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) {
+; CHECK-LABEL: test_vcx2q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx2 p1, q0, q0, #112
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x float> %n to <16 x i8>
+ %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112)
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) {
+; CHECK-LABEL: test_vcx2qa:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx2a p0, q0, q1, #113
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <4 x float> %acc to <16 x i8>
+ %1 = bitcast <2 x i64> %n to <16 x i8>
+ %2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) {
+; CHECK-LABEL: test_vcx3q_u8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx3 p0, q0, q0, q1, #11
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x i16> %n to <16 x i8>
+ %1 = bitcast <4 x i32> %m to <16 x i8>
+ %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11)
+ ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) {
+; CHECK-LABEL: test_vcx3q:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx3 p1, q0, q0, q1, #12
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <2 x i64> %n to <16 x i8>
+ %1 = bitcast <4 x float> %m to <16 x i8>
+ %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12)
+ %3 = bitcast <16 x i8> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) {
+; CHECK-LABEL: test_vcx3qa:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vcx3a p1, q0, q1, q2, #13
+; CHECK-NEXT: bx lr
+entry:
+ %0 = bitcast <8 x i16> %n to <16 x i8>
+ %1 = bitcast <4 x float> %m to <16 x i8>
+ %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
+ ret <16 x i8> %2
+}