/// <1, 3, 5, 7>).
static bool isDeinterleavingMask(ArrayRef<int> Mask);
+/// Returns true if \p V is a negation; handles both integer and
+/// floating-point negations.
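+///
+/// Both of the following are treated as negations (illustrative IR):
+///   %nf = fneg <4 x float> %x
+///   %ni = sub <4 x i32> zeroinitializer, %x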
+static bool isNeg(Value *V);
+
+/// Returns the operand negated by \p V; \p V must satisfy isNeg.
+static Value *getNegOperand(Value *V);
+
namespace {
class ComplexDeinterleavingLegacyPass : public FunctionPass {
// These two members are required exclusively for generating
// ComplexDeinterleavingOperation::Symmetric operations.
unsigned Opcode;
- FastMathFlags Flags;
+ std::optional<FastMathFlags> Flags;
ComplexDeinterleavingRotation Rotation =
ComplexDeinterleavingRotation::Rotation_0;
/// Return nullptr if it is not possible to construct a complex number.
/// \p Flags are needed to generate symmetric Add and Sub operations.
NodePtr identifyAdditions(std::list<Addend> &RealAddends,
- std::list<Addend> &ImagAddends, FastMathFlags Flags,
+ std::list<Addend> &ImagAddends,
+ std::optional<FastMathFlags> Flags,
NodePtr Accumulator);
/// Extract one addend that has both real and imaginary parts positive.
return true;
}
+bool isNeg(Value *V) {
+  // m_Neg matches the canonical integer negation 'sub 0, X'.
+  return match(V, m_FNeg(m_Value())) || match(V, m_Neg(m_Value()));
+}
+
+Value *getNegOperand(Value *V) {
+  assert(isNeg(V) && "Expected a negation");
+  auto *I = cast<Instruction>(V);
+  if (I->getOpcode() == Instruction::FNeg)
+    return I->getOperand(0);
+
+  // For integer negation ('sub 0, X'), the negated value is operand 1.
+  return I->getOperand(1);
+}
+
bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
ComplexDeinterleavingGraph Graph(TL, TLI);
if (Graph.collectPotentialReductions(B))
return nullptr;
}
- if (Real->getOpcode() != Instruction::FMul ||
- Imag->getOpcode() != Instruction::FMul) {
- LLVM_DEBUG(dbgs() << " - Real or imaginary instruction is not fmul\n");
+ if ((Real->getOpcode() != Instruction::FMul &&
+ Real->getOpcode() != Instruction::Mul) ||
+ (Imag->getOpcode() != Instruction::FMul &&
+ Imag->getOpcode() != Instruction::Mul)) {
+ LLVM_DEBUG(
+ dbgs() << " - Real or imaginary instruction is not fmul or mul\n");
return nullptr;
}
R1 = Op;
}
- if (match(I0, m_Neg(m_Value(Op)))) {
+ if (isNeg(I0)) {
Negs |= 2;
Negs ^= 1;
-    I0 = Op;
+    I0 = getNegOperand(I0);
LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag
<< "\n");
// Determine rotation
+ auto IsAdd = [](unsigned Op) {
+ return Op == Instruction::FAdd || Op == Instruction::Add;
+ };
+ auto IsSub = [](unsigned Op) {
+ return Op == Instruction::FSub || Op == Instruction::Sub;
+ };
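+  // The (Real, Imag) opcode pair encodes the rotation:
+  //   (add, add) -> 0, (sub, add) -> 90, (sub, sub) -> 180, (add, sub) -> 270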
ComplexDeinterleavingRotation Rotation;
- if (Real->getOpcode() == Instruction::FAdd &&
- Imag->getOpcode() == Instruction::FAdd)
+ if (IsAdd(Real->getOpcode()) && IsAdd(Imag->getOpcode()))
Rotation = ComplexDeinterleavingRotation::Rotation_0;
- else if (Real->getOpcode() == Instruction::FSub &&
- Imag->getOpcode() == Instruction::FAdd)
+ else if (IsSub(Real->getOpcode()) && IsAdd(Imag->getOpcode()))
Rotation = ComplexDeinterleavingRotation::Rotation_90;
- else if (Real->getOpcode() == Instruction::FSub &&
- Imag->getOpcode() == Instruction::FSub)
+ else if (IsSub(Real->getOpcode()) && IsSub(Imag->getOpcode()))
Rotation = ComplexDeinterleavingRotation::Rotation_180;
- else if (Real->getOpcode() == Instruction::FAdd &&
- Imag->getOpcode() == Instruction::FSub)
+ else if (IsAdd(Real->getOpcode()) && IsSub(Imag->getOpcode()))
Rotation = ComplexDeinterleavingRotation::Rotation_270;
else {
LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n");
return nullptr;
}
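+  // Integer instructions carry no fast-math flags, so the contract
+  // requirement applies only to floating-point operations.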
- if (!Real->getFastMathFlags().allowContract() ||
- !Imag->getFastMathFlags().allowContract()) {
+ if (isa<FPMathOperator>(Real) &&
+ (!Real->getFastMathFlags().allowContract() ||
+ !Imag->getFastMathFlags().allowContract())) {
LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n");
return nullptr;
}
case Instruction::FSub:
case Instruction::FMul:
case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
return true;
default:
return false;
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
Instruction *Imag) {
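+  // Only additive roots (add/sub and negation) can begin a reassociation
+  // tree; multiplications are collected while walking the addends.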
+ auto IsOperationSupported = [](unsigned Opcode) -> bool {
+ return Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::Add ||
+ Opcode == Instruction::Sub;
+ };
- if ((Real->getOpcode() != Instruction::FAdd &&
- Real->getOpcode() != Instruction::FSub &&
- Real->getOpcode() != Instruction::FNeg) ||
- (Imag->getOpcode() != Instruction::FAdd &&
- Imag->getOpcode() != Instruction::FSub &&
- Imag->getOpcode() != Instruction::FNeg))
+ if (!IsOperationSupported(Real->getOpcode()) ||
+ !IsOperationSupported(Imag->getOpcode()))
return nullptr;
- if (Real->getFastMathFlags() != Imag->getFastMathFlags()) {
- LLVM_DEBUG(
- dbgs()
- << "The flags in Real and Imaginary instructions are not identical\n");
- return nullptr;
- }
+ std::optional<FastMathFlags> Flags;
+ if (isa<FPMathOperator>(Real)) {
+ if (Real->getFastMathFlags() != Imag->getFastMathFlags()) {
+ LLVM_DEBUG(dbgs() << "The flags in Real and Imaginary instructions are "
+ "not identical\n");
+ return nullptr;
+ }
- FastMathFlags Flags = Real->getFastMathFlags();
- if (!Flags.allowReassoc()) {
- LLVM_DEBUG(
- dbgs() << "the 'Reassoc' attribute is missing in the FastMath flags\n");
- return nullptr;
+ Flags = Real->getFastMathFlags();
+ if (!Flags->allowReassoc()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "the 'Reassoc' attribute is missing in the FastMath flags\n");
+ return nullptr;
+ }
}
// Collect multiplications and addend instructions from the given instruction
Addends.emplace_back(I, IsPositive);
continue;
}
-
- if (I->getOpcode() == Instruction::FAdd) {
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::Add:
Worklist.emplace_back(I->getOperand(1), IsPositive);
Worklist.emplace_back(I->getOperand(0), IsPositive);
- } else if (I->getOpcode() == Instruction::FSub) {
+ break;
+ case Instruction::FSub:
Worklist.emplace_back(I->getOperand(1), !IsPositive);
Worklist.emplace_back(I->getOperand(0), IsPositive);
- } else if (I->getOpcode() == Instruction::FMul) {
+ break;
+ case Instruction::Sub:
+ if (isNeg(I)) {
+ Worklist.emplace_back(getNegOperand(I), !IsPositive);
+ } else {
+ Worklist.emplace_back(I->getOperand(1), !IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ }
+ break;
+ case Instruction::FMul:
+ case Instruction::Mul: {
Value *A, *B;
- if (match(I->getOperand(0), m_FNeg(m_Value(A)))) {
+ if (isNeg(I->getOperand(0))) {
+ A = getNegOperand(I->getOperand(0));
IsPositive = !IsPositive;
} else {
A = I->getOperand(0);
}
- if (match(I->getOperand(1), m_FNeg(m_Value(B)))) {
+ if (isNeg(I->getOperand(1))) {
+ B = getNegOperand(I->getOperand(1));
IsPositive = !IsPositive;
} else {
B = I->getOperand(1);
}
Muls.push_back(Product{A, B, IsPositive});
- } else if (I->getOpcode() == Instruction::FNeg) {
+ break;
+ }
+ case Instruction::FNeg:
Worklist.emplace_back(I->getOperand(0), !IsPositive);
- } else {
+ break;
+ default:
Addends.emplace_back(I, IsPositive);
continue;
}
- if (I->getFastMathFlags() != Flags) {
+ if (Flags && I->getFastMathFlags() != *Flags) {
LLVM_DEBUG(dbgs() << "The instruction's fast math flags are "
"inconsistent with the root instructions' flags: "
<< *I << "\n");
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyAdditions(std::list<Addend> &RealAddends,
- std::list<Addend> &ImagAddends,
- FastMathFlags Flags,
- NodePtr Accumulator = nullptr) {
+ComplexDeinterleavingGraph::identifyAdditions(
+ std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends,
+ std::optional<FastMathFlags> Flags, NodePtr Accumulator = nullptr) {
if (RealAddends.size() != ImagAddends.size())
return nullptr;
if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) {
TmpNode = prepareCompositeNode(
ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
- TmpNode->Opcode = Instruction::FAdd;
- TmpNode->Flags = Flags;
+ if (Flags) {
+ TmpNode->Opcode = Instruction::FAdd;
+ TmpNode->Flags = *Flags;
+ } else {
+ TmpNode->Opcode = Instruction::Add;
+ }
} else if (Rotation ==
llvm::ComplexDeinterleavingRotation::Rotation_180) {
TmpNode = prepareCompositeNode(
ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
- TmpNode->Opcode = Instruction::FSub;
- TmpNode->Flags = Flags;
+ if (Flags) {
+ TmpNode->Opcode = Instruction::FSub;
+ TmpNode->Flags = *Flags;
+ } else {
+ TmpNode->Opcode = Instruction::Sub;
+ }
} else {
TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd,
nullptr, nullptr);
}
static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode,
- FastMathFlags Flags, Value *InputA,
- Value *InputB) {
+ std::optional<FastMathFlags> Flags,
+ Value *InputA, Value *InputB) {
Value *I;
switch (Opcode) {
case Instruction::FNeg:
case Instruction::FAdd:
I = B.CreateFAdd(InputA, InputB);
break;
+ case Instruction::Add:
+ I = B.CreateAdd(InputA, InputB);
+ break;
case Instruction::FSub:
I = B.CreateFSub(InputA, InputB);
break;
+ case Instruction::Sub:
+ I = B.CreateSub(InputA, InputB);
+ break;
case Instruction::FMul:
I = B.CreateFMul(InputA, InputB);
break;
+ case Instruction::Mul:
+ I = B.CreateMul(InputA, InputB);
+ break;
default:
llvm_unreachable("Incorrect symmetric opcode");
}
- cast<Instruction>(I)->setFastMathFlags(Flags);
+ if (Flags)
+ cast<Instruction>(I)->setFastMathFlags(*Flags);
return I;
}
}
bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
- return Subtarget->hasSVE() || Subtarget->hasComplxNum();
+ return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
+ Subtarget->hasComplxNum();
}
bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
!llvm::isPowerOf2_32(VTyWidth))
return false;
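+  // SVE2 CADD/CMLA operate on 8-, 16-, 32- and 64-bit integer elements,
+  // hence the scalar-width range checked here.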
+ if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) {
+ unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
+ return 8 <= ScalarWidth && ScalarWidth <= 64;
+ }
+
return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}
Value *Accumulator) const {
VectorType *Ty = cast<VectorType>(InputA->getType());
bool IsScalable = Ty->isScalableTy();
+ bool IsInt = Ty->getElementType()->isIntegerTy();
unsigned TyWidth =
Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
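+    // Constant::getNullValue yields the zero vector for both integer and
+    // floating-point element types, making it a type-agnostic accumulator.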
if (Accumulator == nullptr)
- Accumulator = ConstantFP::get(Ty, 0);
+ Accumulator = Constant::getNullValue(Ty);
if (IsScalable) {
- auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true));
+ if (IsInt)
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_cmla_x, Ty,
+ {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+
+ auto *Mask = B.getAllOnesMask(Ty->getElementCount());
return B.CreateIntrinsic(
Intrinsic::aarch64_sve_fcmla, Ty,
{Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
if (OperationType == ComplexDeinterleavingOperation::CAdd) {
if (IsScalable) {
- auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true));
if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
- Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ Rotation == ComplexDeinterleavingRotation::Rotation_270) {
+ if (IsInt)
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_cadd_x, Ty,
+ {InputA, InputB, B.getInt32((int)Rotation * 90)});
+
+ auto *Mask = B.getAllOnesMask(Ty->getElementCount());
return B.CreateIntrinsic(
Intrinsic::aarch64_sve_fcadd, Ty,
{Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
+ }
return nullptr;
}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform as the type's minimum size is less than 128 bits.
+define <vscale x 4 x i16> @complex_add_v4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: complex_add_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
+; CHECK-NEXT: sub z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z2.d, z4.d
+; CHECK-NEXT: zip2 z2.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
+ %0 = sub <vscale x 2 x i16> %b.real, %a.imag
+ %1 = add <vscale x 2 x i16> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
+ ret <vscale x 4 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i16> @complex_add_v8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: complex_add_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z1.h, z1.h, z0.h, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
+ %0 = sub <vscale x 4 x i16> %b.real, %a.imag
+ %1 = add <vscale x 4 x i16> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
+ ret <vscale x 8 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x i16> @complex_add_v16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-LABEL: complex_add_v16i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z2.h, z2.h, z0.h, #90
+; CHECK-NEXT: cadd z3.h, z3.h, z1.h, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+ %0 = sub <vscale x 8 x i16> %b.real, %a.imag
+ %1 = add <vscale x 8 x i16> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
+ ret <vscale x 16 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 32 x i16> @complex_add_v32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) {
+; CHECK-LABEL: complex_add_v32i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z6.h, z6.h, z2.h, #90
+; CHECK-NEXT: cadd z4.h, z4.h, z0.h, #90
+; CHECK-NEXT: cadd z5.h, z5.h, z1.h, #90
+; CHECK-NEXT: cadd z7.h, z7.h, z3.h, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
+ %0 = sub <vscale x 16 x i16> %b.real, %a.imag
+ %1 = add <vscale x 16 x i16> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
+ ret <vscale x 32 x i16> %interleaved.vec
+}
+
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform as the type's minimum size is less than 128 bits.
+define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
+; CHECK-LABEL: complex_mul_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mul z3.d, z1.d, z0.d
+; CHECK-NEXT: mul z1.d, z1.d, z4.d
+; CHECK-NEXT: mla z3.d, p0/m, z2.d, z4.d
+; CHECK-NEXT: msb z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: zip2 z1.d, z0.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
+ %0 = mul <vscale x 2 x i16> %b.imag, %a.real
+ %1 = mul <vscale x 2 x i16> %b.real, %a.imag
+ %2 = add <vscale x 2 x i16> %1, %0
+ %3 = mul <vscale x 2 x i16> %b.real, %a.real
+ %4 = mul <vscale x 2 x i16> %a.imag, %b.imag
+ %5 = sub <vscale x 2 x i16> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
+ ret <vscale x 4 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i16> @complex_mul_v8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: complex_mul_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #0
+; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
+ %0 = mul <vscale x 4 x i16> %b.imag, %a.real
+ %1 = mul <vscale x 4 x i16> %b.real, %a.imag
+ %2 = add <vscale x 4 x i16> %1, %0
+ %3 = mul <vscale x 4 x i16> %b.real, %a.real
+ %4 = mul <vscale x 4 x i16> %a.imag, %b.imag
+ %5 = sub <vscale x 4 x i16> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
+ ret <vscale x 8 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x i16> @complex_mul_v16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-LABEL: complex_mul_v16i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.h, #0 // =0x0
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: cmla z4.h, z3.h, z1.h, #0
+; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #0
+; CHECK-NEXT: cmla z4.h, z3.h, z1.h, #90
+; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+ %0 = mul <vscale x 8 x i16> %b.imag, %a.real
+ %1 = mul <vscale x 8 x i16> %b.real, %a.imag
+ %2 = add <vscale x 8 x i16> %1, %0
+ %3 = mul <vscale x 8 x i16> %b.real, %a.real
+ %4 = mul <vscale x 8 x i16> %a.imag, %b.imag
+ %5 = sub <vscale x 8 x i16> %3, %4
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
+ ret <vscale x 16 x i16> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 32 x i16> @complex_mul_v32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) {
+; CHECK-LABEL: complex_mul_v32i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.h, #0 // =0x0
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: cmla z25.h, z4.h, z0.h, #0
+; CHECK-NEXT: cmla z26.h, z5.h, z1.h, #0
+; CHECK-NEXT: cmla z27.h, z6.h, z2.h, #0
+; CHECK-NEXT: cmla z24.h, z7.h, z3.h, #0
+; CHECK-NEXT: cmla z25.h, z4.h, z0.h, #90
+; CHECK-NEXT: cmla z26.h, z5.h, z1.h, #90
+; CHECK-NEXT: cmla z27.h, z6.h, z2.h, #90
+; CHECK-NEXT: cmla z24.h, z7.h, z3.h, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
+ %0 = mul <vscale x 16 x i16> %b.imag, %a.real
+ %1 = mul <vscale x 16 x i16> %b.real, %a.imag
+ %2 = add <vscale x 16 x i16> %1, %0
+ %3 = mul <vscale x 16 x i16> %b.real, %a.real
+ %4 = mul <vscale x 16 x i16> %a.imag, %b.imag
+ %5 = sub <vscale x 16 x i16> %3, %4
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
+ ret <vscale x 32 x i16> %interleaved.vec
+}
+
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 4 x i32> @complex_add_v4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: complex_add_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z1.s, z1.s, z0.s, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
+ %0 = sub <vscale x 2 x i32> %b.real, %a.imag
+ %1 = add <vscale x 2 x i32> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
+ ret <vscale x 4 x i32> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i32> @complex_add_v8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) {
+; CHECK-LABEL: complex_add_v8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z2.s, z2.s, z0.s, #90
+; CHECK-NEXT: cadd z3.s, z3.s, z1.s, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
+ %0 = sub <vscale x 4 x i32> %b.real, %a.imag
+ %1 = add <vscale x 4 x i32> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
+ ret <vscale x 8 x i32> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x i32> @complex_add_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) {
+; CHECK-LABEL: complex_add_v16i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z6.s, z6.s, z2.s, #90
+; CHECK-NEXT: cadd z4.s, z4.s, z0.s, #90
+; CHECK-NEXT: cadd z5.s, z5.s, z1.s, #90
+; CHECK-NEXT: cadd z7.s, z7.s, z3.s, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
+ %0 = sub <vscale x 8 x i32> %b.real, %a.imag
+ %1 = add <vscale x 8 x i32> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
+ ret <vscale x 16 x i32> %interleaved.vec
+}
+
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 4 x i32> @complex_mul_v4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: complex_mul_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #0
+; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
+ %0 = mul <vscale x 2 x i32> %b.imag, %a.real
+ %1 = mul <vscale x 2 x i32> %b.real, %a.imag
+ %2 = add <vscale x 2 x i32> %1, %0
+ %3 = mul <vscale x 2 x i32> %b.real, %a.real
+ %4 = mul <vscale x 2 x i32> %a.imag, %b.imag
+ %5 = sub <vscale x 2 x i32> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
+ ret <vscale x 4 x i32> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i32> @complex_mul_v8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) {
+; CHECK-LABEL: complex_mul_v8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: cmla z4.s, z3.s, z1.s, #0
+; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #0
+; CHECK-NEXT: cmla z4.s, z3.s, z1.s, #90
+; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
+ %0 = mul <vscale x 4 x i32> %b.imag, %a.real
+ %1 = mul <vscale x 4 x i32> %b.real, %a.imag
+ %2 = add <vscale x 4 x i32> %1, %0
+ %3 = mul <vscale x 4 x i32> %b.real, %a.real
+ %4 = mul <vscale x 4 x i32> %a.imag, %b.imag
+ %5 = sub <vscale x 4 x i32> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
+ ret <vscale x 8 x i32> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x i32> @complex_mul_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) {
+; CHECK-LABEL: complex_mul_v16i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.s, #0 // =0x0
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: cmla z25.s, z4.s, z0.s, #0
+; CHECK-NEXT: cmla z26.s, z5.s, z1.s, #0
+; CHECK-NEXT: cmla z27.s, z6.s, z2.s, #0
+; CHECK-NEXT: cmla z24.s, z7.s, z3.s, #0
+; CHECK-NEXT: cmla z25.s, z4.s, z0.s, #90
+; CHECK-NEXT: cmla z26.s, z5.s, z1.s, #90
+; CHECK-NEXT: cmla z27.s, z6.s, z2.s, #90
+; CHECK-NEXT: cmla z24.s, z7.s, z3.s, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
+ %0 = mul <vscale x 8 x i32> %b.imag, %a.real
+ %1 = mul <vscale x 8 x i32> %b.real, %a.imag
+ %2 = add <vscale x 8 x i32> %1, %0
+ %3 = mul <vscale x 8 x i32> %b.real, %a.real
+ %4 = mul <vscale x 8 x i32> %a.imag, %b.imag
+ %5 = sub <vscale x 8 x i32> %3, %4
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
+ ret <vscale x 16 x i32> %interleaved.vec
+}
+
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 2 x i64> @complex_add_v2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: complex_add_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z1.d, z1.d, z0.d, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
+ %0 = sub <vscale x 1 x i64> %b.real, %a.imag
+ %1 = add <vscale x 1 x i64> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
+ ret <vscale x 2 x i64> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 4 x i64> @complex_add_v4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) {
+; CHECK-LABEL: complex_add_v4i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z2.d, z2.d, z0.d, #90
+; CHECK-NEXT: cadd z3.d, z3.d, z1.d, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
+ %0 = sub <vscale x 2 x i64> %b.real, %a.imag
+ %1 = add <vscale x 2 x i64> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
+ ret <vscale x 4 x i64> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i64> @complex_add_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
+; CHECK-LABEL: complex_add_v8i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z6.d, z6.d, z2.d, #90
+; CHECK-NEXT: cadd z4.d, z4.d, z0.d, #90
+; CHECK-NEXT: cadd z5.d, z5.d, z1.d, #90
+; CHECK-NEXT: cadd z7.d, z7.d, z3.d, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
+ %0 = sub <vscale x 4 x i64> %b.real, %a.imag
+ %1 = add <vscale x 4 x i64> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
+ ret <vscale x 8 x i64> %interleaved.vec
+}
+
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 2 x i64> @complex_mul_v2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: complex_mul_v2i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #0
+; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
+ %0 = mul <vscale x 1 x i64> %b.imag, %a.real
+ %1 = mul <vscale x 1 x i64> %b.real, %a.imag
+ %2 = add <vscale x 1 x i64> %1, %0
+ %3 = mul <vscale x 1 x i64> %b.real, %a.real
+ %4 = mul <vscale x 1 x i64> %a.imag, %b.imag
+ %5 = sub <vscale x 1 x i64> %3, %4
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
+ ret <vscale x 2 x i64> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 4 x i64> @complex_mul_v4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) {
+; CHECK-LABEL: complex_mul_v4i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.d, #0 // =0x0
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: cmla z4.d, z3.d, z1.d, #0
+; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #0
+; CHECK-NEXT: cmla z4.d, z3.d, z1.d, #90
+; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
+ %0 = mul <vscale x 2 x i64> %b.imag, %a.real
+ %1 = mul <vscale x 2 x i64> %b.real, %a.imag
+ %2 = add <vscale x 2 x i64> %1, %0
+ %3 = mul <vscale x 2 x i64> %b.real, %a.real
+ %4 = mul <vscale x 2 x i64> %a.imag, %b.imag
+ %5 = sub <vscale x 2 x i64> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
+ ret <vscale x 4 x i64> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i64> @complex_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
+; CHECK-LABEL: complex_mul_v8i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #0
+; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #0
+; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #0
+; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #0
+; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #90
+; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #90
+; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #90
+; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
+ %0 = mul <vscale x 4 x i64> %b.imag, %a.real
+ %1 = mul <vscale x 4 x i64> %b.real, %a.imag
+ %2 = add <vscale x 4 x i64> %1, %0
+ %3 = mul <vscale x 4 x i64> %b.real, %a.real
+ %4 = mul <vscale x 4 x i64> %a.imag, %b.imag
+ %5 = sub <vscale x 4 x i64> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
+ ret <vscale x 8 x i64> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x i64> @complex_minus_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
+; CHECK-LABEL: complex_minus_mul_v8i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #270
+; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #270
+; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #270
+; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #270
+; CHECK-NEXT: cmla z25.d, z4.d, z0.d, #180
+; CHECK-NEXT: cmla z26.d, z5.d, z1.d, #180
+; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #180
+; CHECK-NEXT: cmla z24.d, z7.d, z3.d, #180
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
+ %0 = sub <vscale x 4 x i64> zeroinitializer, %a.real
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
+ %1 = mul <vscale x 4 x i64> %b.real, %0
+ %2 = mul <vscale x 4 x i64> %b.imag, %a.imag
+ %3 = add <vscale x 4 x i64> %2, %1
+ %4 = mul <vscale x 4 x i64> %b.real, %a.imag
+ %5 = mul <vscale x 4 x i64> %b.imag, %0
+ %6 = sub <vscale x 4 x i64> %5, %4
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
+ ret <vscale x 8 x i64> %interleaved.vec
+}
+
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve2 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform as the type's minimum size is less than 128 bits.
+define <vscale x 8 x i8> @complex_add_v8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: complex_add_v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z3.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z4.s, z0.s, z2.s
+; CHECK-NEXT: uzp2 z0.s, z0.s, z2.s
+; CHECK-NEXT: uzp2 z2.s, z1.s, z3.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z3.s
+; CHECK-NEXT: sub z0.s, z1.s, z0.s
+; CHECK-NEXT: add z1.s, z2.s, z4.s
+; CHECK-NEXT: zip2 z2.s, z0.s, z1.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
+ %a.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
+ %b.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 1
+ %0 = sub <vscale x 4 x i8> %b.real, %a.imag
+ %1 = add <vscale x 4 x i8> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
+ ret <vscale x 8 x i8> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x i8> @complex_add_v16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: complex_add_v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z1.b, z1.b, z0.b, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
+ %a.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
+ %b.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 1
+ %0 = sub <vscale x 8 x i8> %b.real, %a.imag
+ %1 = add <vscale x 8 x i8> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
+ ret <vscale x 16 x i8> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 32 x i8> @complex_add_v32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-LABEL: complex_add_v32i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cadd z2.b, z2.b, z0.b, #90
+; CHECK-NEXT: cadd z3.b, z3.b, z1.b, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
+ %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
+ %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+ %0 = sub <vscale x 16 x i8> %b.real, %a.imag
+ %1 = add <vscale x 16 x i8> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
+ ret <vscale x 32 x i8> %interleaved.vec
+}
+
+declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
+
+declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)