CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+ // Floating point types returned as "direct" go into F1 .. F8; note that
+ // only the ELFv2 ABI fully utilizes all these registers.
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
- CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
- CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
-
- // Vector types are always returned in V2.
- CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
- CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+ // Vector types returned as "direct" go into V2 .. V9; note that only the
+ // ELFv2 ABI fully utilizes all these registers.
+ CCIfType<[v16i8, v8i16, v4i32, v4f32],
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+ CCIfType<[v2f64, v2i64],
+ CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
]>;
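
For illustration, a minimal C-level sketch of what the extended return table enables; the struct and function names are hypothetical, and the register mapping in the comments assumes the ELFv2 rules above:

    // Hypothetical ELFv2 homogeneous float aggregate: returned "direct" as
    // [4 x float], so the table above assigns its elements to F1..F4 instead
    // of returning through an sret pointer.
    struct HFA4 {
      float a, b, c, d; // -> F1, F2, F3, F4
    };

    HFA4 make_hfa(float x) {
      return {x, x, x, x}; // all four elements travel back in FPRs
    }

    int main() {
      return make_hfa(1.0f).a == 1.0f ? 0 : 1;
    }
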
CCIfType<[i32], CCPromoteToType<i64>>,
CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
- CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
- CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
- CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
- CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+ CCIfType<[v16i8, v8i16, v4i32, v4f32],
+ CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+ CCIfType<[v2f64, v2i64],
+ CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
]>;
//===----------------------------------------------------------------------===//
unsigned ArgSize = ArgVT.getStoreSize();
if (Flags.isByVal())
ArgSize = Flags.getByValSize();
- ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+ // Round up to multiples of the pointer size, except for array members,
+ // which are always packed.
+ if (!Flags.isInConsecutiveRegs())
+ ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
return ArgSize;
}
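
A standalone sketch of the size rule above (illustrative names; PtrByteSize assumed to be 8 as on ppc64): it shows why an f32 array member consumes only 4 bytes of the parameter save area while a scalar f32 still takes a full doubleword.

    #include <cassert>

    // Mirrors CalculateStackSlotSize: round up to the pointer size unless the
    // argument is an array member passed in consecutive registers, which
    // stays packed at its natural size.
    unsigned stackSlotSize(unsigned ArgSize, bool InConsecutiveRegs,
                           unsigned PtrByteSize = 8) {
      if (!InConsecutiveRegs)
        ArgSize = ((ArgSize + PtrByteSize - 1) / PtrByteSize) * PtrByteSize;
      return ArgSize;
    }

    int main() {
      assert(stackSlotSize(4, /*InConsecutiveRegs=*/false) == 8); // scalar f32
      assert(stackSlotSize(4, /*InConsecutiveRegs=*/true) == 4);  // f32 array member
    }
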
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
unsigned Align = PtrByteSize;
}
}
+ // Array members are always packed to their original alignment.
+ if (Flags.isInConsecutiveRegs()) {
+ // If the array member was split into multiple registers, the first
+ // needs to be aligned to the size of the full type. (Except for
+ // ppcf128, which is only aligned as its f64 components.)
+ if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+ Align = OrigVT.getStoreSize();
+ else
+ Align = ArgVT.getStoreSize();
+ }
+
return Align;
}
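
The alignment rules are easiest to see with concrete numbers; the following sketch (hypothetical helper working on byte sizes instead of EVTs) applies the same three cases:

    #include <cassert>

    // Mirrors CalculateStackSlotAlignment for array members: a split member
    // is aligned to its full original type, except ppcf128, which is aligned
    // only as its two f64 halves; unsplit members use their own store size.
    unsigned arrayMemberAlign(unsigned PieceSize, unsigned OrigSize,
                              bool IsSplit, bool IsPPCF128) {
      if (IsSplit && !IsPPCF128)
        return OrigSize;
      return PieceSize;
    }

    int main() {
      assert(arrayMemberAlign(8, 16, true, false) == 16); // [N x i128] member
      assert(arrayMemberAlign(8, 16, true, true) == 8);   // [N x ppc_fp128] member
      assert(arrayMemberAlign(4, 4, false, false) == 4);  // [N x float] member
    }
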
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
-static bool CalculateStackSlotUsed(EVT ArgVT, ISD::ArgFlagsTy Flags,
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
unsigned PtrByteSize,
unsigned LinkageSize,
unsigned ParamAreaSize,
bool UseMemory = false;
// Respect alignment of argument on the stack.
- unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
// Allocate argument on the stack.
ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// If we overran the argument save area, we must use memory
// (this check catches arguments passed partially in memory)
if (ArgOffset > LinkageSize + ParamAreaSize)
unsigned AvailableFPRs = Num_FPR_Regs;
unsigned AvailableVRs = Num_VR_Regs;
for (unsigned i = 0, e = Ins.size(); i != e; ++i)
- if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].Flags,
+ if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs))
HasParameterArea = true;
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
+ EVT OrigVT = Ins[ArgNo].ArgVT;
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
/* Respect alignment of argument on the stack. */
unsigned Align =
- CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
+ CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
unsigned CurArgOffset = ArgOffset;
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
case MVT::f32:
case MVT::f64:
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // float aggregates.
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++FPR_idx;
+ } else if (GPR_idx != Num_GPR_Regs) {
+ // This can only ever happen in the presence of f32 array types,
+ // since otherwise we never run out of FPRs before running out
+ // of GPRs.
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::f32) {
+ if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+ ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+ ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+ }
+
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
} else {
needsLoad = true;
- ArgSize = PtrByteSize;
}
- ArgOffset += 8;
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+ ArgOffset += ArgSize;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
break;
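
The SRL/TRUNCATE/BITCAST sequence above recovers one f32 from a GPR that carries two packed array elements. Here is a host-side sketch of the same bit manipulation (plain C++, illustrative names; on little-endian the element at doubleword offset 4 sits in the high 32 bits):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Shift the wanted element into the low 32 bits (the SRL), drop the rest
    // (the TRUNCATE), and reinterpret the bits as a float (the BITCAST).
    float extractF32(uint64_t GPRImage, bool WantHighHalf) {
      uint32_t Bits = WantHighHalf ? uint32_t(GPRImage >> 32)
                                   : uint32_t(GPRImage);
      float F;
      std::memcpy(&F, &Bits, sizeof F);
      return F;
    }

    int main() {
      uint32_t Lo, Hi;
      float A = 1.5f, B = -2.0f;
      std::memcpy(&Lo, &A, 4);
      std::memcpy(&Hi, &B, 4);
      uint64_t GPR = (uint64_t(Hi) << 32) | Lo; // LE layout: element 0 low
      assert(extractF32(GPR, false) == 1.5f);   // element at offset 0
      assert(extractF32(GPR, true) == -2.0f);   // element at offset 4
    }
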
case MVT::v4f32:
case MVT::v4i32:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // vector aggregates.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
/* Respect alignment of argument on the stack. */
- unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
NumBytes = ((NumBytes + Align - 1) / Align) * Align;
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
}
unsigned NumBytesActuallyUsed = NumBytes;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
/* Respect alignment of argument on the stack. */
unsigned Align =
- CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
/* Compute GPR index associated with argument offset. */
case MVT::i1:
case MVT::i32:
case MVT::i64:
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
} else {
ArgOffset += PtrByteSize;
break;
case MVT::f32:
- case MVT::f64:
- if (FPR_idx != NumFPRs) {
+ case MVT::f64: {
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // float aggregates.
+
+ // Named arguments go into FPRs first, and once they overflow, the
+ // remaining arguments go into GPRs and then the parameter save area.
+ // Unnamed arguments for vararg functions always go to GPRs and
+ // then the parameter save area. For now, put all arguments to vararg
+ // routines always in both locations (FPR *and* GPR or stack slot).
+ bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+ // First load the argument into the next available FPR.
+ if (FPR_idx != NumFPRs)
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
- if (isVarArg) {
- // A single float or an aggregate containing only a single float
- // must be passed right-justified in the stack doubleword, and
- // in the GPR, if one is available.
- SDValue StoreOff;
- if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
- !isLittleEndian) {
- SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
- StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
- } else
- StoreOff = PtrOff;
-
- SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
- MachinePointerInfo(), false, false, 0);
- MemOpChains.push_back(Store);
-
- // Float varargs are always shadowed in available integer registers
- if (GPR_idx != NumGPRs) {
- SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
- MachinePointerInfo(), false, false,
- false, 0);
- MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
- }
- }
+ // Next, load the argument into GPR or stack slot if needed.
+ if (!NeedGPROrStack)
+ ;
+ else if (GPR_idx != NumGPRs) {
+ // In the non-vararg case, this can only ever happen in the
+ // presence of f32 array types, since otherwise we never run
+ // out of FPRs before running out of GPRs.
+ SDValue ArgVal;
+
+ // Double values are always passed in a single GPR.
+ if (Arg.getValueType() != MVT::f32) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+ // Non-array float values are extended and passed in a GPR.
+ } else if (!Flags.isInConsecutiveRegs()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+ // If we have an array of floats, we collect every odd element
+ // together with its predecessor into one GPR.
+ } else if (ArgOffset % PtrByteSize != 0) {
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ if (!isLittleEndian)
+ std::swap(Lo, Hi);
+ ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+ // The final element, if even, goes into the first half of a GPR.
+ } else if (Flags.isInConsecutiveRegsLast()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+ if (!isLittleEndian)
+ ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+
+ // Non-final even elements are skipped; they will be handled
+ // together with the subsequent argument on the next go-around.
+ } else
+ ArgVal = SDValue();
+
+ if (ArgVal.getNode())
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
} else {
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
- if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
+ if (Arg.getValueType() == MVT::f32 &&
+ !isLittleEndian && !Flags.isInConsecutiveRegs()) {
SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
}
- ArgOffset += 8;
+ // When passing an array of floats, the array occupies consecutive
+ // space in the argument area; only round up to the next doubleword
+ // at the end of the array. Otherwise, each float takes 8 bytes.
+ ArgOffset += (Arg.getValueType() == MVT::f32 &&
+ Flags.isInConsecutiveRegs()) ? 4 : 8;
+ if (Flags.isInConsecutiveRegsLast())
+ ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
break;
+ }
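
On the caller side, the BUILD_PAIR above combines an even/odd pair of f32 array elements into the single i64 the callee unpacks later. A sketch of the packing with the endian swap (hypothetical helper):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <utility>

    // Pack two consecutive f32 array elements into one 64-bit GPR image; the
    // swap mirrors the !isLittleEndian case, where the first element occupies
    // the high half of the register.
    uint64_t packF32Pair(float Even, float Odd, bool IsLittleEndian) {
      uint32_t Lo, Hi;
      std::memcpy(&Lo, &Even, 4); // element at doubleword offset 0
      std::memcpy(&Hi, &Odd, 4);  // element at doubleword offset 4
      if (!IsLittleEndian)
        std::swap(Lo, Hi);
      return (uint64_t(Hi) << 32) | Lo;
    }

    int main() {
      uint64_t LE = packF32Pair(1.0f, 2.0f, true);
      uint64_t BE = packF32Pair(1.0f, 2.0f, false);
      // Same first element, opposite halves of the register.
      assert(uint32_t(LE) == uint32_t(BE >> 32));
    }
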
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
+ // These can be scalar arguments or elements of a vector array type
+ // passed directly. The latter are used to implement ELFv2 homogeneous
+ // vector aggregates.
+
// For a varargs call, named arguments go into VRs or on the stack as
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const override;
+ /// \brief Returns true if an argument of type Ty needs to be passed in a
+ /// contiguous block of registers in calling convention CallConv.
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+ // We support any array type as "consecutive" block in the parameter
+ // save area. The element type defines the alignment requirement and
+ // whether the argument should go in GPRs, FPRs, or VRs if available.
+ //
+ // Note that clang uses this capability both to implement the ELFv2
+ // homogeneous float/vector aggregate ABI, and to avoid having to use
+ // "byval" when passing aggregates that might fully fit in registers.
+ return Ty->isArrayTy();
+ }
+
private:
SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
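
As a hypothetical source-level illustration of what this hook enables: once array types are reported as consecutive, clang can pass a small homogeneous aggregate like the following as an IR array instead of "byval", so its elements land in consecutive FPRs when enough are free:

    // Illustrative only: under ELFv2 this homogeneous aggregate may be
    // lowered to a [3 x float] argument, placing x, y, z in three
    // consecutive FPRs rather than forcing the struct through memory.
    struct Point3 {
      float x, y, z;
    };

    float firstCoord(Point3 p) { // p.x, p.y, p.z arrive in F1, F2, F3
      return p.x;
    }

    int main() {
      return firstCoord({1.0f, 2.0f, 3.0f}) == 1.0f ? 0 : 1;
    }
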
--- /dev/null
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+;
+; Verify use of registers for float/vector aggregate return.
+;
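+; Per the return-value tables above, each identity function should need no
+; code at all: [8 x float] arrives and returns in F1..F8, [8 x double]
+; likewise, [4 x ppc_fp128] uses F1..F8 as f64 pairs, and [8 x <4 x i32>]
+; uses V2..V9; hence the bare blr in every check below.
+;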
+
+define [8 x float] @return_float([8 x float] %x) {
+entry:
+ ret [8 x float] %x
+}
+; CHECK-LABEL: @return_float
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x double] @return_double([8 x double] %x) {
+entry:
+ ret [8 x double] %x
+}
+; CHECK-LABEL: @return_double
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [4 x ppc_fp128] @return_ppcf128([4 x ppc_fp128] %x) {
+entry:
+ ret [4 x ppc_fp128] %x
+}
+; CHECK-LABEL: @return_ppcf128
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x <4 x i32>] @return_v4i32([8 x <4 x i32>] %x) {
+entry:
+ ret [8 x <4 x i32>] %x
+}
+; CHECK-LABEL: @return_v4i32
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+
+;
+; Verify amount of space taken up by aggregates in the parameter save area.
+;
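+; The offsets checked below follow from the ELFv2 layout rules: the
+; parameter save area starts 32 bytes above the stack pointer, arrays are
+; packed at their element alignment, and each array is rounded up to a
+; doubleword only at its end. E.g. for callee_float:
+; 32 (linkage) + 32 ([7 x float] %a, 28 rounded up) + 32 (%b) = 96.
+;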
+
+define i64 @callee_float([7 x float] %a, [7 x float] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_float
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_float(i64 %x, [7 x float] %y) {
+entry:
+ tail call void @test_float([7 x float] %y, [7 x float] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_float
+; CHECK: std 3, 96(1)
+; CHECK: bl test_float
+
+declare void @test_float([7 x float], [7 x float], i64)
+
+define i64 @callee_double(i64 %a, [7 x double] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_double
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_double(i64 %x, [7 x double] %y) {
+entry:
+ tail call void @test_double(i64 %x, [7 x double] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_double
+; CHECK: std 3, 96(1)
+; CHECK: bl test_double
+
+declare void @test_double(i64, [7 x double], i64)
+
+define i64 @callee_ppcf128(i64 %a, [4 x ppc_fp128] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_ppcf128
+; CHECK: ld 3, 104(1)
+; CHECK: blr
+
+define void @caller_ppcf128(i64 %x, [4 x ppc_fp128] %y) {
+entry:
+ tail call void @test_ppcf128(i64 %x, [4 x ppc_fp128] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_ppcf128
+; CHECK: std 3, 104(1)
+; CHECK: bl test_ppcf128
+
+declare void @test_ppcf128(i64, [4 x ppc_fp128], i64)
+
+define i64 @callee_i64(i64 %a, [7 x i64] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_i64
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_i64(i64 %x, [7 x i64] %y) {
+entry:
+ tail call void @test_i64(i64 %x, [7 x i64] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_i64
+; CHECK: std 3, 96(1)
+; CHECK: bl test_i64
+
+declare void @test_i64(i64, [7 x i64], i64)
+
+define i64 @callee_i128(i64 %a, [4 x i128] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_i128
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_i128(i64 %x, [4 x i128] %y) {
+entry:
+ tail call void @test_i128(i64 %x, [4 x i128] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_i128
+; CHECK: std 3, 112(1)
+; CHECK: bl test_i128
+
+declare void @test_i128(i64, [4 x i128], i64)
+
+define i64 @callee_v4i32(i64 %a, [4 x <4 x i32>] %b, i64 %c) {
+entry:
+ ret i64 %c
+}
+; CHECK-LABEL: @callee_v4i32
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_v4i32(i64 %x, [4 x <4 x i32>] %y) {
+entry:
+ tail call void @test_v4i32(i64 %x, [4 x <4 x i32>] %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: @caller_v4i32
+; CHECK: std 3, 112(1)
+; CHECK: bl test_v4i32
+
+declare void @test_v4i32(i64, [4 x <4 x i32>], i64)
+
+
+;
+; Verify handling of floating point arguments in GPRs
+;
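+; Thirteen FPRs (F1..F13) are available for argument passing. With two
+; float arrays totaling more than 13 elements, the FPRs run out partway
+; through the second array, and the remaining f32 elements are passed
+; packed in pairs in GPRs; the tests below check both directions.
+;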
+
+%struct.float8 = type { [8 x float] }
+%struct.float5 = type { [5 x float] }
+%struct.float2 = type { [2 x float] }
+
+@g8 = common global %struct.float8 zeroinitializer, align 4
+@g5 = common global %struct.float5 zeroinitializer, align 4
+@g2 = common global %struct.float2 zeroinitializer, align 4
+
+define float @callee0([7 x float] %a, [7 x float] %b) {
+entry:
+ %b.extract = extractvalue [7 x float] %b, 6
+ ret float %b.extract
+}
+; CHECK-LABEL: @callee0
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller0([7 x float] %a) {
+entry:
+ tail call void @test0([7 x float] %a, [7 x float] %a)
+ ret void
+}
+; CHECK-LABEL: @caller0
+; CHECK-DAG: fmr 8, 1
+; CHECK-DAG: fmr 9, 2
+; CHECK-DAG: fmr 10, 3
+; CHECK-DAG: fmr 11, 4
+; CHECK-DAG: fmr 12, 5
+; CHECK-DAG: fmr 13, 6
+; CHECK-DAG: stfs 7, [[OFF:[0-9]+]](1)
+; CHECK-DAG: lwz 10, [[OFF]](1)
+; CHECK: bl test0
+
+declare void @test0([7 x float], [7 x float])
+
+define float @callee1([8 x float] %a, [8 x float] %b) {
+entry:
+ %b.extract = extractvalue [8 x float] %b, 7
+ ret float %b.extract
+}
+; CHECK-LABEL: @callee1
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller1([8 x float] %a) {
+entry:
+ tail call void @test1([8 x float] %a, [8 x float] %a)
+ ret void
+}
+; CHECK-LABEL: @caller1
+; CHECK-DAG: fmr 9, 1
+; CHECK-DAG: fmr 10, 2
+; CHECK-DAG: fmr 11, 3
+; CHECK-DAG: fmr 12, 4
+; CHECK-DAG: fmr 13, 5
+; CHECK-DAG: stfs 5, [[OFF0:[0-9]+]](1)
+; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
+; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
+; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
+; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
+; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
+; CHECK-DAG: or 9, [[REG0]], [[REG1]]
+; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK: bl test1
+
+declare void @test1([8 x float], [8 x float])
+
+define float @callee2([8 x float] %a, [5 x float] %b, [2 x float] %c) {
+entry:
+ %c.extract = extractvalue [2 x float] %c, 1
+ ret float %c.extract
+}
+; CHECK-LABEL: @callee2
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+ %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+ %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+ %2 = load [2 x float]* getelementptr inbounds (%struct.float2* @g2, i64 0, i32 0), align 4
+ tail call void @test2([8 x float] %0, [5 x float] %1, [2 x float] %2)
+ ret void
+}
+; CHECK-LABEL: @caller2
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 1, 0([[REG]])
+; CHECK-DAG: lfs 2, 4([[REG]])
+; CHECK-DAG: lfs 3, 8([[REG]])
+; CHECK-DAG: lfs 4, 12([[REG]])
+; CHECK-DAG: lfs 5, 16([[REG]])
+; CHECK-DAG: lfs 6, 20([[REG]])
+; CHECK-DAG: lfs 7, 24([[REG]])
+; CHECK-DAG: lfs 8, 28([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 9, 0([[REG]])
+; CHECK-DAG: lfs 10, 4([[REG]])
+; CHECK-DAG: lfs 11, 8([[REG]])
+; CHECK-DAG: lfs 12, 12([[REG]])
+; CHECK-DAG: lfs 13, 16([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]])
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]])
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: or 10, [[REG0]], [[REG1]]
+; CHECK: bl test2
+
+declare void @test2([8 x float], [5 x float], [2 x float])
+
+define double @callee3([8 x float] %a, [5 x float] %b, double %c) {
+entry:
+ ret double %c
+}
+; CHECK-LABEL: @callee3
+; CHECK: std 10, [[OFF:.*]](1)
+; CHECK: lfd 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller3(double %d) {
+entry:
+ %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+ %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+ tail call void @test3([8 x float] %0, [5 x float] %1, double %d)
+ ret void
+}
+; CHECK-LABEL: @caller3
+; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: ld 10, [[OFF]](1)
+; CHECK: bl test3
+
+declare void @test3([8 x float], [5 x float], double)
+
+define float @callee4([8 x float] %a, [5 x float] %b, float %c) {
+entry:
+ ret float %c
+}
+; CHECK-LABEL: @callee4
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller4(float %f) {
+entry:
+ %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+ %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+ tail call void @test4([8 x float] %0, [5 x float] %1, float %f)
+ ret void
+}
+; CHECK-LABEL: @caller4
+; CHECK: stfs 1, [[OFF:.*]](1)
+; CHECK: lwz 10, [[OFF]](1)
+; CHECK: bl test4
+
+declare void @test4([8 x float], [5 x float], float)
+
ret void
}
-; CHECK: stfs {{[0-9]+}}, 60(1)
-; CHECK: ld 4, 56(1)
+; CHECK: stfs {{[0-9]+}}, 116(1)
+; CHECK: lwz 4, 116(1)
; CHECK: bl
declare void @testvaSf1(i32, ...)