void dump() const;
+ /// In most cases this function returns the ABI alignment (or the preferred
+ /// alignment when \p UseABI is false) for a given type, except for illegal
+ /// vector types where that alignment exceeds the stack alignment. In such
+ /// cases we attempt to break the vector down to a legal type and return the
+ /// alignment for that instead.
+ Align getReducedAlign(EVT VT, bool UseABI);
+
/// Create a stack temporary based on the size in bytes and the alignment
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment);
SDLoc dl(Op);
// Create the stack frame object. Make sure it is aligned for both
// the source and destination types.
- SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false);
+ Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false);
+ Align Align = std::max(DestAlign, OpAlign);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align);
// Emit a store to the stack slot.
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+ MachinePointerInfo(), Align);
// Result is a load from the stack slot.
- return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
/// Replace the node's results with custom code provided by the target and
// Create the stack frame object. Make sure it is aligned for both
// the source and expanded destination types.
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(
- NOutVT.getTypeForEVT(*DAG.getContext()));
- SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment.value());
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false);
+ Align Align = std::max(InAlign, NOutAlign);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
// Load the first half from the stack slot.
- Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign);
// Increment the pointer to the other half.
unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
// Load the second half from the stack slot.
Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), NOutAlign);
// Handle endianness of the load.
if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
MachinePointerInfo::getUnknownStack(MF));
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
+ SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
}
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
- Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
- MachinePointerInfo::getUnknownStack(MF), EltVT);
+ Store = DAG.getTruncStore(
+ Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, Alignment);
+ Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8;
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), Alignment);
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
}
// Store the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
return DAG.getExtLoad(
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
}
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
MachinePointerInfo(VD));
}
+Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
+ const DataLayout &DL = getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+
+ if (TLI->isTypeLegal(VT) || !VT.isVector())
+ return RedAlign;
+
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const Align StackAlign = TFI->getStackAlign();
+
+ // See if we can choose a smaller ABI alignment in cases where it's an
+ // illegal vector type that will get broken down.
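+ // For example (illustrative, assuming a 16-byte stack alignment as on
+ // AArch64): a v32i8 has a preferred alignment of 32, but it is legalized
+ // as two v16i8 halves, so a 16-byte aligned stack slot is sufficient.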
+ if (RedAlign > StackAlign) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ TLI->getVectorTypeBreakdown(*getContext(), VT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ Ty = IntermediateVT.getTypeForEVT(*getContext());
+ Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+ if (RedAlign2 < RedAlign)
+ RedAlign = RedAlign2;
+ }
+
+ return RedAlign;
+}
+
SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
MachineFrameInfo &MFI = MF->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0]
; CHECK: str q[[R]], [x{{[0-9]+}}]
}
+
+; In this test the illegal type has a preferred alignment greater than the
+; stack alignment, which gets reduced to the alignment of the legal type it
+; is broken down into.
+define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_32xi8
+; CHECK: stp q0, q1, [sp, #-32]!
+; CHECK: ldp q0, q1, [sp], #32
+ %b = insertelement <32 x i8> %a, i8 30, i32 %x
+ ret <32 x i8> %b
+}
+
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GCN-NOT: s_mov_b32 s0
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1536
+; GCN: ScratchSize: 768
define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $256, %rsp ## imm = 0x100
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: movl 744(%rbp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: vmovd %edi, %xmm0
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-128, %rsp
-; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $192, %rsp
; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-128, %rsp
-; KNL-NEXT: subq $256, %rsp ## imm = 0x100
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-128, %rsp
-; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $192, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v32i8_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $31, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movb (%rsp,%rdi), %al
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movb -40(%rsp,%rdi), %al
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v32i8_var:
define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $15, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i16_var:
define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i32_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $7, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movl (%rsp,%rdi,4), %eax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -40(%rsp,%rdi,4), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i32_var:
define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i64_var:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movq (%rsp,%rdi,8), %rax
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -40(%rsp,%rdi,8), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i64_var:
; CHECK-LIBCALL-NEXT: subq $88, %rsp
; CHECK-LIBCALL-NEXT: movl (%rdi), %eax
; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx
-; CHECK-LIBCALL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT: movl %eax, (%rsp)
; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
-; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0
; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0]
define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
; X64-LABEL: PR23476:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: andq $-64, %rsp
-; X64-NEXT: subq $128, %rsp
; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: movq %rdi, %xmm1
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: movq %rdx, %xmm2
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-NEXT: movl 16(%rbp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: andl $7, %eax
; X64-NEXT: movq %r8, %xmm0
-; X64-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp)
-; X64-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp)
-; X64-NEXT: movdqa %xmm1, (%rsp)
-; X64-NEXT: movq (%rsp,%rax,8), %rax
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -72(%rsp,%rax,8), %rax
; X64-NEXT: movq %rax, (%r9)
-; X64-NEXT: movq %rbp, %rsp
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X32-LABEL: PR23476:
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $128, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $80, %esp
; X32-NEXT: movl 52(%ebp), %eax
; X32-NEXT: andl $7, %eax
; X32-NEXT: movl 48(%ebp), %ecx
; X32AVX: # %bb.0:
; X32AVX-NEXT: pushl %ebp
; X32AVX-NEXT: movl %esp, %ebp
-; X32AVX-NEXT: andl $-64, %esp
-; X32AVX-NEXT: subl $128, %esp
+; X32AVX-NEXT: andl $-32, %esp
+; X32AVX-NEXT: subl $96, %esp
; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32AVX-NEXT: movl 52(%ebp), %eax
; X32AVX-NEXT: andl $7, %eax
define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v32i8:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $31, %esi
-; SSE-NEXT: movb %dil, (%rsp,%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movb %dil, -40(%rsp,%rsi)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i8_v32i8:
define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v16i16:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $15, %esi
-; SSE-NEXT: movw %di, (%rsp,%rsi,2)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movw %di, -40(%rsp,%rsi,2)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i16_v16i16:
define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movl %edi, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movl %edi, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i32_v8i32:
define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movq %rdi, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movq %rdi, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i64_v4i64:
define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $7, %edi
-; SSE-NEXT: movss %xmm0, (%rsp,%rdi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movss %xmm0, -40(%rsp,%rdi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f32_v8f32:
define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $3, %edi
-; SSE-NEXT: movsd %xmm0, (%rsp,%rdi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movsd %xmm0, -40(%rsp,%rdi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f64_v4f64:
define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v32i8:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movb (%rdi), %al
; SSE-NEXT: andl $31, %esi
-; SSE-NEXT: movb %al, (%rsp,%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movb %al, -40(%rsp,%rsi)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_v32i8:
define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v16i16:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: andl $15, %esi
-; SSE-NEXT: movw %ax, (%rsp,%rsi,2)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movw %ax, -40(%rsp,%rsi,2)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i16_v16i16:
define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v8i32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movl %eax, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movl %eax, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_v8i32:
define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v4i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movq (%rdi), %rax
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movq %rax, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movq %rax, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_v4i64:
define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movss %xmm0, (%rsp,%rsi,4)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movss %xmm0, -40(%rsp,%rsi,4)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_f32_v8f32:
define <4 x double> @load_f64_v4f64(double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: movq %rsp, %rbp
-; SSE-NEXT: andq $-32, %rsp
-; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: andl $3, %esi
-; SSE-NEXT: movsd %xmm0, (%rsp,%rsi,8)
-; SSE-NEXT: movaps (%rsp), %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movq %rbp, %rsp
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movsd %xmm0, -40(%rsp,%rsi,8)
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_f64_v4f64:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
+; X86-NEXT: subl $64, %esp
; X86-NEXT: movzwl 8(%ebp), %esi
; X86-NEXT: movzwl 12(%ebp), %edi
; X86-NEXT: movzwl 20(%ebp), %ebx
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
-; SSE3-NEXT: movq %rsp, %rbp
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: pushq %rbx
-; SSE3-NEXT: andq $-32, %rsp
-; SSE3-NEXT: subq $608, %rsp # imm = 0x260
-; SSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8
+; SSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
-; SSE3-NEXT: andl $31, %r9d
-; SSE3-NEXT: movzbl 64(%rsp,%r9), %ebx
-; SSE3-NEXT: movd %ebx, %xmm8
-; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 96(%rsp,%rax), %eax
-; SSE3-NEXT: movd %eax, %xmm15
+; SSE3-NEXT: movaps %xmm0, (%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: andl $31, %r8d
+; SSE3-NEXT: movzbl -96(%rsp,%r8), %esi
+; SSE3-NEXT: movd %esi, %xmm8
+; SSE3-NEXT: andl $31, %ebp
+; SSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
+; SSE3-NEXT: movd %esi, %xmm15
; SSE3-NEXT: andl $31, %edx
-; SSE3-NEXT: movzbl 128(%rsp,%rdx), %eax
-; SSE3-NEXT: movd %eax, %xmm9
+; SSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
+; SSE3-NEXT: movd %edx, %xmm9
; SSE3-NEXT: andl $31, %ecx
-; SSE3-NEXT: movzbl 160(%rsp,%rcx), %eax
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: andl $31, %esi
-; SSE3-NEXT: movzbl 192(%rsp,%rsi), %eax
+; SSE3-NEXT: movzbl (%rsp,%rcx), %ecx
+; SSE3-NEXT: movd %ecx, %xmm3
+; SSE3-NEXT: andl $31, %eax
+; SSE3-NEXT: movzbl 32(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
; SSE3-NEXT: andl $31, %edi
-; SSE3-NEXT: movzbl 224(%rsp,%rdi), %eax
+; SSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: andl $31, %r8d
-; SSE3-NEXT: movzbl 256(%rsp,%r8), %eax
+; SSE3-NEXT: andl $31, %ebx
+; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: andl $31, %r10d
-; SSE3-NEXT: movzbl 288(%rsp,%r10), %eax
+; SSE3-NEXT: andl $31, %r9d
+; SSE3-NEXT: movzbl 128(%rsp,%r9), %eax
; SSE3-NEXT: movd %eax, %xmm6
; SSE3-NEXT: andl $31, %r13d
-; SSE3-NEXT: movzbl 320(%rsp,%r13), %eax
+; SSE3-NEXT: movzbl 160(%rsp,%r13), %eax
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: andl $31, %r12d
-; SSE3-NEXT: movzbl 352(%rsp,%r12), %eax
+; SSE3-NEXT: movzbl 192(%rsp,%r12), %eax
; SSE3-NEXT: movd %eax, %xmm5
; SSE3-NEXT: andl $31, %r15d
-; SSE3-NEXT: movzbl 384(%rsp,%r15), %eax
+; SSE3-NEXT: movzbl 224(%rsp,%r15), %eax
; SSE3-NEXT: movd %eax, %xmm13
; SSE3-NEXT: andl $31, %r14d
-; SSE3-NEXT: movzbl 416(%rsp,%r14), %eax
+; SSE3-NEXT: movzbl 256(%rsp,%r14), %eax
; SSE3-NEXT: movd %eax, %xmm4
; SSE3-NEXT: andl $31, %r11d
-; SSE3-NEXT: movzbl 448(%rsp,%r11), %eax
+; SSE3-NEXT: movzbl 288(%rsp,%r11), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSE3-NEXT: andl $31, %r10d
+; SSE3-NEXT: movzbl 320(%rsp,%r10), %eax
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 512(%rsp,%rax), %eax
+; SSE3-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT: andl $31, %eax
-; SSE3-NEXT: movzbl 544(%rsp,%rax), %eax
+; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE3-NEXT: leaq -40(%rbp), %rsp
+; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: popq %r12
; SSE3-NEXT: popq %r13
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: movq %rsp, %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: andq $-32, %rsp
-; SSSE3-NEXT: subq $608, %rsp # imm = 0x260
-; SSSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8
+; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
-; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: andl $31, %r9d
-; SSSE3-NEXT: movzbl 64(%rsp,%r9), %ebx
-; SSSE3-NEXT: movd %ebx, %xmm8
-; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 96(%rsp,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movaps %xmm0, (%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT: andl $31, %r8d
+; SSSE3-NEXT: movzbl -96(%rsp,%r8), %esi
+; SSSE3-NEXT: movd %esi, %xmm8
+; SSSE3-NEXT: andl $31, %ebp
+; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %esi
+; SSSE3-NEXT: movd %esi, %xmm15
; SSSE3-NEXT: andl $31, %edx
-; SSSE3-NEXT: movzbl 128(%rsp,%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movzbl -32(%rsp,%rdx), %edx
+; SSSE3-NEXT: movd %edx, %xmm9
; SSSE3-NEXT: andl $31, %ecx
-; SSSE3-NEXT: movzbl 160(%rsp,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: andl $31, %esi
-; SSSE3-NEXT: movzbl 192(%rsp,%rsi), %eax
+; SSSE3-NEXT: movzbl (%rsp,%rcx), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: andl $31, %eax
+; SSSE3-NEXT: movzbl 32(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm10
; SSSE3-NEXT: andl $31, %edi
-; SSSE3-NEXT: movzbl 224(%rsp,%rdi), %eax
+; SSSE3-NEXT: movzbl 64(%rsp,%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: andl $31, %r8d
-; SSSE3-NEXT: movzbl 256(%rsp,%r8), %eax
+; SSSE3-NEXT: andl $31, %ebx
+; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax
; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: andl $31, %r10d
-; SSSE3-NEXT: movzbl 288(%rsp,%r10), %eax
+; SSSE3-NEXT: andl $31, %r9d
+; SSSE3-NEXT: movzbl 128(%rsp,%r9), %eax
; SSSE3-NEXT: movd %eax, %xmm6
; SSSE3-NEXT: andl $31, %r13d
-; SSSE3-NEXT: movzbl 320(%rsp,%r13), %eax
+; SSSE3-NEXT: movzbl 160(%rsp,%r13), %eax
; SSSE3-NEXT: movd %eax, %xmm12
; SSSE3-NEXT: andl $31, %r12d
-; SSSE3-NEXT: movzbl 352(%rsp,%r12), %eax
+; SSSE3-NEXT: movzbl 192(%rsp,%r12), %eax
; SSSE3-NEXT: movd %eax, %xmm5
; SSSE3-NEXT: andl $31, %r15d
-; SSSE3-NEXT: movzbl 384(%rsp,%r15), %eax
+; SSSE3-NEXT: movzbl 224(%rsp,%r15), %eax
; SSSE3-NEXT: movd %eax, %xmm13
; SSSE3-NEXT: andl $31, %r14d
-; SSSE3-NEXT: movzbl 416(%rsp,%r14), %eax
+; SSSE3-NEXT: movzbl 256(%rsp,%r14), %eax
; SSSE3-NEXT: movd %eax, %xmm4
; SSSE3-NEXT: andl $31, %r11d
-; SSSE3-NEXT: movzbl 448(%rsp,%r11), %eax
+; SSSE3-NEXT: movzbl 288(%rsp,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSSE3-NEXT: andl $31, %r10d
+; SSSE3-NEXT: movzbl 320(%rsp,%r10), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 512(%rsp,%rax), %eax
+; SSSE3-NEXT: movzbl 352(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT: andl $31, %eax
-; SSSE3-NEXT: movzbl 544(%rsp,%rax), %eax
+; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSSE3-NEXT: leaq -40(%rbp), %rsp
+; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: movq %rsp, %rbp
-; SSE41-NEXT: andq $-32, %rsp
-; SSE41-NEXT: subq $544, %rsp # imm = 0x220
+; SSE41-NEXT: subq $392, %rsp # imm = 0x188
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT: movaps %xmm0, (%rsp)
-; SSE41-NEXT: movzbl 480(%rsp,%rax), %eax
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pextrb $1, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $1, 448(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $2, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $2, 416(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $2, 288(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $3, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $3, 384(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $3, 256(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $4, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $4, 352(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $4, 224(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $5, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $5, 320(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $5, 192(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $6, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $6, 288(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $6, 160(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $7, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $7, 256(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $7, 128(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $8, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $8, 224(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $8, 96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $9, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $9, 192(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $9, 64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $10, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $10, 160(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $10, 32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $11, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $11, 128(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $11, (%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $12, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $12, 96(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $12, -32(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $13, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $13, 64(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $13, -64(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $14, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $14, 32(%rsp,%rax), %xmm0
+; SSE41-NEXT: pinsrb $14, -96(%rsp,%rax), %xmm0
; SSE41-NEXT: pextrb $15, %xmm2, %eax
; SSE41-NEXT: andl $31, %eax
-; SSE41-NEXT: pinsrb $15, (%rsp,%rax), %xmm0
-; SSE41-NEXT: movq %rbp, %rsp
-; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: pinsrb $15, -128(%rsp,%rax), %xmm0
+; SSE41-NEXT: addq $392, %rsp # imm = 0x188
; SSE41-NEXT: retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; X32-SSE1-NEXT: pushl %ebp
; X32-SSE1-NEXT: movl %esp, %ebp
; X32-SSE1-NEXT: andl $-16, %esp
-; X32-SSE1-NEXT: subl $32, %esp
+; X32-SSE1-NEXT: subl $16, %esp
; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; X32-SSE1-NEXT: movl 12(%ebp), %ecx
; X32-SSE1-NEXT: xorl %eax, %ecx
define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-LABEL: f:
; X32: ## %bb.0: ## %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-32, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: subl $44, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $7, %eax
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
; X32-NEXT: movaps (%esp), %xmm0
; X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
+; X32-NEXT: addl $44, %esp
; X32-NEXT: retl
;
; X64-LABEL: f:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: pushq %rbp
-; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: andq $-32, %rsp
-; X64-NEXT: subq $64, %rsp
; X64-NEXT: ## kill: def $edi killed $edi def $rdi
-; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %edi
-; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000
-; X64-NEXT: movaps (%rsp), %xmm0
-; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; X64-NEXT: movq %rbp, %rsp
-; X64-NEXT: popq %rbp
+; X64-NEXT: movl $1084227584, -40(%rsp,%rdi,4) ## imm = 0x40A00000
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
; X64-NEXT: retq
entry:
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: andl $-128, %esp
-; X32-SSE-NEXT: subl $384, %esp # imm = 0x180
+; X32-SSE-NEXT: andl $-16, %esp
+; X32-SSE-NEXT: subl $272, %esp # imm = 0x110
; X32-SSE-NEXT: movl 88(%ebp), %ecx
; X32-SSE-NEXT: movdqa 72(%ebp), %xmm0
; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
;
; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pushq %rbp
-; X64-SSE-NEXT: movq %rsp, %rbp
-; X64-SSE-NEXT: andq $-128, %rsp
-; X64-SSE-NEXT: subq $256, %rsp # imm = 0x100
+; X64-SSE-NEXT: pushq %rax
; X64-SSE-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movaps %xmm0, (%rsp)
-; X64-SSE-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: andl $15, %edi
-; X64-SSE-NEXT: movq (%rsp,%rdi,8), %rax
-; X64-SSE-NEXT: movq %rbp, %rsp
-; X64-SSE-NEXT: popq %rbp
+; X64-SSE-NEXT: movq -128(%rsp,%rdi,8), %rax
+; X64-SSE-NEXT: popq %rcx
; X64-SSE-NEXT: retq
;
; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: andl $-128, %esp
-; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
+; X32-AVX-NEXT: andl $-32, %esp
+; X32-AVX-NEXT: subl $288, %esp # imm = 0x120
; X32-AVX-NEXT: movl 40(%ebp), %ecx
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbp
; X64-AVX-NEXT: movq %rsp, %rbp
-; X64-AVX-NEXT: andq $-128, %rsp
-; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $160, %rsp
; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero