[CodeGen] Ensure callers of CreateStackTemporary use sensible alignments

author David Sherwood <david.sherwood@arm.com>

Tue, 2 Jun 2020 10:16:23 +0000 (11:16 +0100)

committer David Sherwood <david.sherwood@arm.com>

Tue, 9 Jun 2020 07:10:17 +0000 (08:10 +0100)
author David Sherwood <david.sherwood@arm.com>
Tue, 2 Jun 2020 10:16:23 +0000 (11:16 +0100)
committer David Sherwood <david.sherwood@arm.com>
Tue, 9 Jun 2020 07:10:17 +0000 (08:10 +0100)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h

index 4e452c2941fb70949bd0a4bfb271b3e325e8987b..f9706ee9f4e4e36d4a9e5045eec850db78ef0537 100644 (file)
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1608,6 +1608,12 @@ public:
  
    void dump() const;
  
+  /// Returns the ABI alignment of VT if UseABI is true, otherwise its preferred
+  /// alignment. The exception is an illegal vector type whose alignment exceeds
+  /// that of the stack: it is broken down and the alignment of the resulting
+  /// intermediate type is returned instead, if that is smaller.
+  Align getReducedAlign(EVT VT, bool UseABI);
+
    /// Create a stack temporary based on the size in bytes and the alignment
    SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment);
  
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp

index 9885110d64f9b74548e1429897e2e814e960aa99..2e1377c2c1735b5a474da4a385aecdf24ec7ca1b 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -889,12 +889,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
    SDLoc dl(Op);
    // Create the stack frame object.  Make sure it is aligned for both
    // the source and destination types.
-  SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false);
+  Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false);
+  Align Align = std::max(DestAlign, OpAlign);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align);
    // Emit a store to the stack slot.
-  SDValue Store =
-      DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+                               MachinePointerInfo(), Align);
    // Result is a load from the stack slot.
-  return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
+  return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
  }
  
  /// Replace the node's results with custom code provided by the target and
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp

index 666f128a4cc233803d43fe04726e695b6b35a7f4..9cd3b8f76d6ca7767eaf011ce6028bd419428b77 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -156,9 +156,13 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  
    // Create the stack frame object.  Make sure it is aligned for both
    // the source and expanded destination types.
-  Align Alignment = DAG.getDataLayout().getPrefTypeAlign(
-      NOutVT.getTypeForEVT(*DAG.getContext()));
-  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment.value());
+
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+  Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false);
+  Align Align = std::max(InAlign, NOutAlign);
+  SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align);
    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
@@ -167,7 +171,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
  
    // Load the first half from the stack slot.
-  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, Alignment);
+  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign);
  
    // Increment the pointer to the other half.
    unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
@@ -175,7 +179,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  
    // Load the second half from the stack slot.
    Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), Alignment);
+                   PtrInfo.getWithOffset(IncrementSize), NOutAlign);
  
    // Handle endianness of the load.
    if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

index 9ebf4ea9637c5e145ff8f6e4c0f8fd2815a5e2ad..297b8aa3e848fffb1589eba160784a22e34ef6cc 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1158,15 +1158,17 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
    }
  
    // Spill the vector to the stack.
-  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
    auto &MF = DAG.getMachineFunction();
    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
-  Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
-  Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
  
-  SDValue Store =
-      DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
  
    // Store the new subvector into the specified index.
    SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -1174,7 +1176,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
                         MachinePointerInfo::getUnknownStack(MF));
  
    // Load the Lo part from the stack slot.
-  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo, Alignment);
+  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
+                   SmallestAlign);
  
    // Increment the pointer to the other part.
    unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
@@ -1182,7 +1185,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
  
    // Load the Hi part from the stack slot.
    Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), Alignment);
+                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
  }
  
  void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
@@ -1454,27 +1457,30 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
    }
  
    // Spill the vector to the stack.
-  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
    auto &MF = DAG.getMachineFunction();
    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
-  Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
-  Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType);
  
-  SDValue Store =
-      DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
  
    // Store the new element.  This may be larger than the vector element type,
    // so use a truncating store.
    SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
-  Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
-                            MachinePointerInfo::getUnknownStack(MF), EltVT);
+  Store = DAG.getTruncStore(
+      Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
+      commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
  
    EVT LoVT, HiVT;
    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
  
    // Load the Lo part from the stack slot.
-  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, Alignment);
+  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
  
    // Increment the pointer to the other part.
    unsigned IncrementSize = LoVT.getSizeInBits() / 8;
@@ -1482,7 +1488,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
  
    // Load the Hi part from the stack slot.
    Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), Alignment);
+                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
  
    // If we adjusted the original type, we need to truncate the results.
    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
@@ -2223,11 +2229,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
    }
  
    // Store the vector to the stack.
-  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  // In cases where the vector is illegal it will be broken down into parts
+  // and stored in parts - we should use the alignment for the smallest part.
+  Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+  SDValue StackPtr =
+      DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
    auto &MF = DAG.getMachineFunction();
    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
-  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                               SmallestAlign);
  
    // Load back the required element.
    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -2242,7 +2253,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
  
    return DAG.getExtLoad(
        ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
+      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
+      commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
  }
  
  SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

index 44c2b01ae976283816251dd543ca023b2756ef6e..80f03c4222848647d928081c46e14641ce49196c 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1993,6 +1993,34 @@ SDValue SelectionDAG::expandVACopy(SDNode *Node) {
                    MachinePointerInfo(VD));
  }
  
+Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
+  const DataLayout &DL = getDataLayout();
+  Type *Ty = VT.getTypeForEVT(*getContext());
+  Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+
+  if (TLI->isTypeLegal(VT) || !VT.isVector())
+    return RedAlign;
+
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  const Align StackAlign = TFI->getStackAlign();
+
+  // See if we can choose a smaller ABI alignment in cases where it's an
+  // illegal vector type that will get broken down.
+  if (RedAlign > StackAlign) {
+    EVT IntermediateVT;
+    MVT RegisterVT;
+    unsigned NumIntermediates;
+    unsigned NumRegs = TLI->getVectorTypeBreakdown(
+        *getContext(), VT, IntermediateVT, NumIntermediates, RegisterVT);
+    Ty = IntermediateVT.getTypeForEVT(*getContext());
+    Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+    if (RedAlign2 < RedAlign)
+      RedAlign = RedAlign2;
+  }
+
+  return RedAlign;
+}
+
  SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
    MachineFrameInfo &MFI = MF->getFrameInfo();
    int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
diff --git a/llvm/test/CodeGen/AArch64/build-one-lane.ll b/llvm/test/CodeGen/AArch64/build-one-lane.ll

index 55225975c5151e67175f258ddf2a129cfb34ab66..78dfaa9d17697adb3023c30747d51836ba6489ad 100644 (file)
--- a/llvm/test/CodeGen/AArch64/build-one-lane.ll
+++ b/llvm/test/CodeGen/AArch64/build-one-lane.ll
@@ -270,3 +270,15 @@ define void @v2f64st(<2 x double>* %p, double %s) nounwind {
  ; CHECK: mov  v[[R]].d[1], v{{[0-9]+}}.d[0]
  ; CHECK: str  q[[R]], [x{{[0-9]+}}]
  }
+
+; In this test the illegal type has a preferred alignment greater than the
+; stack alignment, which gets reduced to the alignment of the legal type it
+; is broken down into.
+define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_32xi8
+; CHECK:       stp q0, q1, [sp, #-32]!
+; CHECK:       ldp q0, q1, [sp], #32
+  %b = insertelement <32 x i8> %a, i8 30, i32 %x
+  ret <32 x i8> %b
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll

index 0b2eb6a7ae17b1c977cc21a9952f5bb020286180..2c852f066c4fd3f8f3cd7ea2bc0fec04716f1328 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -26,8 +26,8 @@
  ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
  ; GCN-NOT: s_mov_b32 s0
  
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
+; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
  
  ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
  ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll

index d9327368ac82b6970ddd4c874972555546731f0e..f42df585df2a13c98ebdda37ce32e1045b01fab0 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -25,7 +25,7 @@
  ; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
  ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
  ; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1536
+; GCN: ScratchSize: 768
  
  define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
  bb:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll

index d2062f21762c95f50bb63014b3f9f9618cee654c..f6ffd6419c13a59edbbbb2cc7f6ba307b0da1856 100644 (file)
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1768,8 +1768,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
  ; KNL-NEXT:    .cfi_offset %rbp, -16
  ; KNL-NEXT:    movq %rsp, %rbp
  ; KNL-NEXT:    .cfi_def_cfa_register %rbp
-; KNL-NEXT:    andq $-128, %rsp
-; KNL-NEXT:    subq $256, %rsp ## imm = 0x100
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $192, %rsp
  ; KNL-NEXT:    movl 744(%rbp), %eax
  ; KNL-NEXT:    andl $127, %eax
  ; KNL-NEXT:    vmovd %edi, %xmm0
@@ -1939,8 +1939,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
  ; SKX-NEXT:    .cfi_offset %rbp, -16
  ; SKX-NEXT:    movq %rsp, %rbp
  ; SKX-NEXT:    .cfi_def_cfa_register %rbp
-; SKX-NEXT:    andq $-128, %rsp
-; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $192, %rsp
  ; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
  ; SKX-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
  ; SKX-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
@@ -2076,8 +2076,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
  ; KNL-NEXT:    .cfi_offset %rbp, -16
  ; KNL-NEXT:    movq %rsp, %rbp
  ; KNL-NEXT:    .cfi_def_cfa_register %rbp
-; KNL-NEXT:    andq $-128, %rsp
-; KNL-NEXT:    subq $256, %rsp ## imm = 0x100
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $192, %rsp
  ; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
  ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
  ; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm3
@@ -2153,8 +2153,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index
  ; SKX-NEXT:    .cfi_offset %rbp, -16
  ; SKX-NEXT:    movq %rsp, %rbp
  ; SKX-NEXT:    .cfi_def_cfa_register %rbp
-; SKX-NEXT:    andq $-128, %rsp
-; SKX-NEXT:    subq $256, %rsp ## imm = 0x100
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $192, %rsp
  ; SKX-NEXT:    ## kill: def $esi killed $esi def $rsi
  ; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
  ; SKX-NEXT:    vptestmb %zmm1, %zmm1, %k1
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll

index a95c4daf3b095bd3ac0965b258376c4d790ba18c..cf06f8dcb13e1503d801a0c370543c92f50841d2 100644 (file)
--- a/llvm/test/CodeGen/X86/extractelement-index.ll
+++ b/llvm/test/CodeGen/X86/extractelement-index.ll
@@ -443,16 +443,10 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
  define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
  ; SSE-LABEL: extractelement_v32i8_var:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    andl $31, %edi
-; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, (%rsp)
-; SSE-NEXT:    movb (%rsp,%rdi), %al
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movb -40(%rsp,%rdi), %al
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: extractelement_v32i8_var:
@@ -493,16 +487,10 @@ define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
  define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
  ; SSE-LABEL: extractelement_v16i16_var:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    andl $15, %edi
-; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, (%rsp)
-; SSE-NEXT:    movzwl (%rsp,%rdi,2), %eax
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: extractelement_v16i16_var:
@@ -543,16 +531,10 @@ define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
  define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
  ; SSE-LABEL: extractelement_v8i32_var:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    andl $7, %edi
-; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, (%rsp)
-; SSE-NEXT:    movl (%rsp,%rdi,4), %eax
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl -40(%rsp,%rdi,4), %eax
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: extractelement_v8i32_var:
@@ -593,16 +575,10 @@ define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
  define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
  ; SSE-LABEL: extractelement_v4i64_var:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    andl $3, %edi
-; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, (%rsp)
-; SSE-NEXT:    movq (%rsp,%rdi,8), %rax
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -40(%rsp,%rdi,8), %rax
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: extractelement_v4i64_var:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll

index 82d1fbe2e0ddf4c944dd7d4d126810bce335b13f..1fabce24cc25b49493dee5acc6b910cf2534eb81 100644 (file)
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -382,16 +382,16 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
  ; CHECK-LIBCALL-NEXT:    subq $88, %rsp
  ; CHECK-LIBCALL-NEXT:    movl (%rdi), %eax
  ; CHECK-LIBCALL-NEXT:    movl 4(%rdi), %ecx
-; CHECK-LIBCALL-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-LIBCALL-NEXT:    movl %eax, (%rsp)
  ; CHECK-LIBCALL-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
-; CHECK-LIBCALL-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT:    movaps (%rsp), %xmm0
  ; CHECK-LIBCALL-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
  ; CHECK-LIBCALL-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
  ; CHECK-LIBCALL-NEXT:    pextrw $1, %xmm0, %edi
  ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
  ; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-LIBCALL-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
  ; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %edi
  ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
  ; CHECK-LIBCALL-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -400,11 +400,11 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
  ; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
  ; CHECK-LIBCALL-NEXT:    pextrw $1, %xmm0, %edi
  ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
  ; CHECK-LIBCALL-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
  ; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %edi
  ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
  ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
  ; CHECK-LIBCALL-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
  ; CHECK-LIBCALL-NEXT:    # xmm0 = xmm0[0],mem[0]
diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll

index 4d3e3623ad3703bae5a6ef7329cb25a6e2bf0400..0190e91216ce468b90c2af1d0d43a91ffe6e3872 100644 (file)
--- a/llvm/test/CodeGen/X86/i64-mem-copy.ll
+++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll
@@ -109,34 +109,28 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
  define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
  ; X64-LABEL: PR23476:
  ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    movq %rsp, %rbp
-; X64-NEXT:    andq $-64, %rsp
-; X64-NEXT:    subq $128, %rsp
  ; X64-NEXT:    movq %rsi, %xmm0
  ; X64-NEXT:    movq %rdi, %xmm1
  ; X64-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
  ; X64-NEXT:    movq %rcx, %xmm0
  ; X64-NEXT:    movq %rdx, %xmm2
  ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-NEXT:    movl 16(%rbp), %eax
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %eax
  ; X64-NEXT:    andl $7, %eax
  ; X64-NEXT:    movq %r8, %xmm0
-; X64-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp)
-; X64-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp)
-; X64-NEXT:    movdqa %xmm1, (%rsp)
-; X64-NEXT:    movq (%rsp,%rax,8), %rax
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -72(%rsp,%rax,8), %rax
  ; X64-NEXT:    movq %rax, (%r9)
-; X64-NEXT:    movq %rbp, %rsp
-; X64-NEXT:    popq %rbp
  ; X64-NEXT:    retq
  ;
  ; X32-LABEL: PR23476:
  ; X32:       # %bb.0:
  ; X32-NEXT:    pushl %ebp
  ; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-64, %esp
-; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $80, %esp
  ; X32-NEXT:    movl 52(%ebp), %eax
  ; X32-NEXT:    andl $7, %eax
  ; X32-NEXT:    movl 48(%ebp), %ecx
@@ -156,8 +150,8 @@ define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind {
  ; X32AVX:       # %bb.0:
  ; X32AVX-NEXT:    pushl %ebp
  ; X32AVX-NEXT:    movl %esp, %ebp
-; X32AVX-NEXT:    andl $-64, %esp
-; X32AVX-NEXT:    subl $128, %esp
+; X32AVX-NEXT:    andl $-32, %esp
+; X32AVX-NEXT:    subl $96, %esp
  ; X32AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; X32AVX-NEXT:    movl 52(%ebp), %eax
  ; X32AVX-NEXT:    andl $7, %eax
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll

index 564c789c98801084a1376b2a17b59037617e27cf..2e3824c8f03f48825009433df4f0f0ad2179df4f 100644 (file)
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -262,17 +262,11 @@ define <2 x double> @load_f64_v2f64(double* %p, i32 %y) nounwind {
  define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_i8_v32i8:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    andl $31, %esi
-; SSE-NEXT:    movb %dil, (%rsp,%rsi)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movb %dil, -40(%rsp,%rsi)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_i8_v32i8:
@@ -295,17 +289,11 @@ define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
  define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_i16_v16i16:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    andl $15, %esi
-; SSE-NEXT:    movw %di, (%rsp,%rsi,2)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movw %di, -40(%rsp,%rsi,2)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_i16_v16i16:
@@ -328,17 +316,11 @@ define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
  define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_i32_v8i32:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    andl $7, %esi
-; SSE-NEXT:    movl %edi, (%rsp,%rsi,4)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movl %edi, -40(%rsp,%rsi,4)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_i32_v8i32:
@@ -360,17 +342,11 @@ define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
  define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_i64_v4i64:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    andl $3, %esi
-; SSE-NEXT:    movq %rdi, (%rsp,%rsi,8)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movq %rdi, -40(%rsp,%rsi,8)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_i64_v4i64:
@@ -392,17 +368,11 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
  define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_f32_v8f32:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
  ; SSE-NEXT:    andl $7, %edi
-; SSE-NEXT:    movss %xmm0, (%rsp,%rdi,4)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movss %xmm0, -40(%rsp,%rdi,4)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_f32_v8f32:
@@ -422,17 +392,11 @@ define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
  define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
  ; SSE-LABEL: arg_f64_v4f64:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
  ; SSE-NEXT:    andl $3, %edi
-; SSE-NEXT:    movsd %xmm0, (%rsp,%rdi,8)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movsd %xmm0, -40(%rsp,%rdi,8)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: arg_f64_v4f64:
@@ -452,18 +416,12 @@ define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
  define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_i8_v32i8:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movb (%rdi), %al
  ; SSE-NEXT:    andl $31, %esi
-; SSE-NEXT:    movb %al, (%rsp,%rsi)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movb %al, -40(%rsp,%rsi)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: load_i8_v32i8:
@@ -487,18 +445,12 @@ define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
  define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_i16_v16i16:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movzwl (%rdi), %eax
  ; SSE-NEXT:    andl $15, %esi
-; SSE-NEXT:    movw %ax, (%rsp,%rsi,2)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movw %ax, -40(%rsp,%rsi,2)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: load_i16_v16i16:
@@ -522,18 +474,12 @@ define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
  define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_i32_v8i32:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movl (%rdi), %eax
  ; SSE-NEXT:    andl $7, %esi
-; SSE-NEXT:    movl %eax, (%rsp,%rsi,4)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movl %eax, -40(%rsp,%rsi,4)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: load_i32_v8i32:
@@ -548,18 +494,12 @@ define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
  define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_i64_v4i64:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movq (%rdi), %rax
  ; SSE-NEXT:    andl $3, %esi
-; SSE-NEXT:    movq %rax, (%rsp,%rsi,8)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movq %rax, -40(%rsp,%rsi,8)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: load_i64_v4i64:
@@ -574,18 +514,12 @@ define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
  define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_f32_v8f32:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
  ; SSE-NEXT:    andl $7, %esi
-; SSE-NEXT:    movss %xmm0, (%rsp,%rsi,4)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movss %xmm0, -40(%rsp,%rsi,4)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: load_f32_v8f32:
@@ -600,18 +534,12 @@ define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
  define <4 x double> @load_f64_v4f64(double* %p, i32 %y) nounwind {
  ; SSE-LABEL: load_f64_v4f64:
  ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    movq %rsp, %rbp
-; SSE-NEXT:    andq $-32, %rsp
-; SSE-NEXT:    subq $64, %rsp
  ; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
  ; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
  ; SSE-NEXT:    andl $3, %esi
-; SSE-NEXT:    movsd %xmm0, (%rsp,%rsi,8)
-; SSE-NEXT:    movaps (%rsp), %xmm0
-; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT:    movq %rbp, %rsp
-; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    movsd %xmm0, -40(%rsp,%rsi,8)
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: load_f64_v4f64:
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll

index f5c1b3bc2351e7581052cf3b7b3f4364e85259b5..3d4cf50fcf079ef64c80dd802c703cc733a684e6 100644 (file)
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -66,7 +66,7 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
  ; X86-NEXT:    pushl %edi
  ; X86-NEXT:    pushl %esi
  ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    subl $64, %esp
  ; X86-NEXT:    movzwl 8(%ebp), %esi
  ; X86-NEXT:    movzwl 12(%ebp), %edi
  ; X86-NEXT:    movzwl 20(%ebp), %ebx
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll

index 9767f8624572033342571f277f99fce15f980372..8bc971e79f50730a0b2bc751ce3804107b566917 100644 (file)
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -643,116 +643,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
  ; SSE3:       # %bb.0:
  ; SSE3-NEXT:    pushq %rbp
-; SSE3-NEXT:    movq %rsp, %rbp
  ; SSE3-NEXT:    pushq %r15
  ; SSE3-NEXT:    pushq %r14
  ; SSE3-NEXT:    pushq %r13
  ; SSE3-NEXT:    pushq %r12
  ; SSE3-NEXT:    pushq %rbx
-; SSE3-NEXT:    andq $-32, %rsp
-; SSE3-NEXT:    subq $608, %rsp # imm = 0x260
-; SSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
+; SSE3-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
-; SSE3-NEXT:    andl $31, %r9d
-; SSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
-; SSE3-NEXT:    movd %ebx, %xmm8
-; SSE3-NEXT:    andl $31, %eax
-; SSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
-; SSE3-NEXT:    movd %eax, %xmm15
+; SSE3-NEXT:    movaps %xmm0, (%rsp)
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT:    andl $31, %r8d
+; SSE3-NEXT:    movzbl -96(%rsp,%r8), %esi
+; SSE3-NEXT:    movd %esi, %xmm8
+; SSE3-NEXT:    andl $31, %ebp
+; SSE3-NEXT:    movzbl -64(%rsp,%rbp), %esi
+; SSE3-NEXT:    movd %esi, %xmm15
  ; SSE3-NEXT:    andl $31, %edx
-; SSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
-; SSE3-NEXT:    movd %eax, %xmm9
+; SSE3-NEXT:    movzbl -32(%rsp,%rdx), %edx
+; SSE3-NEXT:    movd %edx, %xmm9
  ; SSE3-NEXT:    andl $31, %ecx
-; SSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
-; SSE3-NEXT:    movd %eax, %xmm3
-; SSE3-NEXT:    andl $31, %esi
-; SSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
+; SSE3-NEXT:    movzbl (%rsp,%rcx), %ecx
+; SSE3-NEXT:    movd %ecx, %xmm3
+; SSE3-NEXT:    andl $31, %eax
+; SSE3-NEXT:    movzbl 32(%rsp,%rax), %eax
  ; SSE3-NEXT:    movd %eax, %xmm10
  ; SSE3-NEXT:    andl $31, %edi
-; SSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
+; SSE3-NEXT:    movzbl 64(%rsp,%rdi), %eax
  ; SSE3-NEXT:    movd %eax, %xmm7
-; SSE3-NEXT:    andl $31, %r8d
-; SSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
+; SSE3-NEXT:    andl $31, %ebx
+; SSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
  ; SSE3-NEXT:    movd %eax, %xmm11
-; SSE3-NEXT:    andl $31, %r10d
-; SSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
+; SSE3-NEXT:    andl $31, %r9d
+; SSE3-NEXT:    movzbl 128(%rsp,%r9), %eax
  ; SSE3-NEXT:    movd %eax, %xmm6
  ; SSE3-NEXT:    andl $31, %r13d
-; SSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
+; SSE3-NEXT:    movzbl 160(%rsp,%r13), %eax
  ; SSE3-NEXT:    movd %eax, %xmm12
  ; SSE3-NEXT:    andl $31, %r12d
-; SSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
+; SSE3-NEXT:    movzbl 192(%rsp,%r12), %eax
  ; SSE3-NEXT:    movd %eax, %xmm5
  ; SSE3-NEXT:    andl $31, %r15d
-; SSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
+; SSE3-NEXT:    movzbl 224(%rsp,%r15), %eax
  ; SSE3-NEXT:    movd %eax, %xmm13
  ; SSE3-NEXT:    andl $31, %r14d
-; SSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
+; SSE3-NEXT:    movzbl 256(%rsp,%r14), %eax
  ; SSE3-NEXT:    movd %eax, %xmm4
  ; SSE3-NEXT:    andl $31, %r11d
-; SSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
+; SSE3-NEXT:    movzbl 288(%rsp,%r11), %eax
  ; SSE3-NEXT:    movd %eax, %xmm14
-; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE3-NEXT:    andl $31, %eax
-; SSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
+; SSE3-NEXT:    andl $31, %r10d
+; SSE3-NEXT:    movzbl 320(%rsp,%r10), %eax
  ; SSE3-NEXT:    movd %eax, %xmm1
  ; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
  ; SSE3-NEXT:    andl $31, %eax
-; SSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
+; SSE3-NEXT:    movzbl 352(%rsp,%rax), %eax
  ; SSE3-NEXT:    movd %eax, %xmm2
  ; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
  ; SSE3-NEXT:    andl $31, %eax
-; SSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
+; SSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
  ; SSE3-NEXT:    movd %eax, %xmm0
  ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
  ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -769,7 +765,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
  ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE3-NEXT:    leaq -40(%rbp), %rsp
+; SSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
  ; SSE3-NEXT:    popq %rbx
  ; SSE3-NEXT:    popq %r12
  ; SSE3-NEXT:    popq %r13
@@ -781,116 +777,112 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
  ; SSSE3:       # %bb.0:
  ; SSSE3-NEXT:    pushq %rbp
-; SSSE3-NEXT:    movq %rsp, %rbp
  ; SSSE3-NEXT:    pushq %r15
  ; SSSE3-NEXT:    pushq %r14
  ; SSSE3-NEXT:    pushq %r13
  ; SSSE3-NEXT:    pushq %r12
  ; SSSE3-NEXT:    pushq %rbx
-; SSSE3-NEXT:    andq $-32, %rsp
-; SSSE3-NEXT:    subq $608, %rsp # imm = 0x260
-; SSSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
+; SSSE3-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
  ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT:    andl $31, %r9d
-; SSSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
-; SSSE3-NEXT:    movd %ebx, %xmm8
-; SSSE3-NEXT:    andl $31, %eax
-; SSSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
-; SSSE3-NEXT:    movd %eax, %xmm15
+; SSSE3-NEXT:    movaps %xmm0, (%rsp)
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSSE3-NEXT:    andl $31, %r8d
+; SSSE3-NEXT:    movzbl -96(%rsp,%r8), %esi
+; SSSE3-NEXT:    movd %esi, %xmm8
+; SSSE3-NEXT:    andl $31, %ebp
+; SSSE3-NEXT:    movzbl -64(%rsp,%rbp), %esi
+; SSSE3-NEXT:    movd %esi, %xmm15
  ; SSSE3-NEXT:    andl $31, %edx
-; SSSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm9
+; SSSE3-NEXT:    movzbl -32(%rsp,%rdx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm9
  ; SSSE3-NEXT:    andl $31, %ecx
-; SSSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    andl $31, %esi
-; SSSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
+; SSSE3-NEXT:    movzbl (%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    andl $31, %eax
+; SSSE3-NEXT:    movzbl 32(%rsp,%rax), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm10
  ; SSSE3-NEXT:    andl $31, %edi
-; SSSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
+; SSSE3-NEXT:    movzbl 64(%rsp,%rdi), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm7
-; SSSE3-NEXT:    andl $31, %r8d
-; SSSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
+; SSSE3-NEXT:    andl $31, %ebx
+; SSSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm11
-; SSSE3-NEXT:    andl $31, %r10d
-; SSSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
+; SSSE3-NEXT:    andl $31, %r9d
+; SSSE3-NEXT:    movzbl 128(%rsp,%r9), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm6
  ; SSSE3-NEXT:    andl $31, %r13d
-; SSSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
+; SSSE3-NEXT:    movzbl 160(%rsp,%r13), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm12
  ; SSSE3-NEXT:    andl $31, %r12d
-; SSSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
+; SSSE3-NEXT:    movzbl 192(%rsp,%r12), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm5
  ; SSSE3-NEXT:    andl $31, %r15d
-; SSSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
+; SSSE3-NEXT:    movzbl 224(%rsp,%r15), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm13
  ; SSSE3-NEXT:    andl $31, %r14d
-; SSSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
+; SSSE3-NEXT:    movzbl 256(%rsp,%r14), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm4
  ; SSSE3-NEXT:    andl $31, %r11d
-; SSSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
+; SSSE3-NEXT:    movzbl 288(%rsp,%r11), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm14
-; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSSE3-NEXT:    andl $31, %eax
-; SSSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
+; SSSE3-NEXT:    andl $31, %r10d
+; SSSE3-NEXT:    movzbl 320(%rsp,%r10), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm1
  ; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
  ; SSSE3-NEXT:    andl $31, %eax
-; SSSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
+; SSSE3-NEXT:    movzbl 352(%rsp,%rax), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm2
  ; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
  ; SSSE3-NEXT:    andl $31, %eax
-; SSSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
+; SSSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
  ; SSSE3-NEXT:    movd %eax, %xmm0
  ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
  ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -907,7 +899,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
  ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSSE3-NEXT:    leaq -40(%rbp), %rsp
+; SSSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
  ; SSSE3-NEXT:    popq %rbx
  ; SSSE3-NEXT:    popq %r12
  ; SSSE3-NEXT:    popq %r13
@@ -918,10 +910,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ;
  ; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
  ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pushq %rbp
-; SSE41-NEXT:    movq %rsp, %rbp
-; SSE41-NEXT:    andq $-32, %rsp
-; SSE41-NEXT:    subq $544, %rsp # imm = 0x220
+; SSE41-NEXT:    subq $392, %rsp # imm = 0x188
  ; SSE41-NEXT:    movd %xmm2, %eax
  ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
@@ -947,64 +936,63 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
  ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
  ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
  ; SSE41-NEXT:    movaps %xmm0, (%rsp)
-; SSE41-NEXT:    movzbl 480(%rsp,%rax), %eax
+; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movzbl 352(%rsp,%rax), %eax
  ; SSE41-NEXT:    movd %eax, %xmm0
  ; SSE41-NEXT:    pextrb $1, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $1, 448(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $1, 320(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $2, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $2, 416(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $2, 288(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $3, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $3, 384(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $3, 256(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $4, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $4, 352(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $4, 224(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $5, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $5, 320(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $5, 192(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $6, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $6, 288(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $6, 160(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $7, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $7, 256(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $7, 128(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $8, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $8, 224(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $8, 96(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $9, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $9, 192(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $9, 64(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $10, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $10, 160(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $10, 32(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $11, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $11, 128(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $11, (%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $12, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $12, 96(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $12, -32(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $13, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $13, 64(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $13, -64(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $14, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $14, 32(%rsp,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $14, -96(%rsp,%rax), %xmm0
  ; SSE41-NEXT:    pextrb $15, %xmm2, %eax
  ; SSE41-NEXT:    andl $31, %eax
-; SSE41-NEXT:    pinsrb $15, (%rsp,%rax), %xmm0
-; SSE41-NEXT:    movq %rbp, %rsp
-; SSE41-NEXT:    popq %rbp
+; SSE41-NEXT:    pinsrb $15, -128(%rsp,%rax), %xmm0
+; SSE41-NEXT:    addq $392, %rsp # imm = 0x188
  ; SSE41-NEXT:    retq
  ;
  ; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll

index c3c1932c2311e52c3b450e927878876477df98a0..3794bd2ce94bf7175336884b6496b79babf7389a 100644 (file)
--- a/llvm/test/CodeGen/X86/vec_fneg.ll
+++ b/llvm/test/CodeGen/X86/vec_fneg.ll
@@ -121,7 +121,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
  ; X32-SSE1-NEXT:    pushl %ebp
  ; X32-SSE1-NEXT:    movl %esp, %ebp
  ; X32-SSE1-NEXT:    andl $-16, %esp
-; X32-SSE1-NEXT:    subl $32, %esp
+; X32-SSE1-NEXT:    subl $16, %esp
  ; X32-SSE1-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
  ; X32-SSE1-NEXT:    movl 12(%ebp), %ecx
  ; X32-SSE1-NEXT:    xorl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_insert-4.ll b/llvm/test/CodeGen/X86/vec_insert-4.ll

index 2c34b3b7d7a1d89d0e79e36f11722273afa6c4a9..ed8833b95b2b5dc24ec108b6439c5bb96a11a4cc 100644 (file)
--- a/llvm/test/CodeGen/X86/vec_insert-4.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-4.ll
@@ -5,36 +5,26 @@
  define <8 x float> @f(<8 x float> %a, i32 %b) nounwind  {
  ; X32-LABEL: f:
  ; X32:       ## %bb.0: ## %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-32, %esp
-; X32-NEXT:    subl $64, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    subl $44, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  ; X32-NEXT:    andl $7, %eax
  ; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
  ; X32-NEXT:    movaps %xmm0, (%esp)
  ; X32-NEXT:    movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
  ; X32-NEXT:    movaps (%esp), %xmm0
  ; X32-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
+; X32-NEXT:    addl $44, %esp
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: f:
  ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    movq %rsp, %rbp
-; X64-NEXT:    andq $-32, %rsp
-; X64-NEXT:    subq $64, %rsp
  ; X64-NEXT:    ## kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps %xmm0, (%rsp)
+; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
  ; X64-NEXT:    andl $7, %edi
-; X64-NEXT:    movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000
-; X64-NEXT:    movaps (%rsp), %xmm0
-; X64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; X64-NEXT:    movq %rbp, %rsp
-; X64-NEXT:    popq %rbp
+; X64-NEXT:    movl $1084227584, -40(%rsp,%rdi,4) ## imm = 0x40A00000
+; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
  ; X64-NEXT:    retq
  entry:
    %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll

index 98a35c4a7934f93dc09008a87452a7fd91815ebb..f6ab64975cd359a4ef50141a256cb63bc981a2c8 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
@@ -9,8 +9,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
  ; X32-SSE:       # %bb.0:
  ; X32-SSE-NEXT:    pushl %ebp
  ; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    andl $-128, %esp
-; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
+; X32-SSE-NEXT:    andl $-16, %esp
+; X32-SSE-NEXT:    subl $272, %esp # imm = 0x110
  ; X32-SSE-NEXT:    movl 88(%ebp), %ecx
  ; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
  ; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
@@ -43,33 +43,29 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
  ;
  ; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
  ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rbp
-; X64-SSE-NEXT:    movq %rsp, %rbp
-; X64-SSE-NEXT:    andq $-128, %rsp
-; X64-SSE-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-SSE-NEXT:    pushq %rax
  ; X64-SSE-NEXT:    # kill: def $edi killed $edi def $rdi
  ; X64-SSE-NEXT:    psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
  ; X64-SSE-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movaps %xmm0, (%rsp)
-; X64-SSE-NEXT:    movdqa %xmm7, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp)
  ; X64-SSE-NEXT:    andl $15, %edi
-; X64-SSE-NEXT:    movq (%rsp,%rdi,8), %rax
-; X64-SSE-NEXT:    movq %rbp, %rsp
-; X64-SSE-NEXT:    popq %rbp
+; X64-SSE-NEXT:    movq -128(%rsp,%rdi,8), %rax
+; X64-SSE-NEXT:    popq %rcx
  ; X64-SSE-NEXT:    retq
  ;
  ; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
  ; X32-AVX:       # %bb.0:
  ; X32-AVX-NEXT:    pushl %ebp
  ; X32-AVX-NEXT:    movl %esp, %ebp
-; X32-AVX-NEXT:    andl $-128, %esp
-; X32-AVX-NEXT:    subl $384, %esp # imm = 0x180
+; X32-AVX-NEXT:    andl $-32, %esp
+; X32-AVX-NEXT:    subl $288, %esp # imm = 0x120
  ; X32-AVX-NEXT:    movl 40(%ebp), %ecx
  ; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -96,8 +92,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
  ; X64-AVX:       # %bb.0:
  ; X64-AVX-NEXT:    pushq %rbp
  ; X64-AVX-NEXT:    movq %rsp, %rbp
-; X64-AVX-NEXT:    andq $-128, %rsp
-; X64-AVX-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-AVX-NEXT:    andq $-32, %rsp
+; X64-AVX-NEXT:    subq $160, %rsp
  ; X64-AVX-NEXT:    # kill: def $edi killed $edi def $rdi
  ; X64-AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
  ; X64-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
author	David Sherwood <david.sherwood@arm.com>
	Tue, 2 Jun 2020 10:16:23 +0000 (11:16 +0100)
committer	David Sherwood <david.sherwood@arm.com>
	Tue, 9 Jun 2020 07:10:17 +0000 (08:10 +0100)
llvm/include/llvm/CodeGen/SelectionDAG.h		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp		patch \| blob \| history
llvm/test/CodeGen/AArch64/build-one-lane.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/scratch-simple.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-insert-extract.ll		patch \| blob \| history
llvm/test/CodeGen/X86/extractelement-index.ll		patch \| blob \| history
llvm/test/CodeGen/X86/half.ll		patch \| blob \| history
llvm/test/CodeGen/X86/i64-mem-copy.ll		patch \| blob \| history
llvm/test/CodeGen/X86/insertelement-var-index.ll		patch \| blob \| history
llvm/test/CodeGen/X86/pr31088.ll		patch \| blob \| history
llvm/test/CodeGen/X86/var-permute-128.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vec_fneg.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vec_insert-4.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vector-extend-inreg.ll		patch \| blob \| history