From 7b3120b9ae1bfb5b1bc1e3a2522776194c3ebbdf Mon Sep 17 00:00:00 2001
From: Nadav Rotem
Date: Sat, 19 Jan 2013 08:38:41 +0000
Subject: [PATCH] On Sandybridge split unaligned 256bit stores into two xmm-sized stores.

llvm-svn: 172894
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 27 ++++++++++++++++---------
 llvm/test/CodeGen/X86/2012-01-11-split-cv.ll    |  2 +-
 llvm/test/CodeGen/X86/MergeConsecutiveStores.ll |  2 +-
 llvm/test/CodeGen/X86/avx-load-store.ll         | 11 +++++++---
 llvm/test/CodeGen/X86/avx-sext.ll               | 12 -----------
 llvm/test/CodeGen/X86/fp-load-trunc.ll          |  4 ++--
 llvm/test/CodeGen/X86/sandybridge-loads.ll      | 24 +++++++++++++++++++---
 llvm/test/CodeGen/X86/v8i1-masks.ll             |  8 ++++----
 llvm/test/CodeGen/X86/vec_fpext.ll              |  2 +-
 9 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b6b10e2..ca8cd74 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16344,12 +16344,15 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
 
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
 
   // On Sandybridge unaligned 256bit loads are inefficient.
   if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
-      Ext == ISD::NON_EXTLOAD) {
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
     SDValue Ptr = Ld->getBasePtr();
     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
 
@@ -16363,7 +16366,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                 Ld->getPointerInfo(), Ld->isVolatile(),
                                 Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
+                                std::max(Alignment/2U, 1U));
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              Load1.getValue(1),
                              Load2.getValue(1));
@@ -16536,16 +16539,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   DebugLoc dl = St->getDebugLoc();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
 
   // If we are saving a concatenation of two XMM registers, perform two stores.
   // On Sandy Bridge, 256-bit memory operations are executed by two
   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   // memory operation.
   if (VT.is256BitVector() && !Subtarget->hasInt256() &&
-      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
-      StoredVal.getNumOperands() == 2) {
-    SDValue Value0 = StoredVal.getOperand(0);
-    SDValue Value1 = StoredVal.getOperand(1);
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
 
     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
     SDValue Ptr0 = St->getBasePtr();
@@ -16553,10 +16561,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
 
     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(), Alignment);
     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(),
+                               std::max(Alignment/2U, 1U));
 
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   }
diff --git a/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll b/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll
index 6b90072..7e91498 100644
--- a/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -2,7 +2,7 @@
 
 ;CHECK: add18i16
 define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovups
+;CHECK: vmovaps
   %b = load <18 x i16>* %bp, align 16
   %x = add <18 x i16> zeroinitializer, %b
   store <18 x i16> %x, <18 x i16>* %ret, align 16
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index 64825ba..52deadc 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -42,7 +42,7 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt
 
 ; Move the constants using a single vector store.
 ; CHECK: merge_const_store_vec
-; CHECK: vmovups %ymm0, (%rsi)
+; CHECK: vmovups
 ; CHECK: ret
 define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index c9fc66a..77a7c4f 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -53,19 +53,24 @@ define void @storev16i16(<16 x i16> %a) nounwind {
   unreachable
 }
 
-; CHECK: vmovups %ymm
+; CHECK: storev16i16_01
+; CHECK: vextractf128
+; CHECK: vmovaps %xmm
 define void @storev16i16_01(<16 x i16> %a) nounwind {
   store <16 x i16> %a, <16 x i16>* undef, align 4
   unreachable
 }
 
+; CHECK: storev32i8
 ; CHECK: vmovaps %ymm
 define void @storev32i8(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 32
   unreachable
 }
 
-; CHECK: vmovups %ymm
+; CHECK: storev32i8_01
+; CHECK: vextractf128
+; CHECK: vmovups %xmm
 define void @storev32i8_01(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 4
   unreachable
@@ -76,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind {
 ; CHECK: _double_save
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
+; CHECK: vmovups %xmm
 ; CHECK: vmovaps %xmm
 define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/avx-sext.ll b/llvm/test/CodeGen/X86/avx-sext.ll
index 5201575..adee9bb 100755
--- a/llvm/test/CodeGen/X86/avx-sext.ll
+++ b/llvm/test/CodeGen/X86/avx-sext.ll
@@ -186,18 +186,6 @@ define void @sext_4(<4 x i16>* %inbuf, <4 x i64>* %outbuf) {
   ret void
 }
 
-; AVX: sext_5
-; AVX: vpmovsxbw
-; AVX: vpmovsxwd
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: ret
-define void @sext_5(<8 x i8>* %inbuf, <8 x i64>* %outbuf) {
-  %v0 = load <8 x i8>* %inbuf
-  %r = sext <8 x i8> %v0 to <8 x i64>
-  store <8 x i64> %r, <8 x i64>* %outbuf
-  ret void
-}
 ; AVX: sext_6
 ; AVX: vpmovsxbw
 ; AVX: vpmovsxwd
diff --git a/llvm/test/CodeGen/X86/fp-load-trunc.ll b/llvm/test/CodeGen/X86/fp-load-trunc.ll
index 2ae65c9..a973bef 100644
--- a/llvm/test/CodeGen/X86/fp-load-trunc.ll
+++ b/llvm/test/CodeGen/X86/fp-load-trunc.ll
@@ -49,8 +49,8 @@ define <8 x float> @test4(<8 x double>* %p) nounwind {
 ; CHECK: movlhps
 ; CHECK: ret
 ; AVX: test4
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
 ; AVX: vinsertf128
 ; AVX: ret
   %x = load <8 x double>* %p
diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll
index d85c32e..5a23cf1 100644
--- a/llvm/test/CodeGen/X86/sandybridge-loads.ll
+++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll
@@ -3,7 +3,7 @@
 ;CHECK: wideloads
 ;CHECK: vmovaps
 ;CHECK: vinsertf128
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK-NOT: vinsertf128
 ;CHECK: ret
 
@@ -11,11 +11,29 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
   %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
   %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 16
+  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
+  ret void
+}
+
+; CHECK: widestores
+; loads:
+; CHECK: vmovaps
+; CHECK: vmovaps
+; stores:
+; CHECK: vmovaps
+; CHECK: vextractf128
+; CHECK: vmovaps
+;CHECK: ret
+
+define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 32
+  %v1 = load <8 x float>* %b, align 32
+  store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned
+  store <8 x float> %v1, <8 x float>* %a, align 16 ; <--- unaligned
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index ea231af..8cbfb5d 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -6,7 +6,7 @@
 ;CHECK: vcmpltp
 ;CHECK: vandps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret
 
 define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
@@ -17,7 +17,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }
 
@@ -25,7 +25,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ;CHECK: vcmpltps
 ;CHECK: vxorps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret
 define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
   %v0 = load <8 x float>* %a, align 16
@@ -33,7 +33,7 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %m0 = fcmp olt <8 x float> %v1, %v0
   %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll
index dc0464f..e4a8f46 100644
--- a/llvm/test/CodeGen/X86/vec_fpext.ll
+++ b/llvm/test/CodeGen/X86/vec_fpext.ll
@@ -29,8 +29,8 @@ entry:
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
 ; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
  %0 = load <8 x float>* %in
  %1 = fpext <8 x float> %0 to <8 x double>
  store <8 x double> %1, <8 x double>* %out, align 1
-- 
2.7.4
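
Post-commit note (not part of the patch): the sketch below illustrates, with AVX
intrinsics rather than SelectionDAG nodes, the shape of code this combine aims to
produce on Sandy Bridge for an unaligned 256-bit store: extract the two 128-bit
halves and issue two xmm-sized stores instead of one 32-byte vmovups. The file
name, function name, and buffer are illustrative assumptions, not taken from the
patch; on targets with 256-bit integer support (Subtarget->hasInt256(), i.e.
Haswell) the combine is skipped and a single 256-bit access is preferred, as the
comment in PerformSTORECombine says.

  // sketch.cpp (hypothetical): compile with -mavx; mirrors the split-store
  // pattern only, it is not the DAG combine itself.
  #include <immintrin.h>
  #include <cstdio>

  // Store a 256-bit vector to a possibly unaligned address as two 128-bit stores.
  static void store256_split(float *dst, __m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);    // low 128 bits
    __m128 hi = _mm256_extractf128_ps(v, 1);  // high 128 bits (vextractf128)
    _mm_storeu_ps(dst, lo);                   // first xmm-sized store
    _mm_storeu_ps(dst + 4, hi);               // second xmm-sized store, 16 bytes later
  }

  int main() {
    alignas(32) float buf[12] = {0};          // extra slots so dst can be misaligned
    store256_split(buf + 1, _mm256_set1_ps(1.0f)); // only 4-byte-aligned destination
    for (int i = 0; i < 12; ++i)
      std::printf("%g ", buf[i]);
    std::printf("\n");
    return 0;
  }

The load side of the patch is symmetric: an unaligned 256-bit load becomes two
128-bit loads joined by vinsertf128, and, as in the store case, the second half is
given a conservatively reduced alignment of std::max(Alignment/2U, 1U).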