From 14a4af0e66553aa4b3fecb95d7d36e8f01ca7b8d Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Wed, 19 Dec 2012 07:50:20 +0000
Subject: [PATCH] Optimized load + SIGN_EXTEND patterns in the X86 backend.

llvm-svn: 170506
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    |  1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 25 ++++++--
 llvm/lib/Target/X86/X86InstrSSE.td               | 82 ++++++++++++++++++++++++
 llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll |  4 +-
 llvm/test/CodeGen/X86/avx-sext.ll                | 56 +++++++++++++++-
 llvm/test/CodeGen/X86/avx2-conversions.ll        | 41 ++++++++++++
 6 files changed, 202 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7c54d17..527fdaa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5235,6 +5235,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
                                        LN0->getAlignment());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      AddToWorkList(ExtLoad.getNode());
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
     // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 97f2a35..fee9d93 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15929,10 +15929,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
 
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. We need SSSE3 shuffles.
+  // SEXT loads are supported starting with SSE41.
+  // We generate X86ISD::VSEXT for them.
   // TODO: It is possible to support ZExt by zeroing the undef values
   // during the shuffle phase or after the shuffle.
   if (RegVT.isVector() && RegVT.isInteger() &&
-      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
+       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))) {
     assert(MemVT != RegVT && "Cannot extend to the same type");
     assert(MemVT.isVector() && "Must load a vector from memory");
 
@@ -15941,6 +15944,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     unsigned MemSz = MemVT.getSizeInBits();
     assert(RegSz > MemSz && "Register size must be greater than the mem size");
 
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
     // All sizes must be a power of two.
     if (!isPowerOf2_32(RegSz * MemSz * NumElems))
       return SDValue();
@@ -15964,16 +15970,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     // Calculate the number of scalar loads that we need to perform
     // in order to load our vector from memory.
     unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegSize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegSize /= 2;
 
     // Represent our vector as a sequence of elements which are the
    // largest scalar that we can load.
     EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-      RegSz/SclrLoadTy.getSizeInBits());
+      loadRegSize/SclrLoadTy.getSizeInBits());
 
     // Represent the data using the same element type that is stored in
     // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-      RegSz/MemVT.getScalarType().getSizeInBits());
+    EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegSize/MemVT.getScalarType().getSizeInBits());
 
     assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
       "Invalid vector type");
@@ -16014,6 +16027,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
     unsigned SizeRatio = RegSz/MemSz;
 
+    if (Ext == ISD::SEXTLOAD) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      return DCI.CombineTo(N, Sext, TF, true);
+    }
     // Redistribute the loaded elements into the different locations.
     SmallVector<int,8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 54032fe..521073d 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5842,6 +5842,31 @@ defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
 defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
 
+let Predicates = [HasAVX2] in {
+  def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXDQYrm addr:$src)>;
+
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
@@ -5866,6 +5891,34 @@ let Predicates = [UseSSE41] in {
             (bitconvert (v4i32 (X86vzmovl (v4i32
               (scalar_to_vector (loadi32 addr:$src))))))),
             (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
 }
 
 let Predicates = [HasAVX2] in {
@@ -5926,6 +5979,35 @@ let Predicates = [HasAVX] in {
                                           (VPMOVZXDQrm addr:$src)>;
   def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
                                           (VPMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index a720753..da734d4 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: main
 define i32 @main() nounwind uwtable {
 entry:
-; CHECK: movsbq j(%rip), %
-; CHECK: movsbq i(%rip), %
+; CHECK: pmovsxbq j(%rip), %
+; CHECK: pmovsxbq i(%rip), %
   %0 = load <2 x i8>* @i, align 8
   %1 = load <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
diff --git a/llvm/test/CodeGen/X86/avx-sext.ll b/llvm/test/CodeGen/X86/avx-sext.ll
index 3713a8c..425d09c 100755
--- a/llvm/test/CodeGen/X86/avx-sext.ll
+++ b/llvm/test/CodeGen/X86/avx-sext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
 
 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
 ;CHECK: sext_8i16_to_8i32
@@ -15,3 +15,57 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
   %B = sext <4 x i32> %A to <4 x i64>
   ret <4 x i64>%B
 }
+
+; CHECK: load_sext_test1
+; CHECK: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; CHECK: load_sext_test2
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; CHECK: load_sext_test3
+; CHECK: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+ %X = load <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test4
+; CHECK: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+ %X = load <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test5
+; CHECK: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+ %X = load <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; CHECK: load_sext_test6
+; CHECK: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
+; CHECK: ret
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16>%Y
+}
diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll
index b474913..3ce08dc 100755
--- a/llvm/test/CodeGen/X86/avx2-conversions.ll
+++ b/llvm/test/CodeGen/X86/avx2-conversions.ll
@@ -63,6 +63,47 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
   ret <8 x i32>%B
 }
 
+; CHECK: load_sext_test1
+; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
+ %X = load <4 x i32>* %ptr
+ %Y = sext <4 x i32> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+
+; CHECK: load_sext_test2
+; CHECK: vpmovsxbq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+; CHECK: load_sext_test3
+; CHECK: vpmovsxwq (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+; CHECK: load_sext_test4
+; CHECK: vpmovsxwd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
+ %X = load <8 x i16>* %ptr
+ %Y = sext <8 x i16> %X to <8 x i32>
+ ret <8 x i32>%Y
+}
+; CHECK: load_sext_test5
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: ret
+define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i32>
+ ret <8 x i32>%Y
+}
-- 
2.7.4
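
For reference, a standalone IR sample (not part of the patch; the function
name is illustrative) that exercises the new combine. It mirrors
load_sext_test1 from avx-sext.ll above: the load and the sign extension can
now be selected as a single folded vpmovsxwd, where previously such
extensions were emitted as separate instructions after the load (compare the
2011-10-19-LegelizeLoad.ll checks, where scalar movsbq loads become
pmovsxbq):

  define <4 x i32> @sext_load_sample(<4 x i16>* %ptr) {
    %v = load <4 x i16>* %ptr             ; 64-bit memory operand
    %s = sext <4 x i16> %v to <4 x i32>   ; extended by the same instruction
    ret <4 x i32> %s
  }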