From d3f0d03cc5f35c5a5816878a8aa70163d6a0d987 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 May 2017 18:52:15 +0000 Subject: [PATCH] [X86][AVX1] Account for cost of extract/insert of 256-bit SDIV/UDIV by mul sequences llvm-svn: 303017 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 34 +++++++++++++------------- llvm/test/Analysis/CostModel/X86/div.ll | 32 ++++++++++++------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2952925..fc9fcfc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -247,29 +247,29 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { - { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. - { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - - { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. - { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. - { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. - - { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. + { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. + { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { // pmuldq sequence. if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) - return LT.first * 30; + return LT.first * 32; if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; diff --git a/llvm/test/Analysis/CostModel/X86/div.ll b/llvm/test/Analysis/CostModel/X86/div.ll index 0ac06ff..dabaaef 100644 --- a/llvm/test/Analysis/CostModel/X86/div.ll +++ b/llvm/test/Analysis/CostModel/X86/div.ll @@ -139,14 +139,14 @@ define i32 @sdiv_uniformconst() { ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv - ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv %V8i32 = sdiv <8 x i32> undef, ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv - ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv %V16i32 = sdiv <16 x i32> undef, @@ -157,12 +157,12 @@ define i32 @sdiv_uniformconst() { ; AVX: cost of 6 {{.*}} %V8i16 = sdiv %V8i16 = sdiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = sdiv - ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv %V16i16 = sdiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = sdiv - ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv @@ -203,12 +203,12 @@ define i32 @udiv_uniformconst() { ; AVX: cost of 15 {{.*}} %V4i32 = udiv %V4i32 = udiv <4 x i32> undef, ; SSE: cost of 30 {{.*}} %V8i32 = udiv - ; AVX1: cost of 30 {{.*}} %V8i32 = udiv + ; AVX1: cost of 32 {{.*}} %V8i32 = udiv ; AVX2: cost of 15 {{.*}} %V8i32 = udiv ; AVX512: cost of 15 {{.*}} %V8i32 = udiv %V8i32 = udiv <8 x i32> undef, ; SSE: cost of 60 {{.*}} %V16i32 = udiv - ; AVX1: cost of 60 {{.*}} %V16i32 = udiv + ; AVX1: cost of 64 {{.*}} %V16i32 = udiv ; AVX2: cost of 30 {{.*}} %V16i32 = udiv ; AVX512: cost of 15 {{.*}} %V16i32 = udiv %V16i32 = udiv <16 x i32> undef, @@ -219,12 +219,12 @@ define i32 @udiv_uniformconst() { ; AVX: cost of 6 {{.*}} %V8i16 = udiv %V8i16 = udiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = udiv - ; AVX1: cost of 12 {{.*}} %V16i16 = udiv + ; AVX1: cost of 14 {{.*}} %V16i16 = udiv ; AVX2: cost of 6 {{.*}} %V16i16 = udiv ; AVX512: cost of 6 {{.*}} %V16i16 = udiv %V16i16 = udiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = udiv - ; AVX1: cost of 24 {{.*}} %V32i16 = udiv + ; AVX1: cost of 28 {{.*}} %V32i16 = udiv ; AVX2: cost of 12 {{.*}} %V32i16 = udiv ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv @@ -269,14 +269,14 @@ define i32 @sdiv_uniformconstpow2() { ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv - ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv %V8i32 = sdiv <8 x i32> undef, ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv - ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv %V16i32 = sdiv <16 x i32> undef, @@ -287,12 +287,12 @@ define i32 @sdiv_uniformconstpow2() { ; AVX: cost of 6 {{.*}} %V8i16 = sdiv %V8i16 = sdiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = sdiv - ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv %V16i16 = sdiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = sdiv - ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv @@ -333,12 +333,12 @@ define i32 @udiv_uniformconstpow2() { ; AVX: cost of 15 {{.*}} %V4i32 = udiv %V4i32 = udiv <4 x i32> undef, ; SSE: cost of 30 {{.*}} %V8i32 = udiv - ; AVX1: cost of 30 {{.*}} %V8i32 = udiv + ; AVX1: cost of 32 {{.*}} %V8i32 = udiv ; AVX2: cost of 15 {{.*}} %V8i32 = udiv ; AVX512: cost of 15 {{.*}} %V8i32 = udiv %V8i32 = udiv <8 x i32> undef, ; SSE: cost of 60 {{.*}} %V16i32 = udiv - ; AVX1: cost of 60 {{.*}} %V16i32 = udiv + ; AVX1: cost of 64 {{.*}} %V16i32 = udiv ; AVX2: cost of 30 {{.*}} %V16i32 = udiv ; AVX512: cost of 15 {{.*}} %V16i32 = udiv %V16i32 = udiv <16 x i32> undef, @@ -349,12 +349,12 @@ define i32 @udiv_uniformconstpow2() { ; AVX: cost of 6 {{.*}} %V8i16 = udiv %V8i16 = udiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = udiv - ; AVX1: cost of 12 {{.*}} %V16i16 = udiv + ; AVX1: cost of 14 {{.*}} %V16i16 = udiv ; AVX2: cost of 6 {{.*}} %V16i16 = udiv ; AVX512: cost of 6 {{.*}} %V16i16 = udiv %V16i16 = udiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = udiv - ; AVX1: cost of 24 {{.*}} %V32i16 = udiv + ; AVX1: cost of 28 {{.*}} %V32i16 = udiv ; AVX2: cost of 12 {{.*}} %V32i16 = udiv ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv -- 2.7.4