From d21bf514940fd3b6368796e3ad22e1910c8598c6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 6 May 2022 11:41:45 +0100 Subject: [PATCH] [CostModel][X86] Adjust pre-SSE41 fp scalar select costs to account for vector ops Based off the script from D103695, we now mainly use BLENDV or OR(AND,ANDN) to select scalar float/double ops --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4 + llvm/test/Analysis/CostModel/X86/vselect-cost.ll | 4 +- .../Transforms/SLPVectorizer/X86/crash_cmpop.ll | 116 +++++++-------------- 3 files changed, 44 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 6bb79ab..ca54e1b 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2753,7 +2753,9 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, static const CostTblEntry SSE41CostTbl[] = { { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd + { ISD::SELECT, MVT::f64, 1 }, // blendvpd { ISD::SELECT, MVT::v4f32, 1 }, // blendvps + { ISD::SELECT, MVT::f32 , 1 }, // blendvps { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb @@ -2769,6 +2771,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, { ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd + { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd { ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por { ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por { ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por @@ -2780,6 +2783,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::f32, 1 }, { ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps + { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps }; if (ST->useSLMArithCosts()) diff --git a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll index 88e81ac..9c475f9 100644 --- a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll @@ -148,11 +148,11 @@ define i32 @test_select() { define i32 @test_select_fp() { ; SSE2-LABEL: 'test_select_fp' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = select i1 undef, double undef, double undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = select i1 undef, double undef, double undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = select <4 x i1> undef, <4 x double> undef, <4 x double> undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = select <8 x i1> undef, <8 x double> undef, <8 x double> undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = select i1 undef, float undef, float undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = select i1 undef, float undef, float undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = select <8 x i1> undef, <8 x float> undef, <8 x float> undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = select <16 x i1> undef, <16 x float> undef, <16 x float> undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll index 2e84c10..ef1ed33 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -1,87 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s -check-prefix=SSE -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { -; SSE-LABEL: @testfunc( -; SSE-NEXT: entry: -; SSE-NEXT: br label [[FOR_BODY:%.*]] -; SSE: for.body: -; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP5]], i32 1 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP1]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP6]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP2]], [[TMP9]] -; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; SSE-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; SSE-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; SSE-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; SSE-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] -; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 -; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; SSE: for.end: -; SSE-NEXT: ret void -; -; AVX-LABEL: @testfunc( -; AVX-NEXT: entry: -; AVX-NEXT: br label [[FOR_BODY:%.*]] -; AVX: for.body: -; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] -; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] -; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> -; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 -; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], -; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> -; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] -; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; AVX: for.end: -; AVX-NEXT: ret void +; CHECK-LABEL: @testfunc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: br label %for.body -- 2.7.4