From d21bf514940fd3b6368796e3ad22e1910c8598c6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 6 May 2022 11:41:45 +0100
Subject: [PATCH] [CostModel][X86] Adjust pre-SSE41 fp scalar select costs to
 account for vector ops

Based off the script from D103695, we now mainly use BLENDV or OR(AND,ANDN) to select scalar float/double ops
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp     |   4 +
 llvm/test/Analysis/CostModel/X86/vselect-cost.ll   |   4 +-
 .../Transforms/SLPVectorizer/X86/crash_cmpop.ll    | 116 +++++++--------------
 3 files changed, 44 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 6bb79ab..ca54e1b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2753,7 +2753,9 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 
   static const CostTblEntry SSE41CostTbl[] = {
     { ISD::SELECT,  MVT::v2f64,   1 }, // blendvpd
+    { ISD::SELECT,  MVT::f64,     1 }, // blendvpd
     { ISD::SELECT,  MVT::v4f32,   1 }, // blendvps
+    { ISD::SELECT,  MVT::f32  ,   1 }, // blendvps
     { ISD::SELECT,  MVT::v2i64,   1 }, // pblendvb
     { ISD::SELECT,  MVT::v4i32,   1 }, // pblendvb
     { ISD::SELECT,  MVT::v8i16,   1 }, // pblendvb
@@ -2769,6 +2771,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16i8,   1 },
 
     { ISD::SELECT,  MVT::v2f64,   2 }, // andpd + andnpd + orpd
+    { ISD::SELECT,  MVT::f64,     2 }, // andpd + andnpd + orpd
     { ISD::SELECT,  MVT::v2i64,   2 }, // pand + pandn + por
     { ISD::SELECT,  MVT::v4i32,   2 }, // pand + pandn + por
     { ISD::SELECT,  MVT::v8i16,   2 }, // pand + pandn + por
@@ -2780,6 +2783,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::f32,     1 },
 
     { ISD::SELECT,  MVT::v4f32,   2 }, // andps + andnps + orps
+    { ISD::SELECT,  MVT::f32,     2 }, // andps + andnps + orps
   };
 
   if (ST->useSLMArithCosts())
diff --git a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
index 88e81ac..9c475f9 100644
--- a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
@@ -148,11 +148,11 @@ define i32 @test_select() {
 
 define i32 @test_select_fp() {
 ; SSE2-LABEL: 'test_select_fp'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = select i1 undef, double undef, double undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = select i1 undef, double undef, double undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = select <8 x i1> undef, <8 x double> undef, <8 x double> undef
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = select i1 undef, float undef, float undef
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = select i1 undef, float undef, float undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = select <8 x i1> undef, <8 x float> undef, <8 x float> undef
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = select <16 x i1> undef, <16 x float> undef, <16 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
index 2e84c10..ef1ed33 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -1,87 +1,47 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s -check-prefix=SSE
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s -check-prefix=AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
 
 define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
-; SSE-LABEL: @testfunc(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    br label [[FOR_BODY:%.*]]
-; SSE:       for.body:
-; SSE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; SSE-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; SSE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SSE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; SSE-NEXT:    store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
-; SSE-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP5]], i32 1
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP1]], i32 1
-; SSE-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP6]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP2]], [[TMP9]]
-; SSE-NEXT:    [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT:    [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
-; SSE-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
-; SSE-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
-; SSE-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; SSE-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; SSE-NEXT:    [[ADD13]] = fadd float [[TMP16]], [[TMP17]]
-; SSE-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0
-; SSE-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1
-; SSE-NEXT:    [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT:    [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], <float -1.000000e+00, float -1.000000e+00>
-; SSE-NEXT:    [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP21]]
-; SSE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
-; SSE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; SSE:       for.end:
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @testfunc(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX:       for.body:
-; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT:    [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT:    store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; AVX-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
-; AVX-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
-; AVX-NEXT:    [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
-; AVX-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; AVX-NEXT:    [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
-; AVX-NEXT:    [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT:    [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
-; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
-; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; AVX:       for.end:
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @testfunc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT:    [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
-- 
2.7.4