[CostModel][X86] Adjust uitofp(vXi64) SSE/AVX legalized costs based on llvm-mca reports.

author Simon Pilgrim <llvm-dev@redking.me.uk>
Fri, 2 Jul 2021 12:07:07 +0000 (13:07 +0100)

committer Simon Pilgrim <llvm-dev@redking.me.uk>
Fri, 2 Jul 2021 12:09:00 +0000 (13:09 +0100)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

index a58b150..265c4d7 100644 (file)
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1968,6 +1968,8 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 4 },
      { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 5 },
      { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i64, 18 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 10 },
      { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 8 },
      { ISD::UINT_TO_FP,  MVT::v8f64, MVT::v8i32, 10 },
      { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
@@ -2069,6 +2071,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  
      { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
      { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 12 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64, 22 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  4 },
  
      { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f32,  3 },
      { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f64,  3 },
diff --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll

index 94d5a7c..56cac09 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/uitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll
@@ -114,16 +114,16 @@ define i32 @uitofp_i64_double() {
  ;
  ; SSE42-LABEL: 'uitofp_i64_double'
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i64_double'
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX512F-LABEL: 'uitofp_i64_double'
@@ -265,24 +265,24 @@ define i32 @uitofp_i64_float() {
  ;
  ; SSE42-LABEL: 'uitofp_i64_float'
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i64_float'
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX512F-LABEL: 'uitofp_i64_float'
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll

index e08ef00..6a43fa8 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0"
  
  ; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
  ; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 10 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
  define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
  entry:
    br label %for.body
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll

index e3b8beb..2544a87 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -575,38 +575,11 @@ define void @uitofp_2i64_2f32() #0 {
  }
  
  define void @uitofp_4i64_4f32() #0 {
-; SSE-LABEL: @uitofp_4i64_4f32(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    ret void
-;
-; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
-; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_4i64_4f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX512-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX512-NEXT:    ret void
-;
-; AVX256DQ-LABEL: @uitofp_4i64_4f32(
-; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; AVX256DQ-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX256DQ-NEXT:    ret void
+; CHECK-LABEL: @uitofp_4i64_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
    %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -633,44 +606,11 @@ define void @uitofp_8i64_8f32() #0 {
  ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
-; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
-; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256NODQ-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256NODQ-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256NODQ-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; AVX256NODQ-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_8i64_8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX512-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX512-NEXT:    ret void
-;
-; AVX256DQ-LABEL: @uitofp_8i64_8f32(
-; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
-; AVX256DQ-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256DQ-NEXT:    ret void
+; AVX-LABEL: @uitofp_8i64_8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT:    ret void
  ;
    %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
    %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Fri, 2 Jul 2021 12:07:07 +0000 (13:07 +0100)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Fri, 2 Jul 2021 12:09:00 +0000 (13:09 +0100)
llvm/lib/Target/X86/X86TargetTransformInfo.cpp		patch | blob | history
llvm/test/Analysis/CostModel/X86/uitofp.ll		patch | blob | history
llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll		patch | blob | history
llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll		patch | blob | history