[CostModel][X86] Adjust fp<->int vXi32 SSE legalized costs based on llvm-mca reports.

author Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 1 Jul 2021 13:16:02 +0000 (14:16 +0100)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 1 Jul 2021 14:34:20 +0000 (15:34 +0100)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 1 Jul 2021 13:16:02 +0000 (14:16 +0100)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 1 Jul 2021 14:34:20 +0000 (15:34 +0100)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp

index e400000..a58b150 100644 (file)
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2063,9 +2063,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
      { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i64,  1 }, // PSHUFB
  
-    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
      { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
-    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
      { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
      { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
  
@@ -2084,24 +2082,25 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      // These are somewhat magic numbers justified by looking at the output of
      // Intel's IACA, running some kernels and making sure when we take
      // legalization into account the throughput will be overestimated.
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
-
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  4 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v2i64,  8 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  8 },
+
+    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    8 },
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    9 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v16i8,  4 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v16i8,  4 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v8i16,  4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v8i16,  4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  5 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 15 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v2i64, 18 },
  
      { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f32,  4 },
      { ISD::FP_TO_SINT,  MVT::v2i16,  MVT::v2f32,  2 },
@@ -2109,14 +2108,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::FP_TO_SINT,  MVT::v4i16,  MVT::v4f32,  2 },
      { ISD::FP_TO_SINT,  MVT::v2i16,  MVT::v2f64,  2 },
      { ISD::FP_TO_SINT,  MVT::v2i8,   MVT::v2f64,  4 },
-
      { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  1 },
  
-    { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    6 },
-    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
-
      { ISD::FP_TO_UINT,  MVT::i64,    MVT::f32,    4 },
-    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,    4 },
+    { ISD::FP_TO_UINT,  MVT::i64,    MVT::f64,   15 },
      { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f32,  4 },
      { ISD::FP_TO_UINT,  MVT::v2i8,   MVT::v2f64,  4 },
      { ISD::FP_TO_UINT,  MVT::v4i8,   MVT::v4f32,  3 },
@@ -2138,11 +2133,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
      { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
      { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
      { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
      { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
      { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 10 },
      { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
      { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
      { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
@@ -2250,12 +2245,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
    if (ST->hasSSE41() && !ST->hasAVX())
      if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
                                                     LTDest.second, LTSrc.second))
-      return AdjustCost(LTSrc.first * Entry->Cost);
+      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  
    if (ST->hasSSE2() && !ST->hasAVX())
      if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
                                                     LTDest.second, LTSrc.second))
-      return AdjustCost(LTSrc.first * Entry->Cost);
+      return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll

index 5377c68..7d0a3fd 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -373,23 +373,23 @@ define i32 @masks4(<4 x i1> %in) {
  
  define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
  ; SSE2-LABEL: 'sitofp4'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  ;
  ; SSE41-LABEL: 'sitofp4'
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -440,16 +440,16 @@ define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
  
  define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
  ; SSE2-LABEL: 'sitofp8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  ;
  ; SSE41-LABEL: 'sitofp8'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
  ; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  ;
@@ -484,13 +484,13 @@ define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
  define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
  ; SSE-LABEL: 'uitofp4'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  ;
  ; AVX1-LABEL: 'uitofp4'
@@ -539,9 +539,9 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
  
  define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
  ; SSE-LABEL: 'uitofp8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
  ;
diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll

index 390aeaa..5a9f6bf 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -12,24 +12,24 @@
  
  define i32 @fptoui_double_i64(i32 %arg) {
  ; SSE2-LABEL: 'fptoui_double_i64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %I64 = fptoui double undef to i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SSE42-LABEL: 'fptoui_double_i64'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %I64 = fptoui double undef to i64
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'fptoui_double_i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %I64 = fptoui double undef to i64
+; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX512F-LABEL: 'fptoui_double_i64'
@@ -47,10 +47,10 @@ define i32 @fptoui_double_i64(i32 %arg) {
  ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SLM-LABEL: 'fptoui_double_i64'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %I64 = fptoui double undef to i64
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64>
  ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
    %I64 = fptoui double undef to i64
diff --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll

index de4b2c2..67d3663 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -13,9 +13,9 @@
  define i32 @sitofp_i8_double() {
  ; SSE-LABEL: 'sitofp_i8_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i8_double'
@@ -42,9 +42,9 @@ define i32 @sitofp_i8_double() {
  define i32 @sitofp_i16_double() {
  ; SSE-LABEL: 'sitofp_i16_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i16_double'
@@ -71,35 +71,35 @@ define i32 @sitofp_i16_double() {
  define i32 @sitofp_i32_double() {
  ; SSE2-LABEL: 'sitofp_i32_double'
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SSE42-LABEL: 'sitofp_i32_double'
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX1-LABEL: 'sitofp_i32_double'
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX2-LABEL: 'sitofp_i32_double'
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX512-LABEL: 'sitofp_i32_double'
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -114,21 +114,21 @@ define i32 @sitofp_i32_double() {
  define i32 @sitofp_i64_double() {
  ; SSE-LABEL: 'sitofp_i64_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i64_double'
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
  ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX512F-LABEL: 'sitofp_i64_double'
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = sitofp i64 undef to double
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
  ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -150,10 +150,10 @@ define i32 @sitofp_i64_double() {
  define i32 @sitofp_i8_float() {
  ; SSE-LABEL: 'sitofp_i8_float'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i8_float'
@@ -183,10 +183,10 @@ define i32 @sitofp_i8_float() {
  define i32 @sitofp_i16_float() {
  ; SSE-LABEL: 'sitofp_i16_float'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i16_float'
@@ -216,10 +216,10 @@ define i32 @sitofp_i16_float() {
  define i32 @sitofp_i32_float() {
  ; SSE2-LABEL: 'sitofp_i32_float'
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SSE42-LABEL: 'sitofp_i32_float'
@@ -232,7 +232,7 @@ define i32 @sitofp_i32_float() {
  ;
  ; AVX1-LABEL: 'sitofp_i32_float'
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
  ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
@@ -240,7 +240,7 @@ define i32 @sitofp_i32_float() {
  ;
  ; AVX2-LABEL: 'sitofp_i32_float'
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
  ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
@@ -248,7 +248,7 @@ define i32 @sitofp_i32_float() {
  ;
  ; AVX512-LABEL: 'sitofp_i32_float'
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
  ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
@@ -265,10 +265,10 @@ define i32 @sitofp_i32_float() {
  define i32 @sitofp_i64_float() {
  ; SSE-LABEL: 'sitofp_i64_float'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f32 = sitofp i64 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'sitofp_i64_float'
diff --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll

index deb6bd4..94d5a7c 100644 (file)
--- a/llvm/test/Analysis/CostModel/X86/uitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll
@@ -13,9 +13,9 @@
  define i32 @uitofp_i8_double() {
  ; SSE-LABEL: 'uitofp_i8_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i8_double'
@@ -42,9 +42,9 @@ define i32 @uitofp_i8_double() {
  define i32 @uitofp_i16_double() {
  ; SSE-LABEL: 'uitofp_i16_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i16_double'
@@ -71,9 +71,9 @@ define i32 @uitofp_i16_double() {
  define i32 @uitofp_i32_double() {
  ; SSE-LABEL: 'uitofp_i32_double'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX1-LABEL: 'uitofp_i32_double'
@@ -106,17 +106,17 @@ define i32 @uitofp_i32_double() {
  
  define i32 @uitofp_i64_double() {
  ; SSE2-LABEL: 'uitofp_i64_double'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SSE42-LABEL: 'uitofp_i64_double'
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i64_double'
@@ -150,10 +150,10 @@ define i32 @uitofp_i64_double() {
  define i32 @uitofp_i8_float() {
  ; SSE-LABEL: 'uitofp_i8_float'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i8_float'
@@ -183,10 +183,10 @@ define i32 @uitofp_i8_float() {
  define i32 @uitofp_i16_float() {
  ; SSE-LABEL: 'uitofp_i16_float'
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
  ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i16_float'
@@ -256,19 +256,19 @@ define i32 @uitofp_i32_float() {
  
  define i32 @uitofp_i64_float() {
  ; SSE2-LABEL: 'uitofp_i64_float'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
  ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; SSE42-LABEL: 'uitofp_i64_float'
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f32 = uitofp i64 undef to float
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
  ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
  ; AVX-LABEL: 'uitofp_i64_float'
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll

index 5536030..c5977a7 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,10 +1,10 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
  
  define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
  ; CHECK-LABEL: @sitofp_uitofp(
@@ -161,43 +161,12 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
  }
  
  define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
-; SSE-LABEL: @sitofp_4i32_8i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SSE-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
-; SSE-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
-; SSE-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @sitofp_4i32_8i16(
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R72]]
-;
-; AVX-LABEL: @sitofp_4i32_8i16(
-; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R72]]
-;
-; AVX512-LABEL: @sitofp_4i32_8i16(
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[R72]]
+; CHECK-LABEL: @sitofp_4i32_8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; CHECK-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[R72]]
  ;
    %a0 = extractelement <4 x i32> %a, i32 0
    %a1 = extractelement <4 x i32> %a, i32 1
@@ -228,24 +197,81 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
  
  ; Inspired by PR38154
  define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
-; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; CHECK-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; CHECK-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; CHECK-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:    [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float>
+; SSE-NEXT:    [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float>
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:    ret <8 x float> [[R72]]
+;
+; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[TMP5:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x float>
+; SLM-NEXT:    [[TMP6:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x float>
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x float> [[R7]]
  ;
    %a0 = extractelement <4 x i32> %a, i32 0
    %a1 = extractelement <4 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

index 3aa1079..72940f9 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -161,43 +161,12 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
  }
  
  define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
-; SSE-LABEL: @sitofp_4i32_8i16(
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SSE-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
-; SSE-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
-; SSE-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @sitofp_4i32_8i16(
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R72]]
-;
-; AVX-LABEL: @sitofp_4i32_8i16(
-; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R72]]
-;
-; AVX512-LABEL: @sitofp_4i32_8i16(
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[R72]]
+; CHECK-LABEL: @sitofp_4i32_8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; CHECK-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[R72]]
  ;
    %a0 = extractelement <4 x i32> %a, i32 0
    %a1 = extractelement <4 x i32> %a, i32 1
@@ -228,24 +197,81 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
  
  ; Inspired by PR38154
  define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
-; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; CHECK-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; CHECK-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; CHECK-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SSE-NEXT:    [[TMP9:%.*]] = sitofp <2 x i8> [[TMP8]] to <2 x float>
+; SSE-NEXT:    [[TMP10:%.*]] = uitofp <2 x i8> [[TMP8]] to <2 x float>
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R53:%.*]] = shufflevector <8 x float> [[R31]], <8 x float> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R53]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:    ret <8 x float> [[R72]]
+;
+; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[C:%.*]], <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+; SLM-NEXT:    [[TMP5:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x float>
+; SLM-NEXT:    [[TMP6:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x float>
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX512-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x float> [[R7]]
  ;
    %a0 = extractelement <4 x i32> %a, i32 0
    %a1 = extractelement <4 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll

index 4b4b01b..73710e2 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -1,10 +1,10 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256DQ
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  
@@ -216,20 +216,14 @@ define void @sitofp_8i64_8f64() #0 {
  }
  
  define void @sitofp_2i32_2f64() #0 {
-; SSE-LABEL: @sitofp_2i32_2f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @sitofp_2i32_2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @sitofp_2i32_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
    %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
@@ -922,26 +916,11 @@ define void @sitofp_16i32_16f32() #0 {
  }
  
  define void @sitofp_4i16_4f32() #0 {
-; SSE-LABEL: @sitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @sitofp_4i16_4f32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; AVX-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @sitofp_4i16_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
    %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -960,30 +939,12 @@ define void @sitofp_4i16_4f32() #0 {
  
  define void @sitofp_8i16_8f32() #0 {
  ; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX-LABEL: @sitofp_8i16_8f32(
@@ -1021,54 +982,18 @@ define void @sitofp_8i16_8f32() #0 {
  
  define void @sitofp_16i16_16f32() #0 {
  ; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX256-LABEL: @sitofp_16i16_16f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll

index ef63088..fc90662 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1,10 +1,10 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256DQ
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  
@@ -216,20 +216,14 @@ define void @sitofp_8i64_8f64() #0 {
  }
  
  define void @sitofp_2i32_2f64() #0 {
-; SSE-LABEL: @sitofp_2i32_2f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @sitofp_2i32_2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; AVX-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @sitofp_2i32_2f64(
+; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
    %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
@@ -922,26 +916,11 @@ define void @sitofp_16i32_16f32() #0 {
  }
  
  define void @sitofp_4i16_4f32() #0 {
-; SSE-LABEL: @sitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @sitofp_4i16_4f32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; AVX-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @sitofp_4i16_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
    %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -960,30 +939,12 @@ define void @sitofp_4i16_4f32() #0 {
  
  define void @sitofp_8i16_8f32() #0 {
  ; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX-LABEL: @sitofp_8i16_8f32(
@@ -1021,54 +982,18 @@ define void @sitofp_8i16_8f32() #0 {
  
  define void @sitofp_16i16_16f32() #0 {
  ; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX256-LABEL: @sitofp_16i16_16f32(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll

index dfc7d64..e3b8beb 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -576,18 +576,9 @@ define void @uitofp_2i64_2f32() #0 {
  
  define void @uitofp_4i64_4f32() #0 {
  ; SSE-LABEL: @uitofp_4i64_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
  ; SSE-NEXT:    ret void
  ;
  ; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
@@ -634,30 +625,12 @@ define void @uitofp_4i64_4f32() #0 {
  
  define void @uitofp_8i64_8f32() #0 {
  ; SSE-LABEL: @uitofp_8i64_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
@@ -874,26 +847,11 @@ define void @uitofp_16i32_16f32() #0 {
  }
  
  define void @uitofp_4i16_4f32() #0 {
-; SSE-LABEL: @uitofp_4i16_4f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @uitofp_4i16_4f32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; AVX-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @uitofp_4i16_4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT:    ret void
  ;
    %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
    %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
@@ -912,30 +870,12 @@ define void @uitofp_4i16_4f32() #0 {
  
  define void @uitofp_8i16_8f32() #0 {
  ; SSE-LABEL: @uitofp_8i16_8f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX-LABEL: @uitofp_8i16_8f32(
@@ -973,54 +913,18 @@ define void @uitofp_8i16_8f32() #0 {
  
  define void @uitofp_16i16_16f32() #0 {
  ; SSE-LABEL: @uitofp_16i16_16f32(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
-; SSE-NEXT:    [[CVT8:%.*]] = uitofp i16 [[LD8]] to float
-; SSE-NEXT:    [[CVT9:%.*]] = uitofp i16 [[LD9]] to float
-; SSE-NEXT:    [[CVT10:%.*]] = uitofp i16 [[LD10]] to float
-; SSE-NEXT:    [[CVT11:%.*]] = uitofp i16 [[LD11]] to float
-; SSE-NEXT:    [[CVT12:%.*]] = uitofp i16 [[LD12]] to float
-; SSE-NEXT:    [[CVT13:%.*]] = uitofp i16 [[LD13]] to float
-; SSE-NEXT:    [[CVT14:%.*]] = uitofp i16 [[LD14]] to float
-; SSE-NEXT:    [[CVT15:%.*]] = uitofp i16 [[LD15]] to float
-; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
-; SSE-NEXT:    store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
-; SSE-NEXT:    store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
-; SSE-NEXT:    store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
-; SSE-NEXT:    store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
-; SSE-NEXT:    store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
-; SSE-NEXT:    store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
-; SSE-NEXT:    store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
-; SSE-NEXT:    store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
-; SSE-NEXT:    store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
-; SSE-NEXT:    store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
-; SSE-NEXT:    store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
  ; SSE-NEXT:    ret void
  ;
  ; AVX256-LABEL: @uitofp_16i16_16f32(
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 1 Jul 2021 13:16:02 +0000 (14:16 +0100)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 1 Jul 2021 14:34:20 +0000 (15:34 +0100)
llvm/lib/Target/X86/X86TargetTransformInfo.cpp		patch \| blob \| history
llvm/test/Analysis/CostModel/X86/cast.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/X86/fptoui.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/X86/sitofp.ll		patch \| blob \| history
llvm/test/Analysis/CostModel/X86/uitofp.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll		patch \| blob \| history