From 242203d254c5fc27520348f4f5d49a4c39f7db0f Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Mon, 29 Aug 2022 22:54:41 -0700
Subject: [PATCH] [AArch64][TTI] Add cost table entry for trunc over vector of
 integers.

1) Tablegen patterns exist to use 'xtn' and 'uzp1' for trunc [1]. Cost table entries are updated based on the actual number of {xtn, uzp1} instructions generated.
2) Without this, an IR instruction like trunc <8 x i16> %v to <8 x i8> is considered free and might be sinked to other basic blocks. As a result, the sinked 'trunc' is in a different basic block with its (usually not-free) vector operand and misses the chance to be combined during instruction selection. (examples in [2])
3) It's a lot of effort to teach CodeGenPrepare.cpp to sink the operand of trunc without introducing regressions, since the instruction to compute the operand of trunc could be faster (e.g., throughput) than the instruction corresponding to "trunc (bin-vector-op". For instance in [3], sinking %1 (as trunc operand) into bb.1 and bb.2 means to replace 2 xtn with 2 shrn (shrn has a throughput of 1 and only utilize v1 pipeline), which is not necessarily good, especially since ushr result needs to be preserved for store operation in bb.0. Meanwhile, it's too optimistic (for CodeGenPrepare pass) to assume machine-cse will always be able to de-dup shrn from various basic blocks into one shrn.

[1] For {v8i16->v8i8, v4i32->v4i16, v2i64->v2i32}, https://github.com/llvm/llvm-project/blob/813ae2871d71f32cce46768e63185cd64651f6e9/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L4472.
    For concat (trunc, trunc) -> uzip1, https://github.com/llvm/llvm-project/blob/813ae2871d71f32cce46768e63185cd64651f6e9/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L5428-L5437
[2] examples
    - trunc(umin(X, 255)) -> UQXTRN v8i8 (and other {u,s}x{min,max} pattern for v8i16 operands) from https://github.com/llvm/llvm-project/blob/813ae2871d71f32cce46768e63185cd64651f6e9/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L4515-L4528
    - trunc (AArch64vlshr v8i16, imm) -> SHRNv8i8 (same missed for SHRNv2i32) from https://github.com/llvm/llvm-project/blob/813ae2871d71f32cce46768e63185cd64651f6e9/llvm/lib/Target/AArch64/AArch64InstrInfo.td#L6743-L6748
[3]
    ---
    ; instruction latency / throughput / pipeline on `neoverse-n1`
    bb.0:
      %1 = lshr <8 x i16> %10, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>   ; ushr, latency 2, throughput 1, pipeline V1
      %2 = trunc <8 x i16> %1 to <8 x i8>  ; xtn, latency 2, throughput 2, pipeline V
      %3 = store <8 x i8> %1, ptr %addr
      br cond i1 cond, label bb.1, label bb.2

    bb.1:
      %4 = trunc <8 x i16> %1 to <8 x i8> ; xtn

    bb.2:
      %5 = trunc <8 x i16> %1 to <8 x i8> ; xtn
    ---

Differential Revision: https://reviews.llvm.org/D132784
---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 24 ++++++++++++---
 .../Analysis/CostModel/AArch64/arith-overflow.ll   | 36 +++++++++++-----------
 llvm/test/Analysis/CostModel/AArch64/cast.ll       | 34 ++++++++++----------
 3 files changed, 55 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3576cef..ce3c34b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1602,10 +1602,26 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 
   static const TypeConversionCostTblEntry
   ConversionTbl[] = {
-    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
-    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
-    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE, MVT::v2i8,   MVT::v2i64,  1},  // xtn
+    { ISD::TRUNCATE, MVT::v2i16,  MVT::v2i64,  1},  // xtn
+    { ISD::TRUNCATE, MVT::v2i32,  MVT::v2i64,  1},  // xtn
+    { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i32,  1},  // xtn
+    { ISD::TRUNCATE, MVT::v4i8,   MVT::v4i64,  3},  // 2 xtn + 1 uzp1
+    { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i32,  1},  // xtn
+    { ISD::TRUNCATE, MVT::v4i16,  MVT::v4i64,  2},  // 1 uzp1 + 1 xtn
+    { ISD::TRUNCATE, MVT::v4i32,  MVT::v4i64,  1},  // 1 uzp1
+    { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i16,  1},  // 1 xtn
+    { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i32,  2},  // 1 uzp1 + 1 xtn
+    { ISD::TRUNCATE, MVT::v8i8,   MVT::v8i64,  4},  // 3 x uzp1 + xtn
+    { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i32,  1},  // 1 uzp1
+    { ISD::TRUNCATE, MVT::v8i16,  MVT::v8i64,  3},  // 3 x uzp1
+    { ISD::TRUNCATE, MVT::v8i32,  MVT::v8i64,  2},  // 2 x uzp1
+    { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i16, 1},  // uzp1
+    { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i32, 3},  // (2 + 1) x uzp1
+    { ISD::TRUNCATE, MVT::v16i8,  MVT::v16i64, 7},  // (4 + 2 + 1) x uzp1
+    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2},  // 2 x uzp1
+    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6},  // (4 + 2) x uzp1
+    { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4},  // 4 x uzp1
 
     // Truncations on nxvmiN
     { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
index e46fbe9..f88fe27 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll
@@ -359,13 +359,13 @@ define i32 @smul(i32 %arg) {
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -378,9 +378,9 @@ define i32 @smul(i32 %arg) {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -441,13 +441,13 @@ define i32 @umul(i32 %arg) {
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -460,9 +460,9 @@ define i32 @umul(i32 %arg) {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 691e445..b2c8acf9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -167,26 +167,26 @@ define void @trunc() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %r8 = trunc i8 undef to i1
-- 
2.7.4