[ARM] Add MVE vecreduce costmodel tests. NFC

author David Green <david.green@arm.com>

Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)

committer David Green <david.green@arm.com>

Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)
author David Green <david.green@arm.com>
Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)
committer David Green <david.green@arm.com>
Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll

new file mode 100644 (file)

index 0000000..90de689
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
@@ -0,0 +1,1079 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -cost-model -analyze | FileCheck %s
+
+; Records the cost-model estimate for a plain llvm.vector.reduce.add over
+; <1|2|4|8|16 x i8> undef operands. The call results are never used; only the
+; per-instruction cost lines matched by the CHECK-NEXT assertions matter.
+define void @add_i8() {
+; CHECK-LABEL: 'add_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+
+  %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+
+  %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+
+  %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+
+  %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+
+  ret void
+}
+
+; Records cost-model estimates for llvm.vector.reduce.add with an i16 result.
+; %a0..%a4 cover 1, 2, 4, 8 and 16 lanes with the operand widened from an undef
+; i8 vector, once via zext (%aNza -> %aNz) and once via sext (%aNsa -> %aNs).
+; %a5..%a9 repeat the same lane counts with a plain undef i16 operand.
+define void @add_i16() {
+; CHECK-LABEL: 'add_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0za = zext <1 x i8> undef to <1 x i16>
+  %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
+
+  %a0sa = sext <1 x i8> undef to <1 x i16>
+  %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
+
+  %a1za = zext <2 x i8> undef to <2 x i16>
+  %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
+
+  %a1sa = sext <2 x i8> undef to <2 x i16>
+  %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
+
+  %a2za = zext <4 x i8> undef to <4 x i16>
+  %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
+
+  %a2sa = sext <4 x i8> undef to <4 x i16>
+  %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sa)
+
+  %a3za = zext <8 x i8> undef to <8 x i16>
+  %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3za)
+
+  %a3sa = sext <8 x i8> undef to <8 x i16>
+  %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sa)
+
+  %a4za = zext <16 x i8> undef to <16 x i16>
+  %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
+
+  %a4sa = sext <16 x i8> undef to <16 x i16>
+  %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
+
+  %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+
+  %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+
+  %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
+
+  %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+
+  %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+
+  ret void
+}
+
+; Records cost-model estimates for llvm.vector.reduce.add with an i32 result,
+; at 1, 2, 4, 8 and 16 lanes per group:
+;   %a0..%a4   operand zext/sext'ed from an undef i8 vector
+;   %a5..%a9   operand zext/sext'ed from an undef i16 vector
+;   %a10..%a14 plain undef i32 operand
+; The extend instructions are costed separately from the reduction call.
+define void @add_i32() {
+; CHECK-LABEL: 'add_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0za = zext <1 x i8> undef to <1 x i32>
+  %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
+
+  %a0sa = sext <1 x i8> undef to <1 x i32>
+  %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa)
+
+  %a1za = zext <2 x i8> undef to <2 x i32>
+  %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za)
+
+  %a1sa = sext <2 x i8> undef to <2 x i32>
+  %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa)
+
+  %a2za = zext <4 x i8> undef to <4 x i32>
+  %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za)
+
+  %a2sa = sext <4 x i8> undef to <4 x i32>
+  %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sa)
+
+  %a3za = zext <8 x i8> undef to <8 x i32>
+  %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3za)
+
+  %a3sa = sext <8 x i8> undef to <8 x i32>
+  %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sa)
+
+  %a4za = zext <16 x i8> undef to <16 x i32>
+  %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4za)
+
+  %a4sa = sext <16 x i8> undef to <16 x i32>
+  %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa)
+
+  %a5za = zext <1 x i16> undef to <1 x i32>
+  %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za)
+
+  %a5sa = sext <1 x i16> undef to <1 x i32>
+  %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa)
+
+  %a6za = zext <2 x i16> undef to <2 x i32>
+  %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za)
+
+  %a6sa = sext <2 x i16> undef to <2 x i32>
+  %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa)
+
+  %a7za = zext <4 x i16> undef to <4 x i32>
+  %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za)
+
+  %a7sa = sext <4 x i16> undef to <4 x i32>
+  %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sa)
+
+  %a8za = zext <8 x i16> undef to <8 x i32>
+  %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8za)
+
+  %a8sa = sext <8 x i16> undef to <8 x i32>
+  %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sa)
+
+  %a9za = zext <16 x i16> undef to <16 x i32>
+  %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za)
+
+  %a9sa = sext <16 x i16> undef to <16 x i32>
+  %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa)
+
+  %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
+
+  %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+
+  %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+
+  %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+
+  %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+
+  ret void
+}
+
+; Records cost-model estimates for llvm.vector.reduce.add with an i64 result,
+; at 1, 2, 4, 8 and 16 lanes per group:
+;   %a0..%a4   operand zext/sext'ed from an undef i8 vector
+;   %a5..%a9   operand zext/sext'ed from an undef i16 vector
+;   %a10..%a14 operand zext/sext'ed from an undef i32 vector
+;   %a15..%a19 plain undef i64 operand
+; The extend instructions are costed separately from the reduction call.
+define void @add_i64() {
+; CHECK-LABEL: 'add_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0za = zext <1 x i8> undef to <1 x i64>
+  %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za)
+
+  %a0sa = sext <1 x i8> undef to <1 x i64>
+  %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa)
+
+  %a1za = zext <2 x i8> undef to <2 x i64>
+  %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za)
+
+  %a1sa = sext <2 x i8> undef to <2 x i64>
+  %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa)
+
+  %a2za = zext <4 x i8> undef to <4 x i64>
+  %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za)
+
+  %a2sa = sext <4 x i8> undef to <4 x i64>
+  %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa)
+
+  %a3za = zext <8 x i8> undef to <8 x i64>
+  %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za)
+
+  %a3sa = sext <8 x i8> undef to <8 x i64>
+  %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa)
+
+  %a4za = zext <16 x i8> undef to <16 x i64>
+  %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za)
+
+  %a4sa = sext <16 x i8> undef to <16 x i64>
+  %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa)
+
+  %a5za = zext <1 x i16> undef to <1 x i64>
+  %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za)
+
+  %a5sa = sext <1 x i16> undef to <1 x i64>
+  %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa)
+
+  %a6za = zext <2 x i16> undef to <2 x i64>
+  %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za)
+
+  %a6sa = sext <2 x i16> undef to <2 x i64>
+  %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa)
+
+  %a7za = zext <4 x i16> undef to <4 x i64>
+  %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za)
+
+  %a7sa = sext <4 x i16> undef to <4 x i64>
+  %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa)
+
+  %a8za = zext <8 x i16> undef to <8 x i64>
+  %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za)
+
+  %a8sa = sext <8 x i16> undef to <8 x i64>
+  %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa)
+
+  %a9za = zext <16 x i16> undef to <16 x i64>
+  %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za)
+
+  %a9sa = sext <16 x i16> undef to <16 x i64>
+  %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa)
+
+  %a10za = zext <1 x i32> undef to <1 x i64>
+  %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za)
+
+  %a10sa = sext <1 x i32> undef to <1 x i64>
+  %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa)
+
+  %a11za = zext <2 x i32> undef to <2 x i64>
+  %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za)
+
+  %a11sa = sext <2 x i32> undef to <2 x i64>
+  %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa)
+
+  %a12za = zext <4 x i32> undef to <4 x i64>
+  %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za)
+
+  %a12sa = sext <4 x i32> undef to <4 x i64>
+  %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa)
+
+  %a13za = zext <8 x i32> undef to <8 x i64>
+  %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za)
+
+  %a13sa = sext <8 x i32> undef to <8 x i64>
+  %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa)
+
+  %a14za = zext <16 x i32> undef to <16 x i64>
+  %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za)
+
+  %a14sa = sext <16 x i32> undef to <16 x i64>
+  %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa)
+
+  %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+
+  %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+
+  %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+
+  %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+
+  %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+
+  ret void
+}
+
+; Records cost-model estimates for a multiply feeding an add reduction at i8:
+; for 1, 2, 4, 8 and 16 lanes, %aNm is a `mul` of two undef vectors and %aN is
+; the llvm.vector.reduce.add of that product. The mul and the reduction are
+; costed as separate instructions in the expected output.
+define void @mla_i8() {
+; CHECK-LABEL: 'mla_i8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0m = mul <1 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a1m = mul <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a4m = mul <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0m = mul <1 x i8> undef, undef
+  %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m)
+
+  %a1m = mul <2 x i8> undef, undef
+  %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m)
+
+  %a2m = mul <4 x i8> undef, undef
+  %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m)
+
+  %a3m = mul <8 x i8> undef, undef
+  %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a3m)
+
+  %a4m = mul <16 x i8> undef, undef
+  %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a4m)
+
+  ret void
+}
+
+define void @mla_i16() {
+; CHECK-LABEL: 'mla_i16'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i16> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i16> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sb = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i16> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3za = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zb = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3zm = mul <8 x i16> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sa = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sb = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a3sm = mul <8 x i16> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4za = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4zb = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4zm = mul <16 x i16> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a4sb = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a6m = mul <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a8m = mul <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a9m = mul <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1168 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0za = zext <1 x i8> undef to <1 x i16>
+  %a0zb = zext <1 x i8> undef to <1 x i16>
+  %a0zm = mul <1 x i16> %a0za, %a0zb
+  %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm)
+
+  %a0sa = sext <1 x i8> undef to <1 x i16>
+  %a0sb = sext <1 x i8> undef to <1 x i16>
+  %a0sm = mul <1 x i16> %a0sa, %a0sb
+  %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm)
+
+  %a1za = zext <2 x i8> undef to <2 x i16>
+  %a1zb = zext <2 x i8> undef to <2 x i16>
+  %a1zm = mul <2 x i16> %a1za, %a1zb
+  %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm)
+
+  %a1sa = sext <2 x i8> undef to <2 x i16>
+  %a1sb = sext <2 x i8> undef to <2 x i16>
+  %a1sm = mul <2 x i16> %a1sa, %a1sb
+  %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm)
+
+  %a2za = zext <4 x i8> undef to <4 x i16>
+  %a2zb = zext <4 x i8> undef to <4 x i16>
+  %a2zm = mul <4 x i16> %a2za, %a2zb
+  %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2zm)
+
+  %a2sa = sext <4 x i8> undef to <4 x i16>
+  %a2sb = sext <4 x i8> undef to <4 x i16>
+  %a2sm = mul <4 x i16> %a2sa, %a2sb
+  %a2s = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2sm)
+
+  %a3za = zext <8 x i8> undef to <8 x i16>
+  %a3zb = zext <8 x i8> undef to <8 x i16>
+  %a3zm = mul <8 x i16> %a3za, %a3zb
+  %a3z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3zm)
+
+  %a3sa = sext <8 x i8> undef to <8 x i16>
+  %a3sb = sext <8 x i8> undef to <8 x i16>
+  %a3sm = mul <8 x i16> %a3sa, %a3sb
+  %a3s = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a3sm)
+
+  %a4za = zext <16 x i8> undef to <16 x i16>
+  %a4zb = zext <16 x i8> undef to <16 x i16>
+  %a4zm = mul <16 x i16> %a4za, %a4zb
+  %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4zm)
+
+  %a4sa = sext <16 x i8> undef to <16 x i16>
+  %a4sb = sext <16 x i8> undef to <16 x i16>
+  %a4sm = mul <16 x i16> %a4sa, %a4sb
+  %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm)
+
+  %a5m = mul <1 x i16> undef, undef
+  %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m)
+
+  %a6m = mul <2 x i16> undef, undef
+  %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m)
+
+  %a7m = mul <4 x i16> undef, undef
+  %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m)
+
+  %a8m = mul <8 x i16> undef, undef
+  %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a8m)
+
+  %a9m = mul <16 x i16> undef, undef
+  %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a9m)
+
+  ret void
+}
+
+; mla_i32: for vector widths 1, 2, 4, 8 and 16, multiplies two vectors and
+; feeds the product to llvm.vector.reduce.add with an i32 result. The operands
+; are zero- or sign-extended from i8 (a0-a4) or i16 (a5-a9), or are plain i32
+; (a10-a14); every input is undef. The assertion lines below pin the cost-model
+; estimate printed for each instruction, in program order.
+; NOTE(review): "mla" presumably stands for multiply-accumulate -- confirm.
+define void @mla_i32() {
+; CHECK-LABEL: 'mla_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i32> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i32> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a2sb = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a2sm = mul <4 x i32> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3za = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3zb = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3zm = mul <8 x i32> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sa = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a3sb = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a3sm = mul <8 x i32> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4za = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4zb = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4zm = mul <16 x i32> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a4sb = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a4sm = mul <16 x i32> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sb = sext <1 x i16> undef to <1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a5sm = mul <1 x i32> %a5sa, %a5sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a6zb = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6sb = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sa = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sb = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a7sm = mul <4 x i32> %a7sa, %a7sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8za = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8zb = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8zm = mul <8 x i32> %a8za, %a8zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sa = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %a8sb = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a8sm = mul <8 x i32> %a8sa, %a8sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9za = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9zb = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9zm = mul <16 x i32> %a9za, %a9zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %a9sb = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a11m = mul <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 234 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a14m = mul <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 622 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  ; i8 operands widened to i32: a zext group then a sext group for each of
+  ; the widths 1, 2, 4, 8 and 16 (a0 through a4).
+  %a0za = zext <1 x i8> undef to <1 x i32>
+  %a0zb = zext <1 x i8> undef to <1 x i32>
+  %a0zm = mul <1 x i32> %a0za, %a0zb
+  %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm)
+
+  %a0sa = sext <1 x i8> undef to <1 x i32>
+  %a0sb = sext <1 x i8> undef to <1 x i32>
+  %a0sm = mul <1 x i32> %a0sa, %a0sb
+  %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm)
+
+  %a1za = zext <2 x i8> undef to <2 x i32>
+  %a1zb = zext <2 x i8> undef to <2 x i32>
+  %a1zm = mul <2 x i32> %a1za, %a1zb
+  %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm)
+
+  %a1sa = sext <2 x i8> undef to <2 x i32>
+  %a1sb = sext <2 x i8> undef to <2 x i32>
+  %a1sm = mul <2 x i32> %a1sa, %a1sb
+  %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm)
+
+  %a2za = zext <4 x i8> undef to <4 x i32>
+  %a2zb = zext <4 x i8> undef to <4 x i32>
+  %a2zm = mul <4 x i32> %a2za, %a2zb
+  %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2zm)
+
+  %a2sa = sext <4 x i8> undef to <4 x i32>
+  %a2sb = sext <4 x i8> undef to <4 x i32>
+  %a2sm = mul <4 x i32> %a2sa, %a2sb
+  %a2s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2sm)
+
+  %a3za = zext <8 x i8> undef to <8 x i32>
+  %a3zb = zext <8 x i8> undef to <8 x i32>
+  %a3zm = mul <8 x i32> %a3za, %a3zb
+  %a3z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3zm)
+
+  %a3sa = sext <8 x i8> undef to <8 x i32>
+  %a3sb = sext <8 x i8> undef to <8 x i32>
+  %a3sm = mul <8 x i32> %a3sa, %a3sb
+  %a3s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a3sm)
+
+  %a4za = zext <16 x i8> undef to <16 x i32>
+  %a4zb = zext <16 x i8> undef to <16 x i32>
+  %a4zm = mul <16 x i32> %a4za, %a4zb
+  %a4z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4zm)
+
+  %a4sa = sext <16 x i8> undef to <16 x i32>
+  %a4sb = sext <16 x i8> undef to <16 x i32>
+  %a4sm = mul <16 x i32> %a4sa, %a4sb
+  %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sm)
+
+  ; i16 operands widened to i32: the same zext/sext pairs over the same five
+  ; widths (a5 through a9).
+  %a5za = zext <1 x i16> undef to <1 x i32>
+  %a5zb = zext <1 x i16> undef to <1 x i32>
+  %a5zm = mul <1 x i32> %a5za, %a5zb
+  %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm)
+
+  %a5sa = sext <1 x i16> undef to <1 x i32>
+  %a5sb = sext <1 x i16> undef to <1 x i32>
+  %a5sm = mul <1 x i32> %a5sa, %a5sb
+  %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm)
+
+  %a6za = zext <2 x i16> undef to <2 x i32>
+  %a6zb = zext <2 x i16> undef to <2 x i32>
+  %a6zm = mul <2 x i32> %a6za, %a6zb
+  %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm)
+
+  %a6sa = sext <2 x i16> undef to <2 x i32>
+  %a6sb = sext <2 x i16> undef to <2 x i32>
+  %a6sm = mul <2 x i32> %a6sa, %a6sb
+  %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm)
+
+  %a7za = zext <4 x i16> undef to <4 x i32>
+  %a7zb = zext <4 x i16> undef to <4 x i32>
+  %a7zm = mul <4 x i32> %a7za, %a7zb
+  %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7zm)
+
+  %a7sa = sext <4 x i16> undef to <4 x i32>
+  %a7sb = sext <4 x i16> undef to <4 x i32>
+  %a7sm = mul <4 x i32> %a7sa, %a7sb
+  %a7s = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7sm)
+
+  %a8za = zext <8 x i16> undef to <8 x i32>
+  %a8zb = zext <8 x i16> undef to <8 x i32>
+  %a8zm = mul <8 x i32> %a8za, %a8zb
+  %a8z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8zm)
+
+  %a8sa = sext <8 x i16> undef to <8 x i32>
+  %a8sb = sext <8 x i16> undef to <8 x i32>
+  %a8sm = mul <8 x i32> %a8sa, %a8sb
+  %a8s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a8sm)
+
+  %a9za = zext <16 x i16> undef to <16 x i32>
+  %a9zb = zext <16 x i16> undef to <16 x i32>
+  %a9zm = mul <16 x i32> %a9za, %a9zb
+  %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9zm)
+
+  %a9sa = sext <16 x i16> undef to <16 x i32>
+  %a9sb = sext <16 x i16> undef to <16 x i32>
+  %a9sm = mul <16 x i32> %a9sa, %a9sb
+  %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm)
+
+  ; i32 operands multiplied directly, with no extension step (a10 through a14).
+  %a10m = mul <1 x i32> undef, undef
+  %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m)
+
+  %a11m = mul <2 x i32> undef, undef
+  %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m)
+
+  %a12m = mul <4 x i32> undef, undef
+  %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m)
+
+  %a13m = mul <8 x i32> undef, undef
+  %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a13m)
+
+  %a14m = mul <16 x i32> undef, undef
+  %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a14m)
+
+  ret void
+}
+
+define void @mla_i64() {
+; CHECK-LABEL: 'mla_i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0za = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0zm = mul <1 x i64> %a0za, %a0zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a0sm = mul <1 x i64> %a0sa, %a0sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a1zb = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a1sb = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a2zb = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a2sb = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a3zb = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3zm = mul <8 x i64> %a3za, %a3zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a3sb = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a3sm = mul <8 x i64> %a3sa, %a3sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %a4zb = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %a4sb = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a5za = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5zm = mul <1 x i64> %a5za, %a5zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a5sm = mul <1 x i64> %a5sa, %a5sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a6zb = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %a6sb = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a7zb = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %a7sb = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %a8zb = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8zm = mul <8 x i64> %a8za, %a8zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %a8sb = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a8sm = mul <8 x i64> %a8sa, %a8sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 296 for instruction: %a9zb = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %a9sb = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10za = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10zm = mul <1 x i64> %a10za, %a10zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a10sm = mul <1 x i64> %a10sa, %a10sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a11zb = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sb = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %a12zb = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12sb = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %a13zb = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13zm = mul <8 x i64> %a13za, %a13zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %a13sb = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 208 for instruction: %a13sm = mul <8 x i64> %a13sa, %a13sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a14zb = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1056 for instruction: %a14sb = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 800 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %a15m = mul <1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %a16m = mul <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %a17m = mul <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %a18m = mul <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %a19m = mul <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %a0za = zext <1 x i8> undef to <1 x i64>
+  %a0zb = zext <1 x i8> undef to <1 x i64>
+  %a0zm = mul <1 x i64> %a0za, %a0zb
+  %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm)
+
+  %a0sa = sext <1 x i8> undef to <1 x i64>
+  %a0sb = sext <1 x i8> undef to <1 x i64>
+  %a0sm = mul <1 x i64> %a0sa, %a0sb
+  %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm)
+
+  %a1za = zext <2 x i8> undef to <2 x i64>
+  %a1zb = zext <2 x i8> undef to <2 x i64>
+  %a1zm = mul <2 x i64> %a1za, %a1zb
+  %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm)
+
+  %a1sa = sext <2 x i8> undef to <2 x i64>
+  %a1sb = sext <2 x i8> undef to <2 x i64>
+  %a1sm = mul <2 x i64> %a1sa, %a1sb
+  %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm)
+
+  %a2za = zext <4 x i8> undef to <4 x i64>
+  %a2zb = zext <4 x i8> undef to <4 x i64>
+  %a2zm = mul <4 x i64> %a2za, %a2zb
+  %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm)
+
+  %a2sa = sext <4 x i8> undef to <4 x i64>
+  %a2sb = sext <4 x i8> undef to <4 x i64>
+  %a2sm = mul <4 x i64> %a2sa, %a2sb
+  %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm)
+
+  %a3za = zext <8 x i8> undef to <8 x i64>
+  %a3zb = zext <8 x i8> undef to <8 x i64>
+  %a3zm = mul <8 x i64> %a3za, %a3zb
+  %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm)
+
+  %a3sa = sext <8 x i8> undef to <8 x i64>
+  %a3sb = sext <8 x i8> undef to <8 x i64>
+  %a3sm = mul <8 x i64> %a3sa, %a3sb
+  %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm)
+
+  %a4za = zext <16 x i8> undef to <16 x i64>
+  %a4zb = zext <16 x i8> undef to <16 x i64>
+  %a4zm = mul <16 x i64> %a4za, %a4zb
+  %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm)
+
+  %a4sa = sext <16 x i8> undef to <16 x i64>
+  %a4sb = sext <16 x i8> undef to <16 x i64>
+  %a4sm = mul <16 x i64> %a4sa, %a4sb
+  %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm)
+
+  %a5za = zext <1 x i16> undef to <1 x i64>
+  %a5zb = zext <1 x i16> undef to <1 x i64>
+  %a5zm = mul <1 x i64> %a5za, %a5zb
+  %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm)
+
+  %a5sa = sext <1 x i16> undef to <1 x i64>
+  %a5sb = sext <1 x i16> undef to <1 x i64>
+  %a5sm = mul <1 x i64> %a5sa, %a5sb
+  %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm)
+
+  %a6za = zext <2 x i16> undef to <2 x i64>
+  %a6zb = zext <2 x i16> undef to <2 x i64>
+  %a6zm = mul <2 x i64> %a6za, %a6zb
+  %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm)
+
+  %a6sa = sext <2 x i16> undef to <2 x i64>
+  %a6sb = sext <2 x i16> undef to <2 x i64>
+  %a6sm = mul <2 x i64> %a6sa, %a6sb
+  %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm)
+
+  %a7za = zext <4 x i16> undef to <4 x i64>
+  %a7zb = zext <4 x i16> undef to <4 x i64>
+  %a7zm = mul <4 x i64> %a7za, %a7zb
+  %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm)
+
+  %a7sa = sext <4 x i16> undef to <4 x i64>
+  %a7sb = sext <4 x i16> undef to <4 x i64>
+  %a7sm = mul <4 x i64> %a7sa, %a7sb
+  %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm)
+
+  %a8za = zext <8 x i16> undef to <8 x i64>
+  %a8zb = zext <8 x i16> undef to <8 x i64>
+  %a8zm = mul <8 x i64> %a8za, %a8zb
+  %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm)
+
+  %a8sa = sext <8 x i16> undef to <8 x i64>
+  %a8sb = sext <8 x i16> undef to <8 x i64>
+  %a8sm = mul <8 x i64> %a8sa, %a8sb
+  %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm)
+
+  %a9za = zext <16 x i16> undef to <16 x i64>
+  %a9zb = zext <16 x i16> undef to <16 x i64>
+  %a9zm = mul <16 x i64> %a9za, %a9zb
+  %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm)
+
+  %a9sa = sext <16 x i16> undef to <16 x i64>
+  %a9sb = sext <16 x i16> undef to <16 x i64>
+  %a9sm = mul <16 x i64> %a9sa, %a9sb
+  %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm)
+
+  %a10za = zext <1 x i32> undef to <1 x i64>
+  %a10zb = zext <1 x i32> undef to <1 x i64>
+  %a10zm = mul <1 x i64> %a10za, %a10zb
+  %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm)
+
+  %a10sa = sext <1 x i32> undef to <1 x i64>
+  %a10sb = sext <1 x i32> undef to <1 x i64>
+  %a10sm = mul <1 x i64> %a10sa, %a10sb
+  %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm)
+
+  %a11za = zext <2 x i32> undef to <2 x i64>
+  %a11zb = zext <2 x i32> undef to <2 x i64>
+  %a11zm = mul <2 x i64> %a11za, %a11zb
+  %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm)
+
+  %a11sa = sext <2 x i32> undef to <2 x i64>
+  %a11sb = sext <2 x i32> undef to <2 x i64>
+  %a11sm = mul <2 x i64> %a11sa, %a11sb
+  %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm)
+
+  %a12za = zext <4 x i32> undef to <4 x i64>
+  %a12zb = zext <4 x i32> undef to <4 x i64>
+  %a12zm = mul <4 x i64> %a12za, %a12zb
+  %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm)
+
+  %a12sa = sext <4 x i32> undef to <4 x i64>
+  %a12sb = sext <4 x i32> undef to <4 x i64>
+  %a12sm = mul <4 x i64> %a12sa, %a12sb
+  %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm)
+
+  %a13za = zext <8 x i32> undef to <8 x i64>
+  %a13zb = zext <8 x i32> undef to <8 x i64>
+  %a13zm = mul <8 x i64> %a13za, %a13zb
+  %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm)
+
+  %a13sa = sext <8 x i32> undef to <8 x i64>
+  %a13sb = sext <8 x i32> undef to <8 x i64>
+  %a13sm = mul <8 x i64> %a13sa, %a13sb
+  %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm)
+
+  %a14za = zext <16 x i32> undef to <16 x i64>
+  %a14zb = zext <16 x i32> undef to <16 x i64>
+  %a14zm = mul <16 x i64> %a14za, %a14zb
+  %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
+
+  %a14sa = sext <16 x i32> undef to <16 x i64>
+  %a14sb = sext <16 x i32> undef to <16 x i64>
+  %a14sm = mul <16 x i64> %a14sa, %a14sb
+  %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
+
+  %a15m = mul <1 x i64> undef, undef
+  %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
+
+  %a16m = mul <2 x i64> undef, undef
+  %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
+
+  %a17m = mul <4 x i64> undef, undef
+  %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
+
+  %a18m = mul <8 x i64> undef, undef
+  %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
+
+  %a19m = mul <16 x i64> undef, undef
+  %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
+
+  ret void
+}
+
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) ; Declarations of the llvm.vector.reduce.add intrinsic, one per vector type (1/2/4/8/16 lanes of i8/i16/i32/i64).
+declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>) ; Lines are in lexicographic order, so v16 sorts ahead of v1 and the i8 group comes last.
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) ; i32 element group.
+declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) ; i64 element group; these are the reductions called by the mul-then-reduce function directly above.
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) ; i8 element group.
+declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll

index d71af6d..2564c1e 100644 (file)
--- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
@@ -1,10 +1,8 @@
  ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
  ; RUN: opt -cost-model -analyze -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-RECIP
  ; RUN: opt -cost-model -analyze -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-RECIP
-; RUN: opt -cost-model -analyze -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-RECIP
  ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=thumbv8m.main < %s | FileCheck %s --check-prefix=V8M-SIZE
  ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8a-linux-gnueabihf < %s | FileCheck %s --check-prefix=NEON-SIZE
-; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=armv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=MVE-SIZE
  
  define i32 @reduce_i64(i32 %arg) {
  ; V8M-RECIP-LABEL: 'reduce_i64'
@@ -23,14 +21,6 @@ define i32 @reduce_i64(i32 %arg) {
  ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
  ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
-; MVE-RECIP-LABEL: 'reduce_i64'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
  ; V8M-SIZE-LABEL: 'reduce_i64'
  ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
  ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
@@ -47,14 +37,6 @@ define i32 @reduce_i64(i32 %arg) {
  ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
  ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
  ;
-; MVE-SIZE-LABEL: 'reduce_i64'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
    %V1  = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
    %V2  = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
    %V4  = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
@@ -84,16 +66,6 @@ define i32 @reduce_i32(i32 %arg) {
  ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
  ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
  ;
-; MVE-RECIP-LABEL: 'reduce_i32'
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
  ; V8M-SIZE-LABEL: 'reduce_i32'
  ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
  ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
@@ -114,16 +86,6 @@ define i32 @reduce_i32(i32 %arg) {
  ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
  ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
  ;
-; MVE-SIZE-LABEL: 'reduce_i32'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
    %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
    %V4   = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
    %V8   = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
author	David Green <david.green@arm.com>
	Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)
committer	David Green <david.green@arm.com>
	Fri, 9 Oct 2020 15:25:25 +0000 (16:25 +0100)
llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll	[new file with mode: 0644]	patch | blob
llvm/test/Analysis/CostModel/ARM/reduce-add.ll		patch | blob | history