From c931d35216a320465947a5a46b158cf71024458d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Sep 2021 14:48:08 +0100 Subject: [PATCH] [CostModel][X86] Increase i64 mul cost from 1 to 2 Only the most recent cpus support really 1cy 64-bit multiplies, and the X64 cost table represents a realistic worst case. The 1cy value was also discouraging vectorization when most vXi64 PMULDQ expansions aren't actually slower than scalarization. Noticed while investigating PR51436. --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 1 + llvm/test/Analysis/CostModel/X86/arith-fix.ll | 200 +++++++-------- llvm/test/Analysis/CostModel/X86/arith-overflow.ll | 120 ++++----- llvm/test/Analysis/CostModel/X86/arith.ll | 20 +- llvm/test/Analysis/CostModel/X86/rem.ll | 280 ++++++++++----------- .../test/Analysis/CostModel/X86/slm-arith-costs.ll | 2 +- .../X86/loop-invariant-conditions.ll | 20 +- .../test/Transforms/SLPVectorizer/X86/arith-fix.ll | 178 ++----------- .../test/Transforms/SLPVectorizer/X86/arith-mul.ll | 199 +++++++++------ 9 files changed, 466 insertions(+), 554 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index e88b5e2..dc11db2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1017,6 +1017,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ + { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/ }; if (ST->is64Bit()) diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll index 3b50f6b..fcb2b90 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll @@ -33,11 +33,11 @@ declare <64 x i8> @llvm.smul.fix.v64i8(<64 x i8>, <64 x i8>, i32) define i32 @smul(i32 %arg) { ; SSSE3-LABEL: 'smul' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -52,11 +52,11 @@ define i32 @smul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'smul' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -71,11 +71,11 @@ define i32 @smul(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'smul' -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -90,11 +90,11 @@ define i32 @smul(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'smul' -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -109,11 +109,11 @@ define i32 @smul(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'smul' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -128,11 +128,11 @@ define i32 @smul(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'smul' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -147,11 +147,11 @@ define i32 @smul(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'smul' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -166,11 +166,11 @@ define i32 @smul(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'smul' -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -185,11 +185,11 @@ define i32 @smul(i32 %arg) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'smul' -; GLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -204,11 +204,11 @@ define i32 @smul(i32 %arg) { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'smul' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.smul.fix.i64(i64 undef, i64 undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -267,11 +267,11 @@ declare <64 x i8> @llvm.umul.fix.v64i8(<64 x i8>, <64 x i8>, i32) define i32 @umul(i32 %arg) { ; SSSE3-LABEL: 'umul' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -286,11 +286,11 @@ define i32 @umul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'umul' -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -305,11 +305,11 @@ define i32 @umul(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'umul' -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX1-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -324,11 +324,11 @@ define i32 @umul(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'umul' -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -343,11 +343,11 @@ define i32 @umul(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'umul' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -362,11 +362,11 @@ define i32 @umul(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'umul' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -381,11 +381,11 @@ define i32 @umul(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'umul' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -400,11 +400,11 @@ define i32 @umul(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'umul' -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; SLM-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -419,11 +419,11 @@ define i32 @umul(i32 %arg) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'umul' -; GLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; GLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) @@ -438,11 +438,11 @@ define i32 @umul(i32 %arg) { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'umul' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call i64 @llvm.umul.fix.i64(i64 undef, i64 undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> undef, <2 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3) diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll index 81bf82e..280a672b 100644 --- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll @@ -990,9 +990,9 @@ declare {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x define i32 @smul(i32 %arg) { ; SSSE3-LABEL: 'smul' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1009,9 +1009,9 @@ define i32 @smul(i32 %arg) { ; ; SSE42-LABEL: 'smul' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1028,9 +1028,9 @@ define i32 @smul(i32 %arg) { ; ; AVX1-LABEL: 'smul' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1047,9 +1047,9 @@ define i32 @smul(i32 %arg) { ; ; AVX2-LABEL: 'smul' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1066,9 +1066,9 @@ define i32 @smul(i32 %arg) { ; ; AVX512F-LABEL: 'smul' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1085,9 +1085,9 @@ define i32 @smul(i32 %arg) { ; ; AVX512BW-LABEL: 'smul' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1104,9 +1104,9 @@ define i32 @smul(i32 %arg) { ; ; AVX512DQ-LABEL: 'smul' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1123,9 +1123,9 @@ define i32 @smul(i32 %arg) { ; ; SLM-LABEL: 'smul' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1142,9 +1142,9 @@ define i32 @smul(i32 %arg) { ; ; GLM-LABEL: 'smul' ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1161,9 +1161,9 @@ define i32 @smul(i32 %arg) { ; ; BTVER2-LABEL: 'smul' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1228,9 +1228,9 @@ declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x define i32 @umul(i32 %arg) { ; SSSE3-LABEL: 'umul' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1247,9 +1247,9 @@ define i32 @umul(i32 %arg) { ; ; SSE42-LABEL: 'umul' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1266,9 +1266,9 @@ define i32 @umul(i32 %arg) { ; ; AVX1-LABEL: 'umul' ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1285,9 +1285,9 @@ define i32 @umul(i32 %arg) { ; ; AVX2-LABEL: 'umul' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1304,9 +1304,9 @@ define i32 @umul(i32 %arg) { ; ; AVX512F-LABEL: 'umul' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1323,9 +1323,9 @@ define i32 @umul(i32 %arg) { ; ; AVX512BW-LABEL: 'umul' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1342,9 +1342,9 @@ define i32 @umul(i32 %arg) { ; ; AVX512DQ-LABEL: 'umul' ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1361,9 +1361,9 @@ define i32 @umul(i32 %arg) { ; ; SLM-LABEL: 'umul' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1380,9 +1380,9 @@ define i32 @umul(i32 %arg) { ; ; GLM-LABEL: 'umul' ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; GLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -1399,9 +1399,9 @@ define i32 @umul(i32 %arg) { ; ; BTVER2-LABEL: 'umul' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; BTVER2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; BTVER2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; BTVER2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) diff --git a/llvm/test/Analysis/CostModel/X86/arith.ll b/llvm/test/Analysis/CostModel/X86/arith.ll index 40f7815e..cfbfc07 100644 --- a/llvm/test/Analysis/CostModel/X86/arith.ll +++ b/llvm/test/Analysis/CostModel/X86/arith.ll @@ -930,7 +930,7 @@ define i32 @and(i32 %arg) { define i32 @mul(i32 %arg) { ; SSSE3-LABEL: 'mul' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef ; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -952,7 +952,7 @@ define i32 @mul(i32 %arg) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'mul' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = mul <4 x i64> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -974,7 +974,7 @@ define i32 @mul(i32 %arg) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'mul' -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = mul <4 x i64> undef, undef ; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -996,7 +996,7 @@ define i32 @mul(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'mul' -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = mul <4 x i64> undef, undef ; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1018,7 +1018,7 @@ define i32 @mul(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'mul' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = mul <4 x i64> undef, undef ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1040,7 +1040,7 @@ define i32 @mul(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'mul' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = mul <4 x i64> undef, undef ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1062,7 +1062,7 @@ define i32 @mul(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'mul' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = mul <2 x i64> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = mul <4 x i64> undef, undef ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1084,7 +1084,7 @@ define i32 @mul(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'mul' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = mul <2 x i64> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = mul <4 x i64> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1106,7 +1106,7 @@ define i32 @mul(i32 %arg) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'mul' -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = mul <4 x i64> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = mul <8 x i64> undef, undef @@ -1128,7 +1128,7 @@ define i32 @mul(i32 %arg) { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'mul' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef +; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = mul <2 x i64> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = mul <4 x i64> undef, undef ; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = mul <8 x i64> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll index 4562543..8b09e42 100644 --- a/llvm/test/Analysis/CostModel/X86/rem.ll +++ b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -13,10 +13,10 @@ define i32 @srem() { ; CHECK-LABEL: 'srem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i32 = srem <4 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i32 = srem <8 x i32> undef, undef @@ -56,10 +56,10 @@ define i32 @srem() { define i32 @urem() { ; CHECK-LABEL: 'urem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i32 = urem <4 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i32 = urem <8 x i32> undef, undef @@ -99,10 +99,10 @@ define i32 @urem() { define i32 @srem_const() { ; SSE2-LABEL: 'srem_const' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, @@ -118,10 +118,10 @@ define i32 @srem_const() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'srem_const' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, @@ -137,10 +137,10 @@ define i32 @srem_const() { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'srem_const' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, @@ -156,10 +156,10 @@ define i32 @srem_const() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_const' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, @@ -175,10 +175,10 @@ define i32 @srem_const() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_const' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, @@ -194,10 +194,10 @@ define i32 @srem_const() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_const' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, @@ -213,10 +213,10 @@ define i32 @srem_const() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'srem_const' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, @@ -232,10 +232,10 @@ define i32 @srem_const() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'srem_const' -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, @@ -251,10 +251,10 @@ define i32 @srem_const() { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'srem_const' -; GLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, @@ -270,10 +270,10 @@ define i32 @srem_const() { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'srem_const' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, @@ -313,10 +313,10 @@ define i32 @srem_const() { define i32 @urem_const() { ; SSE-LABEL: 'urem_const' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = urem <8 x i32> undef, @@ -332,10 +332,10 @@ define i32 @urem_const() { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'urem_const' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, @@ -351,10 +351,10 @@ define i32 @urem_const() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'urem_const' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, @@ -370,10 +370,10 @@ define i32 @urem_const() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'urem_const' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, @@ -389,10 +389,10 @@ define i32 @urem_const() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'urem_const' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, @@ -408,10 +408,10 @@ define i32 @urem_const() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'urem_const' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, @@ -451,10 +451,10 @@ define i32 @urem_const() { define i32 @srem_uniformconst() { ; SSE-LABEL: 'srem_uniformconst' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = srem <8 x i32> undef, @@ -470,10 +470,10 @@ define i32 @srem_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_uniformconst' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, @@ -489,10 +489,10 @@ define i32 @srem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_uniformconst' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, @@ -508,10 +508,10 @@ define i32 @srem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_uniformconst' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, @@ -527,10 +527,10 @@ define i32 @srem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'srem_uniformconst' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, @@ -546,10 +546,10 @@ define i32 @srem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'srem_uniformconst' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = srem <2 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = srem <4 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = srem <8 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, @@ -589,10 +589,10 @@ define i32 @srem_uniformconst() { define i32 @urem_uniformconst() { ; SSE-LABEL: 'urem_uniformconst' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = urem <8 x i32> undef, @@ -608,10 +608,10 @@ define i32 @urem_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'urem_uniformconst' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, @@ -627,10 +627,10 @@ define i32 @urem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'urem_uniformconst' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, @@ -646,10 +646,10 @@ define i32 @urem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'urem_uniformconst' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, @@ -665,10 +665,10 @@ define i32 @urem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'urem_uniformconst' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, @@ -684,10 +684,10 @@ define i32 @urem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'urem_uniformconst' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2i64 = urem <2 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V4i64 = urem <4 x i64> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V8i64 = urem <8 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = urem i64 undef, 7 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = urem <2 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = urem <4 x i64> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = urem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, @@ -727,7 +727,7 @@ define i32 @urem_uniformconst() { define i32 @srem_constpow2() { ; SSE2-LABEL: 'srem_constpow2' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2i64 = srem <2 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4i64 = srem <4 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8i64 = srem <8 x i64> undef, @@ -746,7 +746,7 @@ define i32 @srem_constpow2() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'srem_constpow2' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2i64 = srem <2 x i64> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4i64 = srem <4 x i64> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8i64 = srem <8 x i64> undef, @@ -765,7 +765,7 @@ define i32 @srem_constpow2() { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'srem_constpow2' -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = srem <2 x i64> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = srem <4 x i64> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = srem <8 x i64> undef, @@ -784,7 +784,7 @@ define i32 @srem_constpow2() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_constpow2' -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, @@ -803,7 +803,7 @@ define i32 @srem_constpow2() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_constpow2' -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8i64 = srem <8 x i64> undef, @@ -822,7 +822,7 @@ define i32 @srem_constpow2() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_constpow2' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = srem <8 x i64> undef, @@ -841,7 +841,7 @@ define i32 @srem_constpow2() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'srem_constpow2' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = srem <8 x i64> undef, @@ -860,7 +860,7 @@ define i32 @srem_constpow2() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'srem_constpow2' -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V2i64 = srem <2 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V4i64 = srem <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V8i64 = srem <8 x i64> undef, @@ -879,7 +879,7 @@ define i32 @srem_constpow2() { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'srem_constpow2' -; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = srem <2 x i64> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = srem <4 x i64> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = srem <8 x i64> undef, @@ -898,7 +898,7 @@ define i32 @srem_constpow2() { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'srem_constpow2' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2i64 = srem <2 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1022,7 +1022,7 @@ define i32 @urem_constpow2() { define i32 @srem_uniformconstpow2() { ; SSE2-LABEL: 'srem_uniformconstpow2' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2i64 = srem <2 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4i64 = srem <4 x i64> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1041,7 +1041,7 @@ define i32 @srem_uniformconstpow2() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'srem_uniformconstpow2' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2i64 = srem <2 x i64> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4i64 = srem <4 x i64> undef, ; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1060,7 +1060,7 @@ define i32 @srem_uniformconstpow2() { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'srem_uniformconstpow2' -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4i64 = srem <4 x i64> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1079,7 +1079,7 @@ define i32 @srem_uniformconstpow2() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_uniformconstpow2' -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1098,7 +1098,7 @@ define i32 @srem_uniformconstpow2() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_uniformconstpow2' -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1117,7 +1117,7 @@ define i32 @srem_uniformconstpow2() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_uniformconstpow2' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1136,7 +1136,7 @@ define i32 @srem_uniformconstpow2() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'srem_uniformconstpow2' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2i64 = srem <2 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1155,7 +1155,7 @@ define i32 @srem_uniformconstpow2() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'srem_uniformconstpow2' -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2i64 = srem <2 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4i64 = srem <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1174,7 +1174,7 @@ define i32 @srem_uniformconstpow2() { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'srem_uniformconstpow2' -; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; GLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; GLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4i64 = srem <4 x i64> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8i64 = srem <8 x i64> undef, @@ -1193,7 +1193,7 @@ define i32 @srem_uniformconstpow2() { ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'srem_uniformconstpow2' -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16 +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, diff --git a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll index c6fab69..5a7267d 100644 --- a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll +++ b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -442,7 +442,7 @@ entry: ; 64bit mul define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) { ; CHECK-LABEL: 'slm-costs_64_scalar_mul' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw i64 %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw i64 %a, %b ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %res ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/loop-invariant-conditions.ll b/llvm/test/Transforms/IndVarSimplify/X86/loop-invariant-conditions.ll index 8b80f8e..4e0556d 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/loop-invariant-conditions.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/loop-invariant-conditions.ll @@ -311,14 +311,12 @@ for.end: ; preds = %if.end, %entry define void @test3_neg(i64 %start) { ; CHECK-LABEL: @test3_neg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[START:%.*]], i64 -1) -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[SMAX]], 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[FOR_END:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -338,18 +336,16 @@ for.end: ; preds = %if.end, %entry define void @test4_neg(i64 %start) { ; CHECK-LABEL: @test4_neg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[START:%.*]], i64 0) -; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[START]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 25 ; CHECK-NEXT: br i1 [[CMP]], label [[BACKEDGE]], label [[FOR_END:%.*]] ; CHECK: backedge: ; CHECK-NEXT: call void @foo() -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[LOOP]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END]], label [[LOOP]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll index 87ee673..acd9979 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -952,70 +952,22 @@ define void @umul_v8i64() { define void @umul_v16i32() { ; SSE-LABEL: @umul_v16i32( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; SSE-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; SSE-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; SSE-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; SSE-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; SSE-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; SSE-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; SSE-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; SSE-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; SSE-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; SSE-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; SSE-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; SSE-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; SSE-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; SSE-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3) +; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3) +; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3) +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v16i32( @@ -1085,83 +1037,16 @@ define void @umul_v16i32() { ; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @umul_v16i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3) -; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3) -; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3) -; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3) -; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3) -; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3) -; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3) -; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3) -; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3) -; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3) -; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3) -; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3) -; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3) -; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3) -; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3) -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @umul_v16i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @umul_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @umul_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -1170,17 +1055,6 @@ define void @umul_v16i32() { ; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 ; AVX512-NEXT: ret void ; -; AVX256BW-LABEL: @umul_v16i32( -; AVX256BW-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX256BW-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX256BW-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256BW-NEXT: ret void -; %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll index 315eb9a4..c38794d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=SSE ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX128 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX256 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=+prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX128 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=-prefer-128-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX256 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=AVX256 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -93,51 +93,35 @@ define void @mul_v8i64() { ; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @mul_v8i64( -; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] -; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] -; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] -; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] -; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] -; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] -; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] -; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] -; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 -; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 -; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 -; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 -; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 -; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 -; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 -; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 -; AVX1-NEXT: ret void +; AVX128-LABEL: @mul_v8i64( +; AVX128-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]] +; AVX128-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]] +; AVX128-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]] +; AVX128-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]] +; AVX128-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX128-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: ret void ; -; AVX2-LABEL: @mul_v8i64( -; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 -; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: ret void +; AVX256-LABEL: @mul_v8i64( +; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] +; AVX256-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] +; AVX256-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @mul_v8i64( ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 @@ -220,16 +204,35 @@ define void @mul_v16i32() { ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; -; AVX-LABEL: @mul_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: ret void +; AVX128-LABEL: @mul_v16i32( +; AVX128-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] +; AVX128-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] +; AVX128-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] +; AVX128-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] +; AVX128-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX128-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: ret void +; +; AVX256-LABEL: @mul_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] +; AVX256-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]] +; AVX256-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX256-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @mul_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -344,16 +347,35 @@ define void @mul_v32i16() { ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; -; AVX-LABEL: @mul_v32i16( -; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 -; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: ret void +; AVX128-LABEL: @mul_v32i16( +; AVX128-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] +; AVX128-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] +; AVX128-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] +; AVX128-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] +; AVX128-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; AVX128-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: ret void +; +; AVX256-LABEL: @mul_v32i16( +; AVX256-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] +; AVX256-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] +; AVX256-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX256-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @mul_v32i16( ; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, <32 x i16>* bitcast ([32 x i16]* @a16 to <32 x i16>*), align 2 @@ -532,16 +554,35 @@ define void @mul_v64i8() { ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; -; AVX-LABEL: @mul_v64i8( -; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = mul <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 -; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: ret void +; AVX128-LABEL: @mul_v64i8( +; AVX128-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] +; AVX128-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] +; AVX128-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] +; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] +; AVX128-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX128-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: ret void +; +; AVX256-LABEL: @mul_v64i8( +; AVX256-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP5:%.*]] = mul <32 x i8> [[TMP1]], [[TMP3]] +; AVX256-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP2]], [[TMP4]] +; AVX256-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX256-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @mul_v64i8( ; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, <64 x i8>* bitcast ([64 x i8]* @a8 to <64 x i8>*), align 1 -- 2.7.4