From 27fed8e5d636d67ed5e2dff77705dcae1fcd0b15 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 14 Nov 2016 14:45:16 +0000 Subject: [PATCH] [X86][AVX] Fixed v16i16/v32i8 ADD/SUB costs on AVX1 subtargets Add explicit v16i16/v32i8 ADD/SUB costs, matching the costs of v4i64/v8i32 - they were missing for some reason. This has side effects on the LV max bandwidth tests (AVX1 now prefers 128-bit vectors vs AVX2 which still prefers 256-bit) llvm-svn: 286832 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4 ++++ llvm/test/Analysis/CostModel/X86/arith.ll | 16 ++++++++-------- .../Transforms/LoopVectorize/X86/vector_max_bandwidth.ll | 6 ++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7029a02..5b3091e 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -526,6 +526,10 @@ int X86TTIImpl::getArithmeticInstrCost( // Two ops + 1 extract + 1 insert = 4. { ISD::MUL, MVT::v16i16, 4 }, { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v32i8, 4 }, + { ISD::ADD, MVT::v32i8, 4 }, + { ISD::SUB, MVT::v16i16, 4 }, + { ISD::ADD, MVT::v16i16, 4 }, { ISD::SUB, MVT::v8i32, 4 }, { ISD::ADD, MVT::v8i32, 4 }, { ISD::SUB, MVT::v4i64, 4 }, diff --git a/llvm/test/Analysis/CostModel/X86/arith.ll b/llvm/test/Analysis/CostModel/X86/arith.ll index 2f3f0f9..aa204db 100644 --- a/llvm/test/Analysis/CostModel/X86/arith.ll +++ b/llvm/test/Analysis/CostModel/X86/arith.ll @@ -57,13 +57,13 @@ define i32 @add(i32 %arg) { %G = add <8 x i16> undef, undef ; SSSE3: cost of 2 {{.*}} %H = add ; SSE42: cost of 2 {{.*}} %H = add - ; AVX: cost of 2 {{.*}} %H = add + ; AVX: cost of 4 {{.*}} %H = add ; AVX2: cost of 1 {{.*}} %H = add ; AVX512: cost of 1 {{.*}} %H = add %H = add <16 x i16> undef, undef ; SSSE3: cost of 4 {{.*}} %I = add ; SSE42: cost of 4 {{.*}} %I = add - ; AVX: cost of 4 {{.*}} %I = add + ; AVX: cost of 8 {{.*}} %I = add ; AVX2: cost of 2 {{.*}} %I = add ; AVX512F: cost of 2 {{.*}} %I = add ; AVX512BW: cost of 1 {{.*}} %I = add @@ -77,13 +77,13 @@ define i32 @add(i32 %arg) { %J = add <16 x i8> undef, undef ; SSSE3: cost of 2 {{.*}} %K = add ; SSE42: cost of 2 {{.*}} %K = add - ; AVX: cost of 2 {{.*}} %K = add + ; AVX: cost of 4 {{.*}} %K = add ; AVX2: cost of 1 {{.*}} %K = add ; AVX512: cost of 1 {{.*}} %K = add %K = add <32 x i8> undef, undef ; SSSE3: cost of 4 {{.*}} %L = add ; SSE42: cost of 4 {{.*}} %L = add - ; AVX: cost of 4 {{.*}} %L = add + ; AVX: cost of 8 {{.*}} %L = add ; AVX2: cost of 2 {{.*}} %L = add ; AVX512F: cost of 2 {{.*}} %L = add ; AVX512BW: cost of 1 {{.*}} %L = add @@ -140,13 +140,13 @@ define i32 @sub(i32 %arg) { %G = sub <8 x i16> undef, undef ; SSSE3: cost of 2 {{.*}} %H = sub ; SSE42: cost of 2 {{.*}} %H = sub - ; AVX: cost of 2 {{.*}} %H = sub + ; AVX: cost of 4 {{.*}} %H = sub ; AVX2: cost of 1 {{.*}} %H = sub ; AVX512: cost of 1 {{.*}} %H = sub %H = sub <16 x i16> undef, undef ; SSSE3: cost of 4 {{.*}} %I = sub ; SSE42: cost of 4 {{.*}} %I = sub - ; AVX: cost of 4 {{.*}} %I = sub + ; AVX: cost of 8 {{.*}} %I = sub ; AVX2: cost of 2 {{.*}} %I = sub ; AVX512F: cost of 2 {{.*}} %I = sub ; AVX512BW: cost of 1 {{.*}} %I = sub @@ -160,13 +160,13 @@ define i32 @sub(i32 %arg) { %J = sub <16 x i8> undef, undef ; SSSE3: cost of 2 {{.*}} %K = sub ; SSE42: cost of 2 {{.*}} %K = sub - ; AVX: cost of 2 {{.*}} %K = sub + ; AVX: cost of 4 {{.*}} %K = sub ; AVX2: cost of 1 {{.*}} %K = sub ; AVX512: cost of 1 {{.*}} %K = sub %K = sub <32 x i8> undef, undef ; SSSE3: cost of 4 {{.*}} %L = sub ; SSE42: cost of 4 {{.*}} %L = sub - ; AVX: cost of 4 {{.*}} %L = sub + ; AVX: cost of 8 {{.*}} %L = sub ; AVX2: cost of 2 {{.*}} %L = sub ; AVX512F: cost of 2 {{.*}} %L = sub ; AVX512BW: cost of 1 {{.*}} %L = sub diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll index fe9d59e..a32cc46 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll @@ -1,4 +1,5 @@ -; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1 +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -16,7 +17,8 @@ target triple = "x86_64-unknown-linux-gnu" ; -vectorizer-maximize-bandwidth is indicated. ; ; CHECK-label: foo -; CHECK: LV: Selecting VF: 32. +; CHECK-AVX1: LV: Selecting VF: 16. +; CHECK-AVX2: LV: Selecting VF: 32. define void @foo() { entry: br label %for.body -- 2.7.4