From 733c7fc55d0dfa4d49f4becb2fb92e108611ef11 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Fri, 27 Apr 2018 13:36:05 +0000
Subject: [PATCH] [NVPTX] Turn on Loop/SLP vectorization

Since PTX has grown a <2 x half> datatype, vectorization has become more
important. The late LoadStoreVectorizer intentionally only handles loads
and stores, but now arithmetic has to be vectorized for optimal
throughput too.

This is still very limited: SLP vectorization happily creates <2 x half>
when it's a legal type, but there is still a lot of register moving to
get that result fed into a vectorized store. Overall it's a small
performance win because it reduces the number of arithmetic instructions.

I haven't really checked what the loop vectorizer does to PTX code; the
cost model there might need some more tweaks. I didn't see it causing
harm, though.

Differential Revision: https://reviews.llvm.org/D46130

llvm-svn: 331035
---
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h   | 12 +++++++
 .../Transforms/SLPVectorizer/NVPTX/lit.local.cfg   |  2 ++
 llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll  | 40 ++++++++++++++++++++++
 3 files changed, 54 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
 create mode 100644 llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d2414b7..812d305 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,18 @@ public:
     return AddressSpace::ADDRESS_SPACE_GENERIC;
   }
 
+  // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
+  // We conservatively return 1 here which is just enough to enable the
+  // vectorizers but disables heuristics based on the number of registers.
+  // FIXME: Return a more reasonable number, while keeping an eye on
+  // LoopVectorizer's unrolling heuristics.
+  unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+
+  // Only <2 x half> should be vectorized, so always return 32 for the vector
+  // register size.
+  unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+  unsigned getMinVectorRegisterBitWidth() const { return 32; }
+
   // Increase the inlining cost threshold by a factor of 5, reflecting that
   // calls are particularly expensive in NVPTX.
   unsigned getInliningThresholdMultiplier() { return 5; }
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
new file mode 100644
index 0000000..2cb98eb3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
new file mode 100644
index 0000000..d8b80f4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
+
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x half>, <2 x half>*
+; CHECK: fmul fast <2 x half>
+; CHECK: fadd fast <2 x half>
+; CHECK: store <2 x half> %4, <2 x half>*
+
+; NOVECTOR-LABEL: @fusion
+; NOVECTOR: load half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: store half
+define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
+  %tmp = shl nuw nsw i32 %arg2, 6
+  %tmp4 = or i32 %tmp, %arg3
+  %tmp5 = shl nuw nsw i32 %tmp4, 2
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = or i64 %tmp6, 1
+  %tmp10 = bitcast i8* %arg1 to half*
+  %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
+  %tmp12 = load half, half* %tmp11, align 8
+  %tmp13 = fmul fast half %tmp12, 0xH5380
+  %tmp14 = fadd fast half %tmp13, 0xH57F0
+  %tmp15 = bitcast i8* %arg to half*
+  %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
+  store half %tmp14, half* %tmp16, align 8
+  %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
+  %tmp18 = load half, half* %tmp17, align 2
+  %tmp19 = fmul fast half %tmp18, 0xH5380
+  %tmp20 = fadd fast half %tmp19, 0xH57F0
+  %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
+  store half %tmp20, half* %tmp21, align 2
+  ret void
+}
+
+attributes #0 = { nounwind }
-- 
2.7.4
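
Editor's note (not part of the patch): to see the effect locally, one can feed the
new test through opt by hand using the same flags as the RUN lines above; the path
below assumes a checkout laid out as in the diff.

    opt -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 \
        llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll

With -mcpu=sm_70 the two scalar fmul/fadd chains in @fusion should come out as a
single <2 x half> fmul/fadd feeding a <2 x half> store, as the CHECK lines expect;
with -mcpu=sm_40 (the NOVECTOR run) the scalar operations are left untouched.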