From 5cbc7d2999bea6a005e02ec2a8ec4ae53c9646a5 Mon Sep 17 00:00:00 2001
From: Bjarke Hammersholt Roune <broune@google.com>
Date: Wed, 5 Aug 2015 23:11:57 +0000
Subject: [PATCH] [NVPTX] Use LDG for pointer induction variables.

More specifically, make NVPTXISelDAGToDAG able to emit cached loads (LDG) for pointer induction variables.

Also fix a latent bug where LDG was not restricted to kernel functions. I believe this could not be triggered so far, since we do not currently infer that a pointer is global outside a kernel function, and only loads of global pointers are considered for cached loads.

llvm-svn: 244166
---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp        | 39 +++++++++++----
 .../CodeGen/NVPTX/load-with-non-coherent-cache.ll  | 56 +++++++++++++++++++++-
 2 files changed, 84 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1d233c4..2d0098b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
@@ -546,18 +547,36 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
 }
 
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
-                          unsigned codeAddrSpace, const DataLayout &DL) {
-  if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) {
+                          unsigned CodeAddrSpace, MachineFunction *F) {
+  // To use non-coherent caching, the load has to be from global
+  // memory and we have to prove that the memory area is not written
+  // to anywhere for the duration of the kernel call, not even after
+  // the load.
+  //
+  // To ensure that there are no writes to the memory, we require the
+  // underlying pointer to be a noalias (__restrict) kernel parameter
+  // that is never used for a write. We can only do this for kernel
+  // functions since from within a device function, we cannot know if
+  // there were or will be writes to the memory from the caller - or we
+  // could, but then we would have to do inter-procedural analysis.
+  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
+      !isKernelFunction(*F->getFunction())) {
     return false;
   }
 
-  // Check whether load operates on a readonly argument.
-  bool canUseLDG = false;
-  if (const Argument *A = dyn_cast<const Argument>(
-          GetUnderlyingObject(N->getMemOperand()->getValue(), DL)))
-    canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr();
+  // We use GetUnderlyingObjects() here instead of
+  // GetUnderlyingObject() mainly because the former looks through phi
+  // nodes while the latter does not. We need to look through phi
+  // nodes to handle pointer induction variables.
+  SmallVector<Value *, 8> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+                       Objs, F->getDataLayout());
+  for (Value *Obj : Objs) {
+    auto *A = dyn_cast<const Argument>(Obj);
+    if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
+  }
 
-  return canUseLDG;
+  return true;
 }
 
 SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
@@ -654,7 +673,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   // Address Space Setting
   unsigned int codeAddrSpace = getCodeAddrSpace(LD);
 
-  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
     return SelectLDGLDU(N);
   }
 
@@ -892,7 +911,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
 
-  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) {
+  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
     return SelectLDGLDU(N);
   }
 
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index d2d1ae6..d93499b 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -189,7 +189,60 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
   ret void
 }
 
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18}
+; Test that we can infer a cached load for a pointer induction variable.
+; SM20-LABEL: .visible .entry foo19(
+; SM20: ld.global.f32
+; SM35-LABEL: .visible .entry foo19(
+; SM35: ld.global.nc.f32
+define void @foo19(float * noalias readonly %from, float * %to, i32 %n) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
+  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
+  %ptr = getelementptr inbounds float, float * %from, i32 %i
+  %value = load float, float * %ptr, align 4
+  %nextsum = fadd float %value, %sum
+  %nexti = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %nexti, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  store float %nextsum, float * %to
+  ret void
+}
+
+; This test captures the case of a non-kernel function. In a
+; non-kernel function, without interprocedural analysis, we do not
+; know that the parameter is global. We also do not know that the
+; pointed-to memory is never written to (for the duration of the
+; kernel). For both reasons, we cannot use a cached load here.
+; SM20-LABEL: notkernel(
+; SM20: ld.f32
+; SM35-LABEL: notkernel(
+; SM35: ld.f32
+define void @notkernel(float * noalias readonly %from, float * %to) {
+  %1 = load float, float * %from
+  store float %1, float * %to
+  ret void
+}
+
+; As @notkernel, but with the parameter explicitly marked as global. We still
+; do not know that the parameter is never written to (for the duration of the
+; kernel). This case does not currently come up normally since we do not infer
+; that pointers are global interprocedurally as of 2015-08-05.
+; SM20-LABEL: notkernel2(
+; SM20: ld.global.f32
+; SM35-LABEL: notkernel2(
+; SM35: ld.global.f32
+define void @notkernel2(float addrspace(1) * noalias readonly %from, float * %to) {
+  %1 = load float, float addrspace(1) * %from
+  store float %1, float * %to
+  ret void
+}
+
+!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
 !1 = !{void (float *, float *)* @foo1, !"kernel", i32 1}
 !2 = !{void (double *, double *)* @foo2, !"kernel", i32 1}
 !3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1}
@@ -208,3 +261,4 @@ define void @foo18(float ** noalias readonly %from, float ** %to) {
 !16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1}
 !17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1}
 !18 = !{void (float **, float **)* @foo18, !"kernel", i32 1}
+!19 = !{void (float *, float *, i32)* @foo19, !"kernel", i32 1}
-- 
2.7.4