From 9696f06bcf426db4504e2e566ce231c6b84206e2 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Wed, 27 Mar 2019 20:17:01 -0700 Subject: [PATCH] Use __ldg for CUDA kernels in fuser (#18540) Summary: While benchmarking a kernel with broadcasted inputs, I noticed that it was much slower than a hand-coded kernel for the same task. The kernel in question computed a * b + c for a of shape 32 x 32 x 10240 and b and c of shape 1 x 32 x 1. This patch accelerates said kernel from 450us to 250us on my GTX1080Ti. I didn't change half because there doesn't seem to be __ldg for half. An alternative could be to sprinkle const and restrict. Pull Request resolved: https://github.com/pytorch/pytorch/pull/18540 Differential Revision: D14657840 Pulled By: soumith fbshipit-source-id: 408847346ec12d1d1d9b119ac50bbc70f0d9ed33 --- torch/csrc/jit/fuser/codegen.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/fuser/codegen.cpp b/torch/csrc/jit/fuser/codegen.cpp index 80535b3..06a7009 100644 --- a/torch/csrc/jit/fuser/codegen.cpp +++ b/torch/csrc/jit/fuser/codegen.cpp @@ -332,7 +332,7 @@ std::string generateKernel( env.d("nDim", nDim); env.s("scalar_type", scalarTypeName(desc.scalar_type)); formals.push_back( - format("TensorInfo<${scalar_type},${nDim}> ${tensor}", env)); + format("const TensorInfo<${scalar_type},${nDim}> ${tensor}", env)); argument_loads.push_back(format( "*static_cast*>(args[${formal_index}])", env)); @@ -393,6 +393,8 @@ std::string generateKernel( "access", format("__half2float(t${formal}.data[t${formal}_offset])", env)); has_half_tensor = true; + } else if (use_cuda) { + env.s("access", format("__ldg(&t${formal}.data[t${formal}_offset])", env)); } else { env.s("access", format("t${formal}.data[t${formal}_offset]", env)); } -- 2.7.4