From 62d02f2b577c223f94d0b190df3f158cd985c221 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 24 Aug 2021 00:29:22 -0700 Subject: [PATCH] [TensorExpr] Make 'Tensor' a value type. (#63586) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63586 This is another commit in the transition away from KernelArena memory management. Tensor is essentially just a pair of a Buf and a Stmt, and we don't need to dynamically allocate it at all - it's cheap to pass it by value, and that's what we're switching to in this commit. After this change nothing uses KernelScope/KernelArena and they can be safely removed. Differential Revision: D30429114 Test Plan: Imported from OSS Reviewed By: navahgar Pulled By: ZolotukhinM fbshipit-source-id: f90b859cfe863692b7beffbe9bd0e4143df1e819 --- benchmarks/cpp/tensorexpr/bench_approx.cpp | 20 +- benchmarks/cpp/tensorexpr/bench_batchnorm.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_compile.cpp | 40 +- benchmarks/cpp/tensorexpr/bench_concat.cpp | 4 +- benchmarks/cpp/tensorexpr/bench_gemm.cpp | 12 +- benchmarks/cpp/tensorexpr/bench_parallel.cpp | 2 +- benchmarks/cpp/tensorexpr/bench_reduce.cpp | 16 +- test/cpp/tensorexpr/test_approx.cpp | 4 +- test/cpp/tensorexpr/test_boundsinference.cpp | 190 +++---- test/cpp/tensorexpr/test_conv.cpp | 8 +- test/cpp/tensorexpr/test_cuda.cpp | 68 +-- test/cpp/tensorexpr/test_external_calls.cpp | 48 +- test/cpp/tensorexpr/test_ir_printer.cpp | 14 +- test/cpp/tensorexpr/test_kernel.cpp | 2 +- test/cpp/tensorexpr/test_llvm.cpp | 64 +-- test/cpp/tensorexpr/test_loopnest.cpp | 645 +++++++++++----------- test/cpp/tensorexpr/test_memdependency.cpp | 68 ++- test/cpp/tensorexpr/test_ops.cpp | 4 +- test/cpp/tensorexpr/test_reductions.cpp | 230 ++++---- test/cpp/tensorexpr/test_simplify.cpp | 2 +- test/cpp/tensorexpr/tutorial.cpp | 30 +- torch/csrc/jit/runtime/static/ops.cpp | 14 +- torch/csrc/jit/tensorexpr/codegen.h | 2 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 17 +- torch/csrc/jit/tensorexpr/ir_printer.h | 4 +- torch/csrc/jit/tensorexpr/kernel.cpp | 62 +-- torch/csrc/jit/tensorexpr/kernel.h | 10 +- torch/csrc/jit/tensorexpr/loopnest.cpp | 22 +- torch/csrc/jit/tensorexpr/loopnest.h | 14 +- torch/csrc/jit/tensorexpr/operators/conv2d.cpp | 18 +- torch/csrc/jit/tensorexpr/operators/conv2d.h | 8 +- torch/csrc/jit/tensorexpr/operators/matmul.cpp | 8 +- torch/csrc/jit/tensorexpr/operators/matmul.h | 4 +- torch/csrc/jit/tensorexpr/operators/norm.cpp | 2 +- torch/csrc/jit/tensorexpr/operators/norm.h | 2 +- torch/csrc/jit/tensorexpr/operators/reduction.cpp | 10 +- torch/csrc/jit/tensorexpr/operators/reduction.h | 6 +- torch/csrc/jit/tensorexpr/operators/softmax.cpp | 31 +- torch/csrc/jit/tensorexpr/operators/softmax.h | 2 +- torch/csrc/jit/tensorexpr/tensor.cpp | 30 +- torch/csrc/jit/tensorexpr/tensor.h | 39 +- torch/csrc/jit/tensorexpr/tensorexpr_init.cpp | 20 +- 42 files changed, 881 insertions(+), 919 deletions(-) diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp index 6e31697..92c2640 100644 --- a/benchmarks/cpp/tensorexpr/bench_approx.cpp +++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp @@ -10,14 +10,14 @@ using namespace torch::jit; using namespace torch::jit::tensorexpr; -void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) { +void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) { auto loops = ln->getLoopStmtsFor(target); ForPtr inner, tail; ln->splitWithTail(loops[0], width, &inner, &tail); ln->vectorize(inner); } -void 
optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) { +void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) { std::vector loops = ln->getLoopStmtsFor(target); ForPtr inner, tail; ln->splitWithTail(loops[0], 16 * 8, &inner, &tail); @@ -33,7 +33,7 @@ static void relu_nnc(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 0; - torch::jit::tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i){ + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i){ auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -67,7 +67,7 @@ static void log_nnc_sleef(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return log(A.load(i)); }); @@ -97,7 +97,7 @@ static void log_nnc_fast(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return fast_log(A.load(i)); }); @@ -127,7 +127,7 @@ static void log_nnc_vml(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return log_vml(A.load(i)); }); @@ -168,7 +168,7 @@ static void logit_nnc_sleef(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -205,7 +205,7 @@ static void logit_nnc_fast(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -242,7 +242,7 @@ static void logit_nnc_vml(benchmark::State& state) { auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); auto clamp = 1e-6f; - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { auto elem = A.load(i); auto min = FloatImm::make(clamp); @@ -319,7 +319,7 @@ static void tanh_nnc_fast(benchmark::State& state) { KernelScope ks; auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - torch::jit::tensorexpr::Tensor* B = + torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return fast_tanh(A.load(i)); }); diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index 872594e..85bf9d3 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -84,7 +84,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor* output = Compute( + Tensor output = Compute( "output", {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, [&](axis n, axis c, axis h, axis w) { @@ -147,7 +147,7 @@ 
BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor* output = Compute( + Tensor output = Compute( "output", {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, [&](axis n, axis c, axis h, axis w) { diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index 245d5d8..50d54e5 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -13,24 +13,24 @@ static void BM_CompileSwish(benchmark::State& state) { te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); - te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { + te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { return te::Max::make(A.load(i), 0.f, false); }); - te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu->load(i), 6.f, false); + te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); }); - te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6->load(i) + 3.f; + te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; }); - te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3->load(i); + te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); }); - te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times->load(i) * 1.f / 6.f; + te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor->buf()); + nest.computeInline(tensor.buf()); } nest.prepareForCodegen(); te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); @@ -43,24 +43,24 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) { te::KernelScope ks; te::VarHandle n("n", te::kInt); te::Placeholder A(te::BufHandle("A", {N}, te::kFloat)); - te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { + te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { return te::Max::make(A.load(i), 0.f, false); }); - te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu->load(i), 6.f, false); + te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); }); - te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6->load(i) + 3.f; + te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; }); - te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3->load(i); + te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); }); - te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times->load(i) * 1.f / 6.f; + te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { + return 
times.load(i) * 1.f / 6.f; }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { - nest.computeInline(tensor->buf()); + nest.computeInline(tensor.buf()); } nest.prepareForCodegen(); te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt()); diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index cb9aa84..856065d 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -60,7 +60,7 @@ class ConcatBench : public benchmark::Fixture { {input_sizes_[i][0], input_sizes_[i][1]})); } - Tensor* output = Compute( + Tensor output = Compute( "aten_cat", {{output_size_[0], "M"}, {output_size_[1], "N"}}, [&](const VarHandle& m, const VarHandle& n) { @@ -147,7 +147,7 @@ class ConcatBench : public benchmark::Fixture { for_stmts[i] = for_st; cumulative_input_sizes += input_sizes_[i][1]; } - auto output = new Tensor(output_buf, alloc(for_stmts)); + auto output = Tensor(output_buf, alloc(for_stmts)); LoopNest nest({output}); nest.prepareForCodegen(); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 7ebaa87..8646e97 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -44,7 +44,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -68,7 +68,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -188,7 +188,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -256,7 +256,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat)); te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat)); - te::Tensor* CT = te::Reduce( + te::Tensor CT = te::Reduce( "gemm", {{M, "M"}, {N, "N"}}, te::Sum(), @@ -302,7 +302,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { } { auto const& loops = loop.getLoopStmtsFor(CT); - loop.cacheAccesses(CT->buf(), "C_regs", loops[2]); + loop.cacheAccesses(CT.buf(), "C_regs", loops[2]); } loop.prepareForCodegen(); diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index 966c9e2..847b66d 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -38,7 +38,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { KernelScope kernel_scope; Placeholder 
a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* c_tensor = Compute( + Tensor c_tensor = Compute( "c", {{M, "m"}}, [&](const VarHandle& m) { return a_buf.load(m) + b_buf.load(m); }); diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index be5dcc8..9d35701 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -222,7 +222,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -255,7 +255,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -296,7 +296,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { int M = A.numel(); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {{1, "N"}}, te::Sum(), @@ -339,7 +339,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { TORCH_CHECK(M % kChunkSize == 0); te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); - te::Tensor* BT = te::Reduce( + te::Tensor BT = te::Reduce( "reduce_full", {}, te::Sum(), @@ -359,7 +359,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { loop.reorderAxis(mo, mi); loops = loop.getLoopStmtsFor(BT); - auto bt_body = loop.getAllWritesToBuf(BT->buf())[1]; + auto bt_body = loop.getAllWritesToBuf(BT.buf())[1]; TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf)); loop.reorderAxis(loops.at(0), loops.at(1)); @@ -390,7 +390,7 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { const int kChunkSize = 8; te::Placeholder a("A", te::kFloat, {M}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); te::LoopNest nest({b}); auto loops = nest.getLoopStmtsFor(b); @@ -453,7 +453,7 @@ BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { te::KernelScope ks; constexpr int kCacheSize = 1 << 12; te::Placeholder a("A", te::kFloat, {M, N}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat); te::LoopNest nest({b}); auto sch = state.range(2); @@ -560,7 +560,7 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { te::KernelScope ks; constexpr int kChunkSize = 8; te::Placeholder a("A", te::kFloat, {M, N}); - te::Tensor* b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat); + te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat); te::LoopNest nest({b}); auto sch = state.range(2); diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp index d761645..2a4ce94 100644 --- a/test/cpp/tensorexpr/test_approx.cpp +++ b/test/cpp/tensorexpr/test_approx.cpp @@ -11,7 +11,7 @@ using namespace torch::indexing; namespace te = torch::jit::tensorexpr; -static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) { +static void vectorize(te::LoopNest* ln, te::Tensor target, int width) { auto loops = ln->getLoopStmtsFor(target); te::ForPtr inner, tail; 
ln->splitWithTail(loops[0], width, &inner, &tail); @@ -33,7 +33,7 @@ TEST(Approx, log_vml) { te::KernelScope ks; te::VarHandle N("N", te::kInt); te::Placeholder A("A", te::kFloat, {N}); - te::Tensor* B = te::Compute( + te::Tensor B = te::Compute( "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); }); te::LoopNest ln({B}); diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index fcfa8ce..d038665 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -49,7 +49,7 @@ TEST(BoundsInference, _1) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -60,9 +60,9 @@ TEST(BoundsInference, _1) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 99}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}}); } TEST(BoundsInference, _2) { @@ -74,7 +74,7 @@ TEST(BoundsInference, _2) { KernelScope kernel_scope; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -85,9 +85,9 @@ TEST(BoundsInference, _2) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, -1}}); } TEST(BoundsInference, _3) { @@ -99,7 +99,7 @@ TEST(BoundsInference, _3) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n + 10}, kFloat)); - Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i) * a.load(i + 10); }); LoopNest l({b}); @@ -111,9 +111,9 @@ TEST(BoundsInference, _3) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 109}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}}); } TEST(BoundsInference, _4) { @@ -129,13 +129,13 @@ TEST(BoundsInference, _4) { ExprHandle W(320); ExprHandle H(200); Placeholder a(BufHandle("a", {H, W}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y, x) * b->load(y, x); + return a.load(y, x) * b.load(y, x); }); LoopNest l({c}); std::vector loops 
= l.getLoopStmtsFor(c); @@ -149,13 +149,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 199}, {0, 319}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 199}, {0, 319}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 199}, {0, 319}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 199}, {0, 319}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 199}, {0, 319}}); } { // Infer bounds on the inner loop scope @@ -166,13 +166,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {0, 319}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 319}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 319}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 319}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 319}}); } { // Infer bounds on the inner loop body's scope @@ -183,13 +183,13 @@ TEST(BoundsInference, _4) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}}); } } @@ -208,7 +208,7 @@ TEST(BoundsInference, _5) { KernelScope kernel_scope; ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); @@ -229,9 +229,9 @@ TEST(BoundsInference, _5) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 95}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 95}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 95}}); } { // Verify inferred bounds for the tail loop @@ -242,9 +242,9 @@ 
TEST(BoundsInference, _5) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{96, 99}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{96, 99}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{96, 99}}); } } @@ -263,13 +263,13 @@ TEST(BoundsInference, _6) { ExprHandle CW(32); ExprHandle CH(20); Placeholder a(BufHandle("a", {H, W}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y + 100, x + 100) * b->load(y * 2, x * 5); + return a.load(y + 100, x + 100) * b.load(y * 2, x * 5); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -283,13 +283,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{100, 119}, {100, 131}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 38}, {0, 155}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 38}, {0, 155}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 19}, {0, 31}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 19}, {0, 31}}); } { // Infer bounds on the inner loop scope @@ -300,13 +300,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {100, 131}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 155}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 155}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 31}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 31}}); } { // Infer bounds on the inner loop body's scope @@ -317,13 +317,13 @@ TEST(BoundsInference, _6) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad); - verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad); + verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, 
-1}, {-1, -1}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}}); } } @@ -331,9 +331,9 @@ TEST(BoundsInference, Adjacent) { KernelScope kernel_scope; ExprHandle H(6); Placeholder a(BufHandle("a", {20}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -348,9 +348,9 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 5}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}}); } { // Infer bounds on the inner loop scope @@ -362,9 +362,9 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{6, 11}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}}); } { // Infer bounds on the high level program. @@ -377,24 +377,24 @@ TEST(BoundsInference, Adjacent) { ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad); verifyConstBounds(bounds_info.at(a.data())[0], {{0, 11}}); - ASSERT_EQ(bounds_info.at(b->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(b.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}}); - ASSERT_EQ(bounds_info.at(c->buf()).size(), 1); - ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore); - verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}}); + ASSERT_EQ(bounds_info.at(c.buf()).size(), 1); + ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore); + verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}}); } } TEST(BoundsInference, MultipleTopLoopLoad) { KernelScope kernel_scope; Placeholder a(BufHandle("a", {100}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); }); LoopNest l({b, c, d}); @@ -418,7 +418,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { // b, c, d only written. 
{ - auto bounds = bounds_info[b->buf()]; + auto bounds = bounds_info[b.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -426,7 +426,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { verifyConstBounds(bound, {{0, 63}}); } { - auto bounds = bounds_info[c->buf()]; + auto bounds = bounds_info[c.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -434,7 +434,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) { verifyConstBounds(bound, {{0, 31}}); } { - auto bounds = bounds_info[d->buf()]; + auto bounds = bounds_info[d.buf()]; ASSERT_EQ(bounds.size(), 1); auto bound = bounds[0]; ASSERT_EQ(bound.kind, TensorAccessKind::kStore); @@ -506,24 +506,24 @@ TEST(BoundsInference, MultipleTopLoopStore) { TEST(BoundsInference, CacheReads) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 3); + return A.load(i + 30, j + 3); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); LoopNest l({B, C}); auto bounds_info_before = inferBounds(l.root_stmt()); StmtPtr j_loop = l.getLoopStmtsFor(B)[1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); auto bounds_info_after = inferBounds(l.root_stmt()); @@ -571,7 +571,7 @@ TEST(BoundsInference, CacheReads) { TEST(BoundsInference, Flattened) { KernelScope kernel_scope; - Tensor* b = Compute( + Tensor b = Compute( "b", {{3, "z"}, {4, "y"}, {5, "x"}}, [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) { @@ -585,7 +585,7 @@ TEST(BoundsInference, Flattened) { // There's only one buffer. ASSERT_EQ(bounds_info.size(), 1); - auto& TABI = bounds_info[b->buf()][0]; + auto& TABI = bounds_info[b.buf()][0]; ASSERT_EQ(TABI.kind, TensorAccessKind::kStore); // Flattened bounds should have a single dimension. 
ASSERT_EQ(TABI.start.size(), 1); @@ -651,11 +651,11 @@ TEST(BoundsInference, GetPotentialHazards) { TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return (i + 1) * (j + 1); }); @@ -679,13 +679,13 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { TEST(BoundsInference, GetPotentialHazardsLoopCall) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i, j) + 5; + return A.load(i, j) + 5; }); LoopNest l({A, B}); @@ -706,7 +706,7 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) { TEST(BoundsInference, GetPotentialHazardsLoopSplit) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp index 63881d0..293fbe2 100644 --- a/test/cpp/tensorexpr/test_conv.cpp +++ b/test/cpp/tensorexpr/test_conv.cpp @@ -30,7 +30,7 @@ TEST(Conv, DepthwiseConv2D) { te::Placeholder input("input", te::kFloat, {N, C, H, W}); te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S}); te::Placeholder bias("bias", te::kFloat, {K}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), bias.handle(), kStride, kPad, kGroups); te::LoopNest loop({output}); @@ -61,7 +61,7 @@ TEST(Conv, DepthwiseConv2DNoBias) { te::Placeholder input("input", te::kFloat, {N, C, H, W}); te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), kStride, kPad, kGroups); te::LoopNest loop({output}); @@ -96,7 +96,7 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) { te::Placeholder input("input", te::kFloat, {N_var, C_var, H_var, W_var}); te::Placeholder weight( "weight", te::kFloat, {K_var, CperG_var, R_var, S_var}); - te::Tensor* output = te::conv2d_depthwise( + te::Tensor output = te::conv2d_depthwise( input.handle(), weight.handle(), N_var, @@ -195,7 +195,7 @@ TEST(Conv, Conv2D) { te::Placeholder inputB(te::BufHandle("input", {N, C, H, W}, te::kFloat)); te::Placeholder filterB(te::BufHandle("filter", {K, C, R, S}, te::kFloat)); - te::Tensor* conv = te::Reduce( + te::Tensor conv = te::Reduce( "conv", {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, te::Sum(), diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index e36e17a..ed5c070 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -34,7 +34,7 @@ static void testCudaTestVectorAdd01_impl() { Dtype dtype = ToDtype(); Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); Placeholder b_buf("b", dtype, {num_iter, block_count, block_size}); - Tensor* c = Compute( + Tensor c = Compute( "c", { {num_iter, "n"}, @@ -99,7 +99,7 @@ TEST(Cuda, Sigmoid_CUDA) { const int block_size = 128; Dtype dtype = ToDtype(); Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); - Tensor* c = Compute( + 
Tensor c = Compute( "c", { {num_iter, "n"}, @@ -165,7 +165,7 @@ static void testCudaTestVectorAdd02_impl(int N, int block_size) { KernelScope kernel_scope; Placeholder a_buf("a", kFloat, {N}); Placeholder b_buf("b", kFloat, {N}); - Tensor* c = Compute( + Tensor c = Compute( "c", { {N, "N"}, @@ -225,7 +225,7 @@ TEST(Cuda, HalfCast_CUDA) { KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { return Cast::make(kFloat, a.load(i)); }); @@ -267,7 +267,7 @@ TEST(Cuda, DynamicShape2D_CUDA) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -328,7 +328,7 @@ TEST(Cuda, TestRand01_CUDA) { const int num_iter = 3; const int block_count = 16; const int block_size = 128; - Tensor* c = Compute( + Tensor c = Compute( "c", { {num_iter, "n"}, @@ -387,7 +387,7 @@ TEST(Cuda, DynamicShapeSplit_CUDA) { constexpr int N = 4096; VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); ForPtr inner; @@ -928,16 +928,16 @@ TEST(Cuda, HalfSupport_CUDA) { KernelScope ks; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); - Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b->load(i)); + Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { + return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i)); }); - Tensor* d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(half, c->load(i)); + Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { + return Cast::make(half, c.load(i)); }); LoopNest l({b, c, d}); @@ -986,7 +986,7 @@ TEST(Cuda, HalfPropagation_CUDA) { KernelScope kernel_scope; auto half = ToDtype(); Placeholder a("a", half, {4}); - Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1036,7 +1036,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { Placeholder a("a", kFloat, {4}); auto half = ToDtype(); Placeholder b("b", half, {4}); - Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1168,10 +1168,10 @@ TEST(Cuda, MaskBlockDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1261,10 +1261,10 @@ TEST(Cuda, MaskThreadDim_CUDA) { int B_SIZE = 100; Placeholder a_buf("a", kFloat, {A_SIZE}); 
Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i / 2) + b_buf.load(i); }); @@ -1356,10 +1356,10 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1450,10 +1450,10 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {A_SIZE}); Placeholder b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); - Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1543,17 +1543,17 @@ TEST(Cuda, MaskMultiDim_CUDA) { int B_SIZE = 50; Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1673,17 +1673,17 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { VarHandle B_SIZE("B_SIZE", kInt); Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2087,17 +2087,17 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { int B_SIZE = 15; Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2218,17 +2218,17 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { int B_SIZE = 15; Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); - Tensor* c = Compute( + Tensor c = Compute( "C", {{OUTER_A_SIZE, "i"}, 
{A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); - Tensor* d = Compute( + Tensor d = Compute( "D", {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->load(i, j * 2) + b_buf.load(i, j); + return c.load(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index 24ddfbf..a170e53 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -31,7 +31,7 @@ TEST(ExternalCall, Conv2d_float) { int64_t dilation = 1; int64_t groups = 1; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -95,7 +95,7 @@ TEST(ExternalCall, Conv2d_int) { int64_t dilation = 1; int64_t groups = 1; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -153,7 +153,7 @@ TEST(ExternalCall, Conv2d_nobias_noargs) { Placeholder Weight("Weight", kFloat, {16, 16, 1, 1}); BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -203,7 +203,7 @@ TEST(ExternalCall, Addmm_float) { int64_t beta = 2; int64_t alpha = 2; - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -283,7 +283,7 @@ TEST(ExternalCall, Prepacked_Linear_float) { weight, bias, c10::optional(), c10::optional()); Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1}); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -370,7 +370,7 @@ TEST(ExternalCall, Prepacked_Conv2d_float) { c10::optional()); Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1}); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -431,7 +431,7 @@ TEST(ExternalCall, BinaryFloat) { Placeholder B("", kFloat, toExprHandleVec(bShape)); BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, @@ -516,7 +516,7 @@ TEST(ExternalCall, UnaryFloat) { Placeholder A("A", kFloat, toExprHandleVec(aShape)); BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat); - Tensor* Result = new Tensor( + Tensor Result = Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, externCallName, {BufHandle(A.data())}, externCallArgs)); @@ -566,14 +566,14 @@ TEST(ExternalCall, ComputeInterop) { BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat); BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat); - Tensor* Input = Compute( + Tensor Input = Compute( "Input", {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { return FloatImm::make(5.0f); }); - Tensor* Weight = Compute( + Tensor Weight = Compute( "Weight", {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}}, [&](const VarHandle& n, @@ -581,28 +581,28 @@ TEST(ExternalCall, ComputeInterop) { const VarHandle& h, const VarHandle& w) { return FloatImm::make(6.0f); }); - Tensor* ConvResult = new Tensor( + Tensor ConvResult = Tensor( ConvResultBuf.node(), ExternalCall::make( ConvResultBuf, "nnc_aten_conv2d", - {BufHandle(Input->buf()), BufHandle(Weight->buf())}, + {BufHandle(Input.buf()), BufHandle(Weight.buf())}, {})); - 
Tensor* MatmulResult = new Tensor( + Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( MatmulResultBuf, "nnc_aten_matmul", - {BufHandle(ConvResult->buf()), BufHandle(ConvResult->buf())}, + {BufHandle(ConvResult.buf()), BufHandle(ConvResult.buf())}, {})); - Tensor* Result = Compute( + Tensor Result = Compute( "Result", {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { - return ConvResult->load(n, c, h, w) + MatmulResult->load(n, c, h, w); + return ConvResult.load(n, c, h, w) + MatmulResult.load(n, c, h, w); }); LoopNest l({Input, Weight, ConvResult, MatmulResult, Result}); @@ -658,31 +658,31 @@ TEST(ExternalCall, Inlining) { BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat); - Tensor* A = Compute( + Tensor A = Compute( "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return FloatImm::make(5.0f); }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return FloatImm::make(4.0f); }); - Tensor* MatmulResult = new Tensor( + Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( MatmulResultBuf, "nnc_aten_matmul", - {BufHandle(A->buf()), BufHandle(B->buf())}, + {BufHandle(A.buf()), BufHandle(B.buf())}, {})); - Tensor* Result = Compute( + Tensor Result = Compute( "Result", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return MatmulResult->load(i, j) + FloatImm::make(3.0f); + return MatmulResult.load(i, j) + FloatImm::make(3.0f); }); StmtPtr root_stmt = alloc(std::vector( - {A->stmt(), B->stmt(), MatmulResult->stmt(), Result->stmt()})); - LoopNest l(root_stmt, {Result->buf()}); + {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()})); + LoopNest l(root_stmt, {Result.buf()}); // Inlining should not inline anything here since all Bufs are either // defined or used in ExternalCalls diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp index 76d9247..e11ba06 100644 --- a/test/cpp/tensorexpr/test_ir_printer.cpp +++ b/test/cpp/tensorexpr/test_ir_printer.cpp @@ -56,30 +56,30 @@ TEST(IRPrinter, FunctionName) { int M = 4; int N = 20; - Tensor* producer = Compute( + Tensor producer = Compute( "producer", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return m * n; }); - Tensor* chunk_0 = Compute( + Tensor chunk_0 = Compute( "chunk", {{M, "m"}, {N / 2, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return producer->load(m, n); + return producer.load(m, n); }); - Tensor* chunk_1 = Compute( + Tensor chunk_1 = Compute( "chunk", {{M, "m"}, {N / 2, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return producer->load(m, n + ExprHandle(N / 2)); + return producer.load(m, n + ExprHandle(N / 2)); }); - Tensor* consumer = Compute( + Tensor consumer = Compute( "consumer", {{M, "i"}, {N / 2, "j"}}, [&](const ExprHandle& i, const ExprHandle& j) { - return i * chunk_1->load(i, j); + return i * chunk_1.load(i, j); }); LoopNest l({chunk_0, chunk_1, consumer}); diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 8f36f54..765522e 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -1329,7 +1329,7 @@ TEST_F(Kernel, CodegenInspection) { #endif } -Tensor* lowerNanToNum( +Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType, diff --git 
a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 75e6a06..6081403 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -593,10 +593,10 @@ TEST(LLVM, VectorizerLoadStoreTest) { KernelScope kernel_scope; Placeholder a(BufHandle("A", {1}, kInt)); - Tensor* c = + Tensor c = Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); ASSERT_TRUE(LoopNest::vectorize(to(to(s)->front()))); @@ -616,11 +616,11 @@ TEST(LLVM, VectorizeBitCast) { KernelScope kernel_scope; Placeholder a(BufHandle("A", {128}, kInt)); - Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { return bitcast(a.load(i)); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); ASSERT_TRUE(LoopNest::vectorize(to(to(s)->front()))); @@ -1217,12 +1217,12 @@ TEST(LLVM, StoreFloat) { TEST(LLVM, SimpleMath01) { KernelScope kernel_scope; const int N = 1024; - Tensor* tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { + Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { return cast(i * i + 1); }); LoopNest l({tensor}); StmtPtr stmt = l.root_stmt(); - Placeholder f_buf(BufHandle(tensor->buf())); + Placeholder f_buf(BufHandle(tensor.buf())); LLVMCodeGen cg(stmt, {f_buf}); PaddedBuffer f_v(N, "f_v"); @@ -1241,11 +1241,11 @@ TEST(LLVM, ComputeMul) { const int N = 1024; Placeholder a(BufHandle("a", {N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { return a.load(i) * b.load(i); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); StmtPtr s = l.root_stmt(); @@ -1265,12 +1265,12 @@ TEST(LLVM, BroadcastAdd) { const int N = 1024; Placeholder a(BufHandle("a", {M, N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(j); }); - Placeholder c_buf(BufHandle(c->buf())); + Placeholder c_buf(BufHandle(c.buf())); LoopNest l({c}); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1372,7 +1372,7 @@ TEST(LLVM, TensorDynamicShapeAdd) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {n}, kFloat)); Placeholder b(BufHandle("b", {n}, kFloat)); - Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); LoopNest l({c}); @@ -1396,7 +1396,7 @@ TEST(LLVM, DynamicShape2D) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -1428,7 +1428,7 @@ TEST(LLVM, EliminatedStmt) { KernelScope kernel_scope; Placeholder a(BufHandle("a", {1}, kFloat)); - Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); + Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); LoopNest l({c}); l.prepareForCodegen(); @@ -1452,7 +1452,7 @@ TEST(LLVM, SimpleReduction) { // TODO: why doesn't implicit vector work? 
std::vector axis = {DimArg(1)}; std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); LoopNest loop({b}); loop.prepareForCodegen(); @@ -1491,7 +1491,7 @@ TEST(LLVM, RFactorReduction) { // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); LoopNest loop({b}); std::vector loops = loop.getLoopStmtsFor(b); @@ -1502,7 +1502,7 @@ TEST(LLVM, RFactorReduction) { loops = loop.getLoopStmtsFor(b); loop_m = loops.at(2); loop_n = loops.at(1); - auto b_body = loop.getAllWritesToBuf(b->buf())[1]; + auto b_body = loop.getAllWritesToBuf(b.buf())[1]; ASSERT_TRUE(loop.rfactor(b_body, loop_n)); loop.prepareForCodegen(); @@ -1538,13 +1538,13 @@ TEST(LLVM, RFactorVectorizedReduction) { Placeholder a("a", kFloat, {1, M, N}); - Tensor* b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); + Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); LoopNest loopnest({b}); std::vector loops = loopnest.getLoopStmtsFor(b); // Reorder n and m loops loopnest.reorderAxis(loops.at(1), loops.at(2)); - auto b_body = loopnest.getAllWritesToBuf(b->buf()).at(1); - auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b->buf()); + auto b_body = loopnest.getAllWritesToBuf(b.buf()).at(1); + auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b.buf()); ASSERT_TRUE(all_loops.size() == 2 && all_loops[1].size() == 3); ASSERT_TRUE(loopnest.rfactor(b_body, all_loops[1][1])); auto distributed_loops = loopnest.distributeLoop(all_loops[1][1]); @@ -1585,7 +1585,7 @@ TEST(LLVM, SimpleParallel) { KernelScope kernel_scope; const int M = 4; const int N = 6; - Tensor* f = Compute( + Tensor f = Compute( "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { return cast(m + n); }); @@ -1626,23 +1626,23 @@ TEST(LLVM, CompositeParallel) { KernelScope kernel_scope; int M = 5; int N = 7; - Tensor* t1 = + Tensor t1 = Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; }); - Tensor* t2 = + Tensor t2 = Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; }); - Tensor* t3 = Compute( + Tensor t3 = Compute( "t3", {{M, "M"}, {N, "N"}}, [=](const VarHandle& m, const VarHandle& n) { - return t1->load(m) * t2->load(n); + return t1.load(m) * t2.load(n); }); - Tensor* t4 = Compute( + Tensor t4 = Compute( "t4", {{M, "M"}, {N, "N"}}, [=](const VarHandle& m, const VarHandle& n) { - return t3->load(m, n) + m + n; + return t3.load(m, n) + m + n; }); - LoopNest loop_nest(std::vector({t4}), {t1, t2, t3, t4}); + LoopNest loop_nest({t4}, {t1, t2, t3, t4}); std::vector loop_list; { auto const& loops = loop_nest.getLoopStmtsFor(t1); @@ -1695,7 +1695,7 @@ TEST(LLVM, VectorizedGEMM) { Placeholder AP(BufHandle("A", {M, K}, kFloat)); Placeholder BP(BufHandle("B", {K, N}, kFloat)); - Tensor* CT = Reduce( + Tensor CT = Reduce( "gemm", {{M, "M"}, {N, "N"}}, Sum(), @@ -1776,7 +1776,7 @@ TEST(LLVM, CallRaw) { VarHandle N("N", kInt); Placeholder a(BufHandle("a", {M, N}, kFloat)); Placeholder b(BufHandle("b", {N}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(j); }); @@ -1793,7 +1793,7 @@ TEST(LLVM, CallRaw) { std::vector cv(M * N_value, 0); std::vector args({av.data(), bv.data(), cv.data(), &N_value}); - 
LLVMCodeGen cg(s, {a, b, BufHandle(c->buf()), N}); + LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N}); cg.call_raw(args); for (int i = 0; i < M; i++) { @@ -1802,7 +1802,7 @@ TEST(LLVM, CallRaw) { } } - SimpleIREvaluator eval(s, {a, b, BufHandle(c->buf()), N}); + SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N}); eval.call_raw(args); for (int i = 0; i < M; i++) { @@ -1818,7 +1818,7 @@ TEST(LLVM, CustomTarget) { Placeholder a("a", kFloat, {M}); Placeholder b("b", kFloat, {M}); Placeholder c("c", kFloat, {M}); - Tensor* d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { + Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { return a.load(m) * b.load(m) + c.load(m); }); LoopNest nest({d}); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index c80dd5f..7c3eefa 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -42,13 +42,12 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) { TEST(LoopNest, ExprSimple01) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 2); LoopNest::splitWithTail(loops[0], 2); @@ -56,7 +55,7 @@ TEST(LoopNest, ExprSimple01) { TEST(LoopNest, ExprLower01) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); @@ -73,10 +72,9 @@ TEST(LoopNest, ExprSimple02) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor* tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); @@ -168,14 +166,13 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); loops[0]->set_gpu_block_index(LoopOptions::IDX_Y); LoopNest::sliceHead(loops[0], 2, &head, &tail); @@ -193,14 +190,13 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 4, &head, &tail); // 
NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -227,14 +223,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 10, &head, &tail); ASSERT_EQ(head, loops[0]); @@ -249,14 +244,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 100, &head, &tail); ASSERT_EQ(head, loops[0]); @@ -271,14 +265,13 @@ TEST(LoopNest, ExprSliceHead) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceHead(loops[0], 4, &head, &tail); ASSERT_NE(head, nullptr); @@ -295,10 +288,9 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -323,14 +315,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 10, &head, &tail); ASSERT_EQ(head, nullptr); @@ -347,14 +338,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = 
l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 100, &head, &tail); ASSERT_EQ(head, nullptr); @@ -369,14 +359,13 @@ TEST(LoopNest, ExprSliceTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::sliceTail(loops[0], 4, &head, &tail); ASSERT_NE(head, nullptr); @@ -396,15 +385,14 @@ TEST(LoopNest, ExprSplitAndSlice) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{100, "x"}}, func); + Tensor tensor = Compute("f", {{100, "x"}}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr tail; - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // outer: [0, 4) // inner: [0, 21) // tail: [84, 100) @@ -450,10 +438,9 @@ TEST(LoopNest, ExprSliceAndNormalize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {{10, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -482,11 +469,11 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { const std::vector>& expected_for_ranges) { KernelScope kernel_scope; VarHandle dim("dim", kInt); - Tensor* tensor = + Tensor tensor = Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); LoopNest l({tensor}); std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -521,10 +508,9 @@ TEST(LoopNest, ExprSplitWithTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor* tensor = Compute("f", {{199, "x"}}, func); + Tensor tensor = Compute("f", {{199, "x"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) LoopNest::splitWithTail(loops[0], 17); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -552,10 +538,9 @@ TEST(LoopNest, ExprSplitWithTailNone) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor* tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); StmtPtr stmt = l.root_stmt(); @@ -612,14 +597,13 @@ TEST(LoopNest, ExprSplitWithMask01) { const int N = 5; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - 
Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithMask(loops[1], 4); StmtPtr stmt = l.root_stmt(); @@ -648,13 +632,12 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { const int M = 64; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); LoopNest::splitWithMask(loops[0], 4); @@ -736,14 +719,13 @@ TEST(LoopNest, TileSimple) { const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) l.tile(loops[0], loops[1], 4, 8); @@ -783,14 +765,13 @@ TEST(LoopNest, TileWithTails) { const int M = 64, N = 64; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {M, N}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); LoopNest l({tensor}); - std::vector loops = - l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) l.tile(loops[0], loops[1], 5, 9); @@ -831,7 +812,7 @@ TEST(LoopNest, TileInMiddle) { const int M = 8, N = 8, L = 8, K = 8; Placeholder a_buf("a", kFloat, {M, N, L, K}); Placeholder b_buf("b", kFloat, {M, N, L, K}); - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}}, [&](const ExprHandle& m, @@ -843,7 +824,7 @@ TEST(LoopNest, TileInMiddle) { LoopNest nest({tensor}); std::vector loops = - nest.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + nest.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) nest.tile(loops[1], loops[2], 3, 3); @@ -891,7 +872,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) { const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -922,7 +903,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) { const int M = 21; Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M}); - Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { 
return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -949,7 +930,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) { const int K = 6; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -998,20 +979,20 @@ TEST(LoopNest, ScheduleFunctionCall01) { const int K = 6; Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); + LoopNest l({d}, {c, d}); l.prepareForCodegen(); StmtPtr stmt = l.root_stmt(); std::ostringstream oss; @@ -1058,22 +1039,22 @@ TEST(LoopNest, ScheduleInlineSimple) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1139,31 +1120,31 @@ void InlineFunc01Helper(const std::vector& inline_order) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - Tensor* z = Compute( + Tensor z = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + y->load(m, n, k); + return x.load(m, n, k) + y.load(m, n, k); }); - LoopNest l(std::vector({z}), {x, y, z}); + LoopNest l({z}, {x, y, z}); for (const std::string& order : inline_order) { if (order == "x") { - l.computeInline(x->buf()); + l.computeInline(x.buf()); } else if (order == "y") { - l.computeInline(y->buf()); + l.computeInline(y.buf()); } else { throw std::runtime_error("Invalid order: " + order); } @@ -1218,7 +1199,7 @@ void InlineFunc01Helper(const std::vector& inline_order) { } if (inline_order.size() == 2) { - Tensor* z2 = Compute( + Tensor z2 = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -1254,21 +1235,21 @@ TEST(LoopNest, ScheduleInlineRandom) { const int N = 5; 
const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + x->load(m, n, k); + return x.load(m, n, k) + x.load(m, n, k); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1290,22 +1271,22 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + Intrinsics::make(kRand, kInt) + + return x.load(m, n, k) + Intrinsics::make(kRand, kInt) + Intrinsics::make(kRand, kInt); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. @@ -1327,18 +1308,18 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { const int N = 5; const int K = 6; - Tensor* x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { + Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m) + x->load(m); + return x.load(m) + x.load(m); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. 
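Illustrative sketch (not part of the patch): every test hunk above applies the same mechanical rewrite. 'Compute' now returns a 'Tensor' by value, so '->load()' and '->buf()' become '.load()' and '.buf()', and 'LoopNest' is constructed from brace-initialized tensor lists instead of an explicit 'std::vector'. Below is a minimal before/after sketch of the usage these tests exercise, written as it would appear inside a test body; the includes, namespaces, and the sizes M and N are assumed rather than taken from the patch.

// Old style (pointer Tensors), as removed above:
//   Tensor* x = Compute("x", {{M, "m"}, {N, "n"}}, ...);
//   Tensor* y = Compute("y", {{M, "m"}, {N, "n"}},
//       [&](const VarHandle& m, const VarHandle& n) { return x->load(m, n) + 1.0f; });
//   LoopNest l(std::vector<Tensor*>({y}), {x, y});
//   l.computeInline(x->buf());
//
// New style (value Tensors), as added above:
KernelScope kernel_scope;              // still present in the tests shown here
const int M = 4;
const int N = 5;
Placeholder a_buf("a", kFloat, {M, N});
Tensor x = Compute(
    "x", {{M, "m"}, {N, "n"}}, [&](const VarHandle& m, const VarHandle& n) {
      return a_buf.load(m, n);
    });
Tensor y = Compute(
    "y", {{M, "m"}, {N, "n"}}, [&](const VarHandle& m, const VarHandle& n) {
      return x.load(m, n) + 1.0f;      // member call on the value type
    });
LoopNest l({y}, {x, y});               // ({outputs}, {all tensors})
l.computeInline(x.buf());              // the Buf is reached via .buf()
l.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(l.root_stmt());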
@@ -1362,17 +1343,17 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { Placeholder a_buf("a", kFloat, {M, N}); Placeholder b_buf("b", kFloat, {N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return Intrinsics::make(kSqrt, x->load(m, n, k)); + return Intrinsics::make(kSqrt, x.load(m, n, k)); }); PaddedBuffer a_v(M, N); @@ -1389,9 +1370,9 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1421,21 +1402,21 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kRand, kFloat); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return Intrinsics::make(kSqrt, x->load(m, n, k)); + return Intrinsics::make(kSqrt, x.load(m, n, k)); }); - LoopNest l1(std::vector({y}), {x, y}); - l1.computeInline(x->buf()); + LoopNest l1({y}, {x, y}); + l1.computeInline(x.buf()); StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt()); @@ -1451,31 +1432,31 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { // Split a Compute then inline it into another compute. TEST(LoopNest, ScheduleSplitAThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Split a Compute then inline another Compute into it. TEST(LoopNest, ScheduleSplitBThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); LoopNest::splitWithMask(loops[0], 3); - l.computeInline(a->buf()); + l.computeInline(a.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1491,32 +1472,32 @@ TEST(LoopNest, ScheduleSplitBThenInline) { // Split a Compute twice then inline it. 
TEST(LoopNest, ScheduleSplitTwiceThenInline) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr i_inner; - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4, &i_inner); LoopNest::splitWithMask(i_inner, 2); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Inline a Compute, then split. TEST(LoopNest, ScheduleInlineThenSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); - l.computeInline(a->buf()); + LoopNest l({b}, {a, b}); + l.computeInline(a.buf()); std::vector loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.back(), 3); @@ -1534,16 +1515,16 @@ TEST(LoopNest, ScheduleInlineThenSplit) { // Split a Compute, inline it, then split the result. TEST(LoopNest, ScheduleSplitInlineThenSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - LoopNest l(std::vector({b}), {a, b}); + LoopNest l({b}, {a, b}); auto loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.back(), 2); - l.computeInline(a->buf()); + l.computeInline(a.buf()); loops = NodeFinder::find(l.root_stmt()); LoopNest::splitWithMask(loops.front(), 2); @@ -1561,35 +1542,35 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { // Oversplit a loop that is simplified out after inlining. TEST(LoopNest, ScheduleSplitInlineSimplify) { KernelScope kernel_scope; - Tensor* a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return ExprHandle(4) * i - ExprHandle(2) * i; }); - Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a->load(j) - ExprHandle(1); + Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { + return a.load(j) - ExprHandle(1); }); - LoopNest l(std::vector({b}), {a, b}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({b}, {a, b}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Inline a Compute with two consumers. 
TEST(LoopNest, ScheduleInlineThreeMixedOnce) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(a->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(a.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1607,20 +1588,20 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { // Inline Compute A into B, then inline B into C. TEST(LoopNest, ScheduleInlineThreeMixedTwice) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(a->buf()); - l.computeInline(b->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(a.buf()); + l.computeInline(b.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1638,19 +1619,19 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { // Inline a Compute that is both a producer and consumer. TEST(LoopNest, ScheduleInlineThreeMixedInner) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); - l.computeInline(b->buf()); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); + l.computeInline(b.buf()); l.prepareForCodegen(); StmtPtr s = IRSimplifier::simplify(l.root_stmt()); @@ -1668,25 +1649,25 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { // Split 3 Computes, then inline the first two into the last. 
TEST(LoopNest, ScheduleInlineThreeMixedSplit) { KernelScope kernel_scope; - Tensor* a = + Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a->load(j + ExprHandle(8)); + Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { + return a.load(j + ExprHandle(8)); }); - Tensor* c = Compute( + Tensor c = Compute( "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a->load(k) * b->load(l); + return a.load(k) * b.load(l); }); - LoopNest l(std::vector({c}), {a, b, c}); - std::vector loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0); + LoopNest l({c}, {a, b, c}); + std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); LoopNest::splitWithMask(loops[0], 4); - loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); LoopNest::splitWithMask(loops[0], 3); - loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::splitWithMask(loops[0], 2); - ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); + ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices"); } // Check that inlining works for output tensors too @@ -1696,21 +1677,21 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { const int N = 5; const int K = 6; - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + m; + return x.load(m, n, k) + m; }); LoopNest l1({x, y}); - l1.computeInline(x->buf()); + l1.computeInline(x.buf()); // would normally compare results but Rand isn't implemented in the // SimpleIREvaluator, even if we could seed it. 
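The inlining tests around this point keep working through either handle: 'computeInline' can be given the producer's buffer or the producer's loop body, and with the value-type 'Tensor' only the accessor spelling changes. A short illustrative sketch of that pattern follows; the names, sizes, and includes are assumptions for the example, not taken from the patch.

KernelScope kernel_scope;
const int kTotalSize = 8;
Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
  return a_buf.load(i) + 1.0f;
});
Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
  return e.load(i) * 2.0f;             // consumer reads the producer through the value handle
});
LoopNest l({f}, {e, f});
l.computeInline(e.buf());              // inline by buffer (previously e->buf())
// The tests also use the overload taking the loop body:
//   l.computeInline(l.getLoopBodyFor(e));
l.prepareForCodegen();
StmtPtr s = IRSimplifier::simplify(l.root_stmt());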
@@ -1736,14 +1717,14 @@ TEST(LoopNest, ScheduleFuserStyle) { Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* b = Compute( + Tensor b = Compute( "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { return a_buf.load(axes[0]) + 11.0f; }); - Tensor* c = Compute( + Tensor c = Compute( "g", {{kTotalSize, "i"}}, [&](const std::vector& axes) { - return b->load(axes[0]) + 1.0f; + return b.load(axes[0]) + 1.0f; }); LoopNest l({b, c}); @@ -1772,17 +1753,17 @@ TEST(LoopNest, ScheduleFuserThreeArg) { Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); - Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return e->load(i) + c.load(i); + Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return e.load(i) + c.load(i); }); - Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return f->load(i) + d.load(i); + Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return f.load(i) + d.load(i); }); - LoopNest l(std::vector({g}), {e, f, g}); + LoopNest l({g}, {e, f, g}); l.computeInline(l.getLoopBodyFor(e)); l.computeInline(l.getLoopBodyFor(f)); l.prepareForCodegen(); @@ -1807,7 +1788,7 @@ TEST(LoopNest, ScheduleDynamicShape2D) { VarHandle n("n", kInt); Placeholder a(BufHandle("a", {m, n}, kFloat)); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); @@ -1842,12 +1823,12 @@ TEST(LoopNest, LoopNestComputeAt_1) { // and the temp should be used in B. 
KernelScope kernel_scope; VarHandle N("N", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); - Tensor* B = Compute( - "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); }); - LoopNest l(std::vector({B}), {A, B}); - std::vector loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0); + Tensor B = Compute( + "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); }); + LoopNest l({B}, {A, B}); + std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1891,16 +1872,16 @@ TEST(LoopNest, LoopNestComputeAt_2) { const int kW = 16, kH = 16; VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* p = Compute( + Tensor p = Compute( "prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](const VarHandle& py, const VarHandle& px) { return px * py; }); - Tensor* c = Compute( + Tensor c = Compute( "cons", {{H, "cy"}, {W, "cx"}}, [&](const VarHandle& y, const VarHandle& x) { - return p->load(y, x) + p->load(y + 1, x) + p->load(y, x + 1) + - p->load(y + 1, x + 1); + return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) + + p.load(y + 1, x + 1); }); std::vector c_ref(kW * kH, 0); @@ -1909,12 +1890,12 @@ TEST(LoopNest, LoopNestComputeAt_2) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest(std::vector({c}), {p, c}); + LoopNest orig_loopnest({c}, {p, c}); { // First let's try to compute P at axis cy (the outer loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1940,7 +1921,7 @@ TEST(LoopNest, LoopNestComputeAt_2) { { // Now let's try to compute P at axis cx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -1979,27 +1960,25 @@ TEST(LoopNest, LoopNestComputeAt_3) { const int kW = 16, kH = 16; VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{H + 1, "ay"}, {W + 1, "ax"}}, [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{H + 1, "by"}, {W + 1, "bx"}}, - [&](const VarHandle& by, const VarHandle& bx) { - return A->load(by, bx); - }); - Tensor* C = Compute( + [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); }); + Tensor C = Compute( "C", {{H, "cy"}, {W, "cx"}}, [&](const VarHandle& cy, const VarHandle& cx) { - return B->load(cy, cx + 1); + return B.load(cy, cx + 1); }); - Tensor* D = Compute( + Tensor D = Compute( "D", {{H, "dy"}, {W, "dx"}}, [&](const VarHandle& dy, const VarHandle& dx) { - return A->load(dy + 1, dx) + C->load(dy, dx); + return A.load(dy + 1, dx) + C.load(dy, dx); }); std::vector c_ref(kW * kH, 0); @@ -2009,11 +1988,11 @@ TEST(LoopNest, LoopNestComputeAt_3) { } } - LoopNest orig_loopnest(std::vector({D}), {A, B, C, D}); + LoopNest orig_loopnest({D}, {A, B, C, D}); { // First let's try to compute A at axis dy (the outer loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0); + std::vector loops = 
l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -2044,7 +2023,7 @@ TEST(LoopNest, LoopNestComputeAt_3) { { // Now let's try to compute A at axis dx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]); l.prepareForCodegen(); StmtPtr s = l.root_stmt(); @@ -2083,15 +2062,15 @@ TEST(LoopNest, Reduce2dComputeAt) { VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor* p = + Tensor p = Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) { return px * py; }); - Tensor* c = Reduce( + Tensor c = Reduce( "cons", {{H, "cy"}, {W, "cx"}}, Sum(), - [&](Axis y, Axis x, Axis r, Axis s) { return p->load(y + r, x + s); }, + [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); }, {{2, "r"}, {2, "s"}}); std::vector c_ref(kW * kH, 0); @@ -2100,7 +2079,7 @@ TEST(LoopNest, Reduce2dComputeAt) { c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1); } } - LoopNest orig_loopnest(std::vector({c}), {p, c}); + LoopNest orig_loopnest({c}, {p, c}); checkIR(orig_loopnest.root_stmt(), R"IR( # CHECK: for (int py = 0; py < H + 1; py++) { # CHECK: for (int px = 0; px < W + 1; px++) { @@ -2122,7 +2101,7 @@ TEST(LoopNest, Reduce2dComputeAt) { { // First let's try to compute P at axis cy (the outer loop) LoopNest l(orig_loopnest); - auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]); // FIXME: Calling simplify here breaks the IR: // MALFORMED INPUT: could not find base node in Load - temp[...] @@ -2159,7 +2138,7 @@ TEST(LoopNest, Reduce2dComputeAt) { { // Now let's try to compute P at axis cx (the inner loop) LoopNest l(orig_loopnest); - std::vector loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]); l.simplify(); l.eliminateDeadStores(); @@ -2205,17 +2184,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { int Pad = 1; Placeholder IP("input", kFloat, {H}); - Tensor* A = + Tensor A = Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) { auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); }); - Tensor* B = Reduce( + Tensor B = Reduce( "B", {{N, "n"}, {H, "h"}}, Sum(), - [&](Axis n, Axis h, Axis r) { return A->load(n, h + r); }, + [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); }, {{R, "r"}}); LoopNest l({B}); checkIR(l.root_stmt(), R"IR( @@ -2233,7 +2212,7 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { # CHECK: } # CHECK: } )IR"); - std::vector loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0); + std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); // FIXME: The current IR is totally broken. 
The body of the inlined loop is: @@ -2292,7 +2271,7 @@ class LoopOrderHelper : public IRVisitor { TEST(LoopNest, LoopNestReorderAxis1) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); @@ -2303,7 +2282,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2324,7 +2303,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { } // Reorder them back. - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); StmtPtr stmt3 = l.root_stmt(); @@ -2341,7 +2320,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { TEST(LoopNest, LoopNestReorderPartialAxes) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { @@ -2358,7 +2337,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,"); @@ -2372,7 +2351,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { ASSERT_EQ(stmt1_output[i], stmt2_output[i]); } - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[2]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,"); @@ -2389,7 +2368,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { TEST(LoopNest, LoopNestReorderInternalAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& w, @@ -2409,7 +2388,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[2], loops[1]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,"); @@ -2426,7 +2405,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { TEST(LoopNest, LoopNestReorderEnclosingAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& w, @@ -2445,7 +2424,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { SimpleIREvaluator cg(stmt1, {tensor}); cg.call({stmt1_output}); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[3]); ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,"); @@ -2462,14 +2441,14 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { TEST(LoopNest, LoopNestReorderSameAxis) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + 
cast(y) * y; }); LoopNest l({tensor}); StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[1]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2492,7 +2471,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{2, "x"}, {3, "y"}, {4, "z"}}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { @@ -2503,7 +2482,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { Placeholder extra(BufHandle("res", {6, 3}, kFloat)); - auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); VarHandle i = VarHandle(loops[0]->var()); @@ -2589,7 +2568,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { * * */ - loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[2]); StmtPtr stmt3 = Stmt::clone(l.root_stmt()); @@ -2628,7 +2607,7 @@ void LoopNestReorderTestHelper( int index2) { KernelScope kernel_scope; - Tensor* c = Compute( + Tensor c = Compute( "5d", {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, [](const std::vector&) { return -1; }); @@ -2636,7 +2615,7 @@ void LoopNestReorderTestHelper( Placeholder extra(BufHandle("extra", {5}, kInt)); - auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); int j = 0; for (auto l : loops) { // Add an increment at each layer of the loop which counts the number of @@ -2677,7 +2656,7 @@ void LoopNestReorderTestHelper( ASSERT_EQ(extra1[i], expected_loops); } - loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0); + loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0); LoopNest::reorderAxis(loops[index1], loops[index2]); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2752,26 +2731,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { Placeholder c_buf("c", kFloat, {M, N}); Placeholder d_buf("d", kFloat, {M, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); - Tensor* y = Compute( + Tensor y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); - Tensor* z = Compute( + Tensor z = Compute( "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return x->load(m, n, k) + y->load(m, n, k); + return x.load(m, n, k) + y.load(m, n, k); }); - LoopNest l(std::vector({z}), {x, y, z}); + LoopNest l({z}, {x, y, z}); ForPtr a = nullptr; ForPtr b = nullptr; auto fors = NodeFinder::find(l.root_stmt()); @@ -2845,14 +2824,14 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { TEST(LoopNest, OuterLoopVectorization) { KernelScope kernel_scope; - Tensor* tensor = Compute( + Tensor tensor = Compute( "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); ASSERT_TRUE( - LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor->buf())[0][0])); + 
LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor.buf())[0][0])); StmtPtr root_stmt = l.root_stmt(); BlockPtr outer_block = to(root_stmt); @@ -2899,10 +2878,10 @@ namespace { std::string constantUpperBoundLoopIR(int upper_bound_val) { KernelScope kernel_scope; ExprHandle upper_bound(upper_bound_val); - Tensor* A = Compute( + Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll(loops[0], &unrolled); std::ostringstream oss; @@ -2927,12 +2906,12 @@ TEST(LoopNest, UnrollOuter) { KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); - Tensor* A = Compute( + Tensor A = Compute( "A", {{outer_bound, "x"}, {inner_bound, "y"}}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll(loops[0], &unrolled); checkIR(unrolled, R"IR( @@ -2951,12 +2930,12 @@ TEST(LoopNest, UnrollInner) { KernelScope kernel_scope; ExprHandle outer_bound(3); ExprHandle inner_bound(4); - Tensor* A = Compute( + Tensor A = Compute( "A", {{outer_bound, "x"}, {inner_bound, "y"}}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::unroll( static_to(loops[0]->body()->stmts().front()), &unrolled); @@ -3044,10 +3023,10 @@ TEST(LoopNest, UnrollEmpty) { TEST(LoopNest, NoUnroll) { KernelScope kernel_scope; VarHandle upper_bound("N", kInt); - Tensor* A = Compute( + Tensor A = Compute( "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); - std::vector loops = l.getAllLoopNestsWritingToBuf(A->buf())[0]; + std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; ASSERT_THROWS_WITH( LoopNest::unroll(loops[0], &unrolled), "non-constant loop"); @@ -3326,7 +3305,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. 
ExprHandle n(100); Placeholder a(BufHandle("a", {n}, kFloat)); - Tensor* b = + Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); @@ -3615,12 +3594,12 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) { VarHandle m("m", kInt); VarHandle n("n", kInt); Placeholder b(BufHandle("b", {m, n}, kFloat)); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); LoopNest loop({c}); HashProvider hasher; auto hash_before = hasher.hash(loop.root_stmt()); - auto loops = loop.getAllLoopNestsWritingToBuf(c->buf())[1]; + auto loops = loop.getAllLoopNestsWritingToBuf(c.buf())[1]; ForPtr flattened = nullptr; ASSERT_FALSE(LoopNest::flatten(loops, &flattened)); ASSERT_EQ(flattened, nullptr); @@ -3673,14 +3652,14 @@ TEST(LoopNest, DetectInlineRankMismatch) { const int kTotalSize = 8; Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a_buf.load(i); }); - Tensor* reshape = Compute( + Tensor reshape = Compute( "reshape", {{kTotalSize / 2, "i"}, {2, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); }); - LoopNest l(std::vector({reshape}), {a, reshape}); + [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); }); + LoopNest l({reshape}, {a, reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), "Placeholder indexed access is inconsistent with its rank"); @@ -3689,22 +3668,22 @@ TEST(LoopNest, DetectInlineRankMismatch) { TEST(LoopNest, CacheReadsSimple) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 3); + return A.load(i + 30, j + 3); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3757,22 +3736,22 @@ TEST(LoopNest, CacheReadsSimple) { TEST(LoopNest, CacheReadsOuter) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0]; - 
LoopNest::cacheAccesses(A->buf(), "A_local", i_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][0]; + LoopNest::cacheAccesses(A.buf(), "A_local", i_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3805,22 +3784,22 @@ TEST(LoopNest, CacheReadsOuter) { TEST(LoopNest, CacheReadsInternal) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", j_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; + LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3852,23 +3831,23 @@ TEST(LoopNest, CacheReadsInternal) { TEST(LoopNest, CacheReadsInner) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); // note im changing the offset of the first arg of the first call to A. - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 34, j + 40) + A->load(i + 30, j + 41); + return A.load(i + 34, j + 40) + A.load(i + 30, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); + LoopNest l({B, C}, {A, B, C}); StmtPtr body = l.getLoopBodyFor(B); - LoopNest::cacheAccesses(A->buf(), "A_local", body); + LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -3900,22 +3879,22 @@ TEST(LoopNest, CacheReadsInner) { TEST(LoopNest, CacheWritesSimple) { KernelScope kernel_scope; - Tensor* A = Compute( + Tensor A = Compute( "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i * j; }); - Tensor* B = Compute( + Tensor B = Compute( "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 30, j + 40) + A->load(i + 31, j + 41); + return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor* C = Compute( + Tensor C = Compute( "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i + 10, j + 20) + A->load(i + 30, j + 40); + return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); - LoopNest l(std::vector({B, C}), {A, B, C}); - StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1]; - LoopNest::cacheAccesses(A->buf(), "A_local", a_loop); + LoopNest l({B, C}, {A, B, C}); + StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A.buf())[0][1]; + 
LoopNest::cacheAccesses(A.buf(), "A_local", a_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -4054,7 +4033,7 @@ TEST(LoopNest, CompoundTensorSimple) { auto outer_for2 = For::make(x, 0, 10, inner_for2); BlockPtr body = Block::make({outer_for1, outer_for2}); - Tensor* A = new Tensor(a_buf.node(), body); + Tensor A = Tensor(a_buf.node(), body); LoopNest l({A}); l.prepareForCodegen(); @@ -4080,22 +4059,22 @@ TEST(LoopNest, InlineConstantIndex) { KernelScope kernel_scope; const int N = 10; Placeholder x_buf("a", kFloat, {1, N, 1}); - Tensor* y = Compute( + Tensor y = Compute( "f", {{1, "m"}, {N, "n"}, {1, "o"}}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return x_buf.load(m, n, o); }); - Tensor* z = Compute( + Tensor z = Compute( "f", {{1, "m"}, {N, "n"}, {1, "o"}}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { - return y->load(m, n, o); + return y.load(m, n, o); }); - LoopNest l(std::vector({z}), {y, z}); + LoopNest l({z}, {y, z}); l.simplify(); - ASSERT_TRUE(l.computeInline(y->buf())); + ASSERT_TRUE(l.computeInline(y.buf())); } TEST(LoopNest, CompoundTensorUsed) { @@ -4115,14 +4094,14 @@ TEST(LoopNest, CompoundTensorUsed) { auto outer_for2 = For::make(x, 0, 10, inner_for2); BlockPtr body = Block::make({outer_for1, outer_for2}); - Tensor* A = new Tensor(a_buf.node(), body); - Tensor* B = Compute( + Tensor A = Tensor(a_buf.node(), body); + Tensor B = Compute( "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A->load(i, j + 1) + A->load(i, j + 2); + return A.load(i, j + 1) + A.load(i, j + 2); }); - LoopNest l(std::vector({B}), {A, B}); - ASSERT_FALSE(l.computeInline(A->buf())); + LoopNest l({B}, {A, B}); + ASSERT_FALSE(l.computeInline(A.buf())); l.prepareForCodegen(); std::vector a_data(50, 0); @@ -4707,12 +4686,10 @@ TEST(LoopNest, OptimizeConditionalsNotNormalized) { ASSERT_EQ(hash_before, hash_after); } -static std::pair, Tensor*> colReduce( - int M, - int N) { +static std::pair, Tensor> colReduce(int M, int N) { auto a = std::make_unique("a", kFloat, std::vector{M, N}); - Tensor* t = Reduce( + Tensor t = Reduce( "b", {{N, "n"}}, Sum(), @@ -4721,10 +4698,10 @@ static std::pair, Tensor*> colReduce( return {std::move(a), t}; } -static StmtPtr splitTailReorder(Tensor* b) { +static StmtPtr splitTailReorder(Tensor b) { constexpr int kVectorWidth = 8; LoopNest nest({b}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0]; + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; nest.splitWithTail(loops[0], kVectorWidth); // Now the loopnests will look like: // @@ -4745,24 +4722,24 @@ static StmtPtr splitTailReorder(Tensor* b) { // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)" // Loopnest #2: {n_outer, n_inner, m}; // We will have to reorder n_inner and m. 
- auto loopnests = nest.getAllLoopNestsWritingToBuf(b->buf()); + auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf()); LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]); nest.prepareForCodegen(); return nest.root_stmt(); } -static StmtPtr splitMaskReorder(Tensor* b) { +static StmtPtr splitMaskReorder(Tensor b) { constexpr int kVectorWidth = 8; LoopNest nest({b}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1]; + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1]; nest.splitWithMask(loops[0], kVectorWidth); - loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1]; + loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1]; LoopNest::reorderAxis(loops[1], loops[2]); nest.prepareForCodegen(); return nest.root_stmt(); } -static void checkColReduce(StmtPtr s, Placeholder& p, Tensor* t) { +static void checkColReduce(StmtPtr s, Placeholder& p, Tensor t) { int M = immediateAs(p.dim(0)); int N = immediateAs(p.dim(1)); PaddedBuffer a(M, N); @@ -4893,14 +4870,14 @@ TEST(LoopNest, VectorizeUse) { KernelScope kernel_scope; constexpr int N = 8; Placeholder a("a", kFloat, {N}); - Tensor* b = Compute( + Tensor b = Compute( "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); - Tensor* c = Compute( - "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; }); - LoopNest nest(std::vector({c}), {b, c}); - auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0]; + Tensor c = Compute( + "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); + LoopNest nest({c}, {b, c}); + auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); - loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0]; + loops = nest.getAllLoopNestsWritingToBuf(c.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); nest.prepareForCodegen(); // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) @@ -4939,7 +4916,7 @@ TEST(LoopNest, Int64Compute) { constexpr int64_t N = 12; Placeholder a("a", kLong, {N}); - Tensor* b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { + Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + LongImm::make(1l); }); LoopNest nest({b}); diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index 9503f9d..db37b66 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -2726,28 +2726,28 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { // Can determine if 2 loops created by Compute are dependent. Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); + LoopNest l({d}, {c, d}); - MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); + MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. 
- ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data())); // Second loop depends on first loop. auto c_loop = l.getLoopStmtsFor(c)[0]; @@ -2773,32 +2773,32 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); - Tensor* d = Compute( + Tensor d = Compute( "d", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->load(m, n, k) + 1; + return c.load(m, n, k) + 1; }); - LoopNest l(std::vector({d}), {c, d}); - l.computeInline(c->buf()); + LoopNest l({d}, {c, d}); + l.computeInline(c.buf()); - MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()}); + MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data())); // broadcast_add tensor should not appear in trace at all. for (auto& wi : analyzer.getHistory()) { - ASSERT_NE(wi->var(), c->buf()->base_handle()); + ASSERT_NE(wi->var(), c.buf()->base_handle()); } } @@ -2810,7 +2810,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -2819,13 +2819,12 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { LoopNest l({c}); - MemDependencyChecker analyzer_before( - {a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()}); l.root_stmt()->accept(&analyzer_before); l.splitWithTail(l.getLoopStmtsFor(c)[0], 2); - MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()}); StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); stmt->accept(&analyzer_after); @@ -2859,7 +2858,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { Placeholder a_buf("a", kFloat, {4, 5}); Placeholder b_buf("b", kFloat, {5, 6}); - Tensor* c = Compute( + Tensor c = Compute( "broadcast_add", {{4, "m"}, {5, "n"}, {6, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -2868,14 +2867,13 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { LoopNest l({c}); - MemDependencyChecker analyzer_before( - {a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()}); l.root_stmt()->accept(&analyzer_before); auto loops = l.getLoopStmtsFor(c); l.reorderAxis(loops[0], loops[1]); - MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()}); + MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()}); StmtPtr stmt = 
IRSimplifier::simplify(l.root_stmt()); stmt->accept(&analyzer_after); @@ -2928,22 +2926,22 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { Placeholder a(BufHandle("a", {2, 3, 6}, kFloat)); Placeholder b(BufHandle("b", {2, 3, 6}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{2, "l2"}, {3, "n1"}, {6, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); - LoopNest l(std::vector({d}), {c, d}); + Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); + LoopNest l({d}, {c, d}); - MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()}); + MemDependencyChecker analyzer({a.data(), b.data()}, {d.buf()}); l.root_stmt()->accept(&analyzer); // Sanity test: Output depends on input. - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a.data())); - ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a.data())); + ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b.data())); // Second loop depends on first loop. auto c_loop = l.getLoopStmtsFor(c)[0]; @@ -2965,7 +2963,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { Placeholder AP(BufHandle("A", {M, K}, kFloat)); Placeholder BP(BufHandle("B", {K, N}, kFloat)); - Tensor* CT = Reduce( + Tensor CT = Reduce( "gemm", {{M, "M"}, {N, "N"}}, Sum(), @@ -3011,7 +3009,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { } { auto const& loops = loop.getLoopStmtsFor(CT); - loop.cacheAccesses(CT->buf(), "C_regs", loops[2]); + loop.cacheAccesses(CT.buf(), "C_regs", loops[2]); } MemDependencyChecker analyzer_unlowered( @@ -3026,12 +3024,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { stmt->accept(&analyzer_unlowered); // Outputs depend on inputs. - ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), AP.data())); - ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), BP.data())); + ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), AP.data())); + ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), BP.data())); // The last write to gemm should cover the total bound of the output. std::shared_ptr outputAccess = - analyzer_unlowered.output(CT->buf()); + analyzer_unlowered.output(CT.buf()); // A single dependency. 
ASSERT_EQ(outputAccess->dependencies().size(), 1); diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp index 674dbd9..122a498 100644 --- a/test/cpp/tensorexpr/test_ops.cpp +++ b/test/cpp/tensorexpr/test_ops.cpp @@ -6,7 +6,7 @@ using namespace torch::jit::tensorexpr; -using Tensors = std::vector; +using Tensors = std::vector; using Args = std::vector; std::unique_ptr compile( const Args& inputs, @@ -28,7 +28,7 @@ TEST(Ops, Sum) { constexpr int N = 16; Placeholder a("a", kFloat, {M, N}); - Tensor* b = computeSum({a.handle(), dims, false}, c10::kFloat); + Tensor b = computeSum({a.handle(), dims, false}, c10::kFloat); auto cg = compile({a}, {b}); auto at = at::arange(M * N, at::kFloat).view({M, N}); diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 449edac..6620ef2 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -59,7 +59,7 @@ TEST(Reductions, ReduceSum0D_2) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {}); + Tensor c = Reduce("sum", {}, Sum(), b, {}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -83,7 +83,7 @@ TEST(Reductions, ReduceSum1D) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -114,7 +114,7 @@ TEST(Reductions, ReduceSum2D) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -145,7 +145,7 @@ TEST(Reductions, ReduceSum3D) { Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); + Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -175,7 +175,7 @@ TEST(Reductions, ReduceSum3D) { ASSERT_EQ(cData[i], expected); } - Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); + Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); LoopNest loop2({d}); loop2.prepareForCodegen(); StmtPtr s2 = loop2.root_stmt(); @@ -192,8 +192,8 @@ TEST(Reductions, ReduceSum3D) { } // This is the same as just reducing the original result across that axis. 
- Placeholder c_buf(BufHandle(c->buf())); - Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); + Placeholder c_buf(BufHandle(c.buf())); + Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); LoopNest loop3({e}); loop3.prepareForCodegen(); StmtPtr s3 = loop3.root_stmt(); @@ -219,7 +219,7 @@ TEST(Reductions, ReduceSum10D) { std::vector in(InputSize, 1.f); std::vector out(OutputSize, -1.f); - Tensor* c = Reduce( + Tensor c = Reduce( "sum", {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, Sum(), @@ -261,7 +261,7 @@ TEST(Reductions, ReduceProduct) { Reducer product( ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); - Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); + Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -294,7 +294,7 @@ TEST(Reductions, ReduceMax) { in[j] = j; } - Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); + Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); LoopNest loop({dm1}); loop.prepareForCodegen(); @@ -309,7 +309,7 @@ TEST(Reductions, ReduceMax) { Placeholder in2_(BufHandle("b", {2, 5}, kFloat)); std::vector out2(2, -1.f); - Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); + Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); LoopNest loop2({m2d}); loop2.prepareForCodegen(); @@ -336,7 +336,7 @@ TEST(Reductions, ReduceMinCustomInitializer) { in[j] = 10 + j; } - Tensor* min = Reduce( + Tensor min = Reduce( "min", {}, Minimum(ExprHandle(minInit)), @@ -372,7 +372,7 @@ TEST(Reductions, ReduceAnyAll) { return CompareSelect::make(a, 1, 1, b, kEQ); }); - Tensor* any = Reduce( + Tensor any = Reduce( "anyEqual", {{4, "i"}}, anyEqSV, @@ -415,7 +415,7 @@ TEST(Reductions, ReduceAnyAll) { return CompareSelect::make(a, 0, 0, b, kEQ); }); - Tensor* allGreaterThan = Reduce( + Tensor allGreaterThan = Reduce( "allGreaterThan", {{4, "i"}}, allGTSV, @@ -465,7 +465,7 @@ TEST(Reductions, ReduceMatmul2D) { } } - Tensor* mm = Reduce( + Tensor mm = Reduce( "mm", {{3, "m"}, {3, "n"}}, Sum(), @@ -501,10 +501,10 @@ TEST(Reductions, ReduceRfactorLike) { std::vector in_rf_(10, -2.f); std::vector out(1, -1.f); - Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); - Placeholder in_rf(BufHandle(l1->buf())); + Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); + Placeholder in_rf(BufHandle(l1.buf())); - Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); + Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); LoopNest loop({l1, l2}); loop.prepareForCodegen(); @@ -526,14 +526,14 @@ TEST(Reductions, ReduceAsProducer) { Placeholder a(BufHandle("a", {2, 3}, kFloat)); Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); - Tensor* d = Compute( + Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); + Tensor d = Compute( "scale", {{2, "l2"}, {3, "n1"}}, [&](const VarHandle& l, const VarHandle& n) { - return c->load(l, n) * a.load(l, n); + return c.load(l, n) * a.load(l, n); }); - LoopNest loop(std::vector({d}), {c, d}); + LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -571,14 +571,14 @@ TEST(Reductions, ReduceAsConsumer) { Placeholder a(BufHandle("a", {2, 3, m}, kFloat)); Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", 
{{2, "l2"}, {3, "n1"}, {m, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); - LoopNest loop(std::vector({d}), {c, d}); + Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); + LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); s = IRSimplifier::simplify(s); @@ -626,7 +626,7 @@ TEST(Reductions, SplitReduceAxis) { } std::vector out(16, -1.f); - Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[1], 2); @@ -656,7 +656,7 @@ TEST(Reductions, SplitNonReduceAxis) { } } std::vector out(16, -1.f); - Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[0], 2); @@ -687,14 +687,14 @@ TEST(Reductions, ReorderedReductionInitializer) { Placeholder in(BufHandle("in", {1, 12, 6}, kFloat)); std::vector in_(12 * 6, 1.f); - Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); LoopNest l_({tensor_}); l_.prepareForCodegen(); StmtPtr s_ = Stmt::clone(l_.root_stmt()); s_ = IRSimplifier::simplify(s_); - Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); LoopNest l({tensor}); auto loops = l.getLoopStmtsFor(tensor); @@ -741,10 +741,10 @@ TEST(Reductions, ReduceRfactor) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); @@ -776,10 +776,10 @@ TEST(Reductions, Reduce3DRfactorInner) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_FALSE(loop.rfactor(c_body, loops.at(2))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 1); @@ -811,10 +811,10 @@ TEST(Reductions, Reduce3DRfactorOuter) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); auto rc = NodeFinder::find(loop.root_stmt()); ASSERT_EQ(rc.size(), 2); @@ -837,7 +837,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { std::vector out(1, -1.f); std::vector ref(1, -1.f); - Tensor* c = Reduce( + Tensor c = Reduce( "sum", {}, Sum(), @@ -854,7 +854,7 @@ 
TEST(Reductions, ReduceRepeatedInternalRfactor) { IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); ref_cg.call({in, ref}); - BufPtr tmp_buf = c->buf(); + BufPtr tmp_buf = c.buf(); for (int idx = 0; idx < rfac_number; idx++) { auto reduce = loop.getAllWritesToBuf(tmp_buf)[1]; @@ -890,7 +890,7 @@ TEST(Reductions, ReduceSplitTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 8); @@ -922,7 +922,7 @@ TEST(Reductions, ReduceSplitNoTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 5); @@ -956,7 +956,7 @@ TEST(Reductions, ReduceOverSplitTail) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 16); @@ -989,7 +989,7 @@ TEST(Reductions, ReduceSplitMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 8); @@ -1021,7 +1021,7 @@ TEST(Reductions, ReduceSplitNoMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 5); @@ -1054,7 +1054,7 @@ TEST(Reductions, ReduceOverSplitMask) { for (int i = 0; i < 3; ++i) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 16); @@ -1090,16 +1090,16 @@ TEST(Reductions, ReduceSplitRfactor) { std::vector out(M, -1.f); - Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[2], SPLIT_FACTOR); - auto c_body = loop.getAllWritesToBuf(c->buf())[2]; - auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + auto c_body = loop.getAllWritesToBuf(c.buf())[2]; + auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); LoopNest::reorderAxis(all_loops[2][1], all_loops[2][2]); - all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); ASSERT_TRUE(loop.rfactor(c_body, all_loops[2][1])); loop.prepareForCodegen(); @@ -1131,7 +1131,7 @@ TEST(Reductions, ReduceOverSplitRfactor) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), 
b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -1139,9 +1139,9 @@ TEST(Reductions, ReduceOverSplitRfactor) { LoopNest::splitWithTail(loops[1], SPLIT_FACTOR, &i, &t); LoopNest::reorderAxis(loops[0], i); - auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf()); + auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(1).size() == 3); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; ASSERT_TRUE(loop.rfactor(c_body, all_loops[1][0])); LoopNest::reorderAxis(all_loops[1][0], all_loops[1][2]); @@ -1182,9 +1182,9 @@ TEST(Reductions, ReduceInlineReduction) { Placeholder a_buf("a", kFloat, {M}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); - Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf.load(m) + x->load(m); + Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); + Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { + return a_buf.load(m) + x.load(m); }); PaddedBuffer a_v(M); @@ -1201,9 +1201,9 @@ TEST(Reductions, ReduceInlineReduction) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); // Cannot inline a reduction computation - ASSERT_FALSE(l1.computeInline(x->buf())); + ASSERT_FALSE(l1.computeInline(x.buf())); } TEST(Reductions, ReduceInlineConsumer) { @@ -1215,13 +1215,13 @@ TEST(Reductions, ReduceInlineConsumer) { Placeholder a_buf("a", kFloat, {M, N, K}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); - Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1235,9 +1235,9 @@ TEST(Reductions, ReduceInlineConsumer) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1269,7 +1269,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Placeholder a_buf("a", kFloat, {M, N, K}); Placeholder b_buf("b", kFloat, {M, N, K}); - Tensor* x = Compute( + Tensor x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { @@ -1279,7 +1279,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { return Add::make(ExprHandle(1.f), Min::make(a, b, false)); }); - Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1293,9 +1293,9 @@ TEST(Reductions, ReduceInlineReducerInternal) { } } - LoopNest l1(std::vector({y}), {x, y}); + LoopNest l1({y}, {x, y}); LoopNest l2(l1); - l2.computeInline(x->buf()); + l2.computeInline(x.buf()); l1.prepareForCodegen(); l2.prepareForCodegen(); @@ -1328,25 +1328,25 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder 
b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[0]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1405,25 +1405,25 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1480,25 +1480,25 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { Placeholder a(BufHandle("a", {L, N, M}, kFloat)); Placeholder b(BufHandle("b", {L, N, M}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{L, "l2"}, {N, "n1"}, {M, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); - Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[2]; - l.cacheAccesses(d->buf(), "d_local", d_loop); + l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1551,22 +1551,22 @@ TEST(Reductions, ReductionCacheBodyAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", 
{{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; - l.cacheAccesses(c->buf(), "scale_local", d_loop); + l.cacheAccesses(c.buf(), "scale_local", d_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1592,24 +1592,24 @@ TEST(Reductions, ReductionCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4); StmtPtr e_loop = l.getLoopStmtsFor(e)[1]; - l.cacheAccesses(d->buf(), "sum_local", e_loop); + l.cacheAccesses(d.buf(), "sum_local", e_loop); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1633,19 +1633,19 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; @@ -1656,7 +1656,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { // Split reduction consumer. 
LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); - l.cacheAccesses(d->buf(), "sum_local", inner); + l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1681,19 +1681,19 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { Placeholder a(BufHandle("a", {24, 32, 12}, kFloat)); Placeholder b(BufHandle("b", {24, 32, 12}, kFloat)); - Tensor* c = Compute( + Tensor c = Compute( "scale", {{24, "l2"}, {32, "n1"}, {12, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); - Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { - return b.load(0, 0, l) * d->load(l); + Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + return b.load(0, 0, l) * d.load(l); }); - LoopNest l(std::vector({e}), {c, d, e}); + LoopNest l({e}, {c, d, e}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr inner; @@ -1705,7 +1705,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { // Split reduction consumer. LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); - l.cacheAccesses(d->buf(), "sum_local", inner); + l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); StmtPtr result = IRSimplifier::simplify(l.root_stmt()); @@ -1742,13 +1742,13 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::reorderAxis(loops.at(0), loops.at(1)); loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) BufPtr rfac_buf; ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf)); @@ -1811,10 +1811,10 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { std::vector out(1, -1.f); - Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); - auto c_body = loop.getAllWritesToBuf(c->buf())[1]; + auto c_body = loop.getAllWritesToBuf(c.buf())[1]; LoopNest::reorderAxis(loops.at(0), loops.at(1)); loops = loop.getLoopStmtsFor(c); @@ -1871,7 +1871,7 @@ TEST(Reductions, ReductionVectorize) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l_before({tensor}); LoopNest l(l_before); l_before.prepareForCodegen(); @@ -1909,7 +1909,7 @@ TEST(Reductions, ReductionVectorizeInner) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); LoopNest l({tensor}); ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1])); @@ -1929,7 +1929,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { Placeholder in(BufHandle("in", {8, 8}, kFloat)); - Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); + Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); LoopNest 
l_before({tensor}); LoopNest l(l_before); @@ -1944,7 +1944,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::reorderAxis(loops[0], loops[1]); loops = l.getLoopStmtsFor(tensor); - auto tensor_body = l.getAllWritesToBuf(tensor->buf())[1]; + auto tensor_body = l.getAllWritesToBuf(tensor.buf())[1]; BufPtr rfac_buf = nullptr; ASSERT_TRUE(LoopNest::rfactor(tensor_body, loops.at(0), &rfac_buf)); @@ -1988,7 +1988,7 @@ TEST(Reductions, InitFunction) { constexpr int N = 16; Placeholder A("A", kFloat, {M, N}); Placeholder B("B", kFloat, {N}); - Tensor* C = Reduce( + Tensor C = Reduce( "C", {{N, "n"}}, Sum(), diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index c25ae4f..0df9e92 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3932,7 +3932,7 @@ TEST(Simplify, SimplifyForCleansUp) { { Placeholder a("a", kFloat, {1, 12, 1}); VarHandle x("x", kInt); - Tensor* b = Compute( + Tensor b = Compute( // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) "x", {{1, "i"}, {12, "m"}, {1, "n"}}, diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 5a6f257..0f0277e 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -155,8 +155,8 @@ int main(int argc, char* argv[]) { ExprPtr body = alloc(i, j); // Finally, we pass all these pieces together to Tensor constructor: - Tensor* X = new Tensor(buf, args, body); - std::cout << "Tensor computation: " << *X << std::endl; + Tensor X = Tensor(buf, args, body); + std::cout << "Tensor computation: " << X << std::endl; // Prints: // Tensor computation: Tensor X[64, 32]: // for (int i = 0; i < 64; i++) { @@ -171,11 +171,11 @@ int main(int argc, char* argv[]) { // constructing Exprs, Tensors also have a more convenient API for // construction. It is based on Compute API, which takes a name, // dimensions, and a lambda specifying the computation body: - Tensor* Z = Compute( + Tensor Z = Compute( "Z", {{64, "i"}, {32, "j"}}, [](const VarHandle& i, const VarHandle& j) { return i / j; }); - std::cout << "Tensor computation: " << *Z << std::endl; + std::cout << "Tensor computation: " << Z << std::endl; // Prints: // Tensor computation: Tensor Z[64, 32]: // for (int i = 0; i < 64; i++) { @@ -187,13 +187,13 @@ int main(int argc, char* argv[]) { // Tensors might access other tensors and external placeholders in their // expressions. It can be done like so: Placeholder P("P", kInt, {64, 32}); - Tensor* R = Compute( + Tensor R = Compute( "R", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return Z->load(i, j) * P.load(i, j); + return Z.load(i, j) * P.load(i, j); }); - std::cout << "Tensor computation: " << *R << std::endl; + std::cout << "Tensor computation: " << R << std::endl; // Prints: // Tensor computation: Tensor R[64, 32]: // for (int i = 0; i < 64; i++) { @@ -224,20 +224,20 @@ int main(int argc, char* argv[]) { // Let's create a simple tensor expression and construct a loop nest for it. 
Placeholder A("A", kFloat, {64, 32}); Placeholder B("B", kFloat, {64, 32}); - Tensor* X = Compute( + Tensor X = Compute( "X", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + B.load(i, j); }); - Tensor* Y = Compute( + Tensor Y = Compute( "Y", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return sigmoid(X->load(i, j)); + return sigmoid(X.load(i, j)); }); - std::cout << "Tensor computation X: " << *X - << "Tensor computation Y: " << *Y << std::endl; + std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y + << std::endl; // Prints: // Tensor computation X: Tensor X[64, 32]: // for (int i = 0; i < 64; i++) { @@ -256,9 +256,7 @@ int main(int argc, char* argv[]) { // Creating a loop nest is as quite simple, we just need to specify a list // of all and a list of output tensors: // NOLINTNEXTLINE(bugprone-argument-comment) - std::vector outputs = {Y}; - std::vector all = {X, Y}; - LoopNest loopnest(outputs, all); + LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y}); // An IR used in LoopNest is based on tensor statements, represented by // `Stmt` class. Statements are used to specify the loop nest structure, and @@ -357,7 +355,7 @@ int main(int argc, char* argv[]) { // Let's start by constructing a simple computation for us to work with: Placeholder A("A", kInt, {64, 32}); Placeholder B("B", kInt, {64, 32}); - Tensor* X = Compute( + Tensor X = Compute( "X", {{64, "i"}, {32, "j"}}, [&](const VarHandle& i, const VarHandle& j) { diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 27f6e54..0a34f47 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -500,7 +500,7 @@ struct TEWrapper { void optimizePointwise( tensorexpr::LoopNest* ln, - tensorexpr::Tensor* target, + tensorexpr::Tensor target, int width) { using namespace torch::jit::tensorexpr; std::vector loops = ln->getLoopStmtsFor(target); @@ -513,7 +513,7 @@ void optimizePointwise( std::shared_ptr wrapTECompute( std::shared_ptr wrap, tensorexpr::Placeholder& in, - tensorexpr::Tensor* out, + tensorexpr::Tensor out, tensorexpr::VarHandle& dim, int width = kVectorWidth) { using namespace torch::jit::tensorexpr; @@ -553,7 +553,7 @@ struct TEWrapper { std::shared_ptr wrapTECompute( std::shared_ptr wrap, tensorexpr::Placeholder& in, - tensorexpr::Tensor* out, + tensorexpr::Tensor out, tensorexpr::VarHandle& dim, int width = kVectorWidth) { return wrap; @@ -593,7 +593,7 @@ std::shared_ptr createLogit(c10::optional clamp) { auto wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto A_elem = [&]() { if (!clamp) { return A.load(i); @@ -619,7 +619,7 @@ std::shared_ptr createRelu() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto zero = FloatImm::make(0.f); auto a = A.load(i); return ifThenElse(a < zero, zero, a); @@ -638,7 +638,7 @@ std::shared_ptr createTanh() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) { + tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) { auto a = A.load(i); return 
fast_tanh(a); }); @@ -656,7 +656,7 @@ std::shared_ptr createSigmoid() { wrap = std::make_shared(); auto N = VarHandle("N", kInt); Placeholder A("A", kFloat, {N}); - Tensor* B = + Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); }); // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor // (Sleef_expf8). diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 77ba8e1..d7cfe78 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -104,7 +104,7 @@ class TORCH_API CodeGen { class CodeGen::BufferArg { public: BufferArg(const Placeholder& buffer) : buf_(buffer.data()) {} - BufferArg(Tensor* tensor) : buf_(tensor->buf()) {} + BufferArg(Tensor tensor) : buf_(tensor.buf()) {} BufferArg(const VarHandle& var) : var_(var.node()), isVar_(true) {} BufferArg(const BufHandle& buf) : buf_(buf.node()) {} diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index f885246..27b56e2 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -545,7 +545,7 @@ std::ostream& operator<<(std::ostream& stream, const Stmt& stmt) { } std::ostream& operator<<(std::ostream& stream, const Tensor& t) { - stream << std::to_string(&t); + stream << std::to_string(t); return stream; } @@ -568,7 +568,7 @@ void print(StmtPtr stmt) { } } -void print(const Tensor* t) { +void print(const Tensor& t) { std::cout << std::to_string(t); } @@ -589,20 +589,17 @@ std::string to_string(StmtPtr stmt) { return oss.str(); } -std::string to_string(const Tensor* t) { - if (!t) { - return "(null tensor)\n"; - } +std::string to_string(const Tensor& t) { std::ostringstream oss; // TODO: move this to Buf printer - oss << "Tensor " << t->buf()->name_hint() << "["; - for (const auto i : c10::irange(t->buf()->ndim())) { + oss << "Tensor " << t.buf()->name_hint() << "["; + for (const auto i : c10::irange(t.buf()->ndim())) { if (i != 0) { oss << ", "; } - oss << *t->buf()->dim(i); + oss << *t.buf()->dim(i); } - oss << "]:\n" << *t->stmt() << "\n"; + oss << "]:\n" << *t.stmt() << "\n"; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h index e76dcca..321d1ef 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.h +++ b/torch/csrc/jit/tensorexpr/ir_printer.h @@ -103,7 +103,7 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Tensor&); TORCH_API void print(ExprPtr expr); TORCH_API void print(StmtPtr stmt); -TORCH_API void print(const Tensor* t); +TORCH_API void print(const Tensor& t); } // namespace tensorexpr } // namespace jit @@ -119,5 +119,5 @@ using torch::jit::tensorexpr::Tensor; TORCH_API std::string to_string(ExprPtr expr); TORCH_API std::string to_string(StmtPtr stmt); -TORCH_API std::string to_string(const Tensor* t); +TORCH_API std::string to_string(const Tensor& t); } // namespace std diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index faacd02..8076ba2 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -960,7 +960,7 @@ std::vector TensorExprKernel::broadcastShapesMut( return res.first; } -Tensor* computeOneOperand( +Tensor computeOneOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -981,7 +981,7 @@ Tensor* computeOneOperand( }); } -Tensor* computeTwoOperand( +Tensor computeTwoOperand( const std::string& 
name, const std::vector& inputValues, const std::vector& outputShape, @@ -1004,7 +1004,7 @@ Tensor* computeTwoOperand( }); } -Tensor* computeTwoOperandWithAlpha( +Tensor computeTwoOperandWithAlpha( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1028,7 +1028,7 @@ Tensor* computeTwoOperandWithAlpha( }); } -Tensor* computeConditionWithTwoOperand( +Tensor computeConditionWithTwoOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1055,7 +1055,7 @@ Tensor* computeConditionWithTwoOperand( }); } -Tensor* computeThreeOperand( +Tensor computeThreeOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1083,7 +1083,7 @@ Tensor* computeThreeOperand( return demoteOutput(compute, outputType); }); } -Tensor* computeFourOperand( +Tensor computeFourOperand( const std::string& name, const std::vector& inputValues, const std::vector& outputShape, @@ -1135,7 +1135,7 @@ std::pair> processCatList( } return {highType, nonEmptyInputs}; } -Tensor* computeCatWoConditionals( +Tensor computeCatWoConditionals( const std::vector& inputs, const std::vector& outputShape) { // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) @@ -1164,7 +1164,7 @@ Tensor* computeCatWoConditionals( auto output_buf = alloc("aten_cat", output_sizes_expr, ToDtype(high_type)); if (non_empty_inputs.size() == 0) { - return new Tensor( + return Tensor( output_buf, alloc(std::vector({}))); } @@ -1213,10 +1213,10 @@ Tensor* computeCatWoConditionals( concat_dim_size = alloc(concat_dim_size, input_dims[norm_concat_dim].node()); } - return new Tensor(output_buf, IRSimplifier::simplify(block)); + return Tensor(output_buf, IRSimplifier::simplify(block)); } -Tensor* computeCat( +Tensor computeCat( const std::vector& inputs, const std::vector& outputShape, at::Device device) { @@ -1276,7 +1276,7 @@ Tensor* computeCat( }); } -Tensor* computeConv2d( +Tensor computeConv2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -1319,10 +1319,10 @@ Tensor* computeConv2d( dilation[0], dilation[1], groups}); - return new Tensor(ResultBuf.node(), s); + return Tensor(ResultBuf.node(), s); } -Tensor* tensorexpr::computeOperandValue( +Tensor tensorexpr::computeOperandValue( c10::Symbol op, const std::vector& inputs, const std::vector& outputShape, @@ -2391,7 +2391,7 @@ c10::optional findDtypeForValue(const torch::jit::Value* v) { return c10::nullopt; } -Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { +Tensor TensorExprKernel::computeValue(const torch::jit::Value* v) { auto inputs = v->node()->inputs(); auto op = v->node()->kind(); @@ -2703,9 +2703,9 @@ static std::vector toExprHandles(const std::vector& sizes) { return dims; } -Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) { +Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) { auto const& t = input->type(); - Tensor* result = nullptr; + Tensor result(nullptr, nullptr); switch (t->kind()) { case TypeKind::TensorType: { auto tt = input->type()->cast(); @@ -2744,7 +2744,7 @@ Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) { } return inBuffer.load(idx); }); - bufs_.emplace(input, result->buf()); + bufs_.emplace(input, result.buf()); bufferArgs_.emplace_back(inBuffer); break; @@ -2800,7 +2800,7 @@ bool denseAndNonOverlapping( return (strides == at::infer_dense_strides(sizes, strides)); } -Tensor* 
TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { +Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { const TensorTypePtr& tt = v->type()->expect(); TORCH_INTERNAL_ASSERT(bufs_.count(v)); BufPtr buf = bufs_.at(v); @@ -2816,19 +2816,19 @@ Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { auto sizes = *tt->sizes().concrete_sizes(); std::vector default_strides = TensorType::contiguousStridesOf(sizes); if (!tt->strides().concrete_sizes()) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes()); const std::vector strides = *tt->strides().concrete_sizes(); // All Tensors in NNC are layed out in default, contiguous layout. // If the output is also default contiguous we don't need to do anything if (strides == default_strides) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } // If the tensor is not dense or overlaps, we have // no way of matching the profiled striding if (!denseAndNonOverlapping(sizes, strides)) { - return new Tensor(buf, nullptr); + return Tensor(buf, nullptr); } auto dims = c10::fmap(sizesForValue(v)); @@ -2922,8 +2922,9 @@ void TensorExprKernel::compile() { nInputs_ = graph_->inputs().size(); genInputDebugNames(); for (auto const& input : graph_->inputs()) { - if (Tensor* t = bindInput(input)) { - block->append_stmt(t->stmt()); + Tensor t = bindInput(input); + if (t.stmt()) { + block->append_stmt(t.stmt()); } } @@ -2937,10 +2938,9 @@ void TensorExprKernel::compile() { } else { for (auto const& output : n->outputs()) { if (output->hasUses()) { - Tensor* t = computeValue(output); - bufs_.emplace(output, t->buf()); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - block->append_stmt(t->stmt()); + Tensor t = computeValue(output); + bufs_.emplace(output, t.buf()); + block->append_stmt(t.stmt()); } } } @@ -2958,12 +2958,12 @@ void TensorExprKernel::compile() { // The "strided" tensor will be incorrect if used in NNC, // since NNC views it as contiguous. 
Only convert it to the right // strides at the end of the kernel (if already contiguous it's a no-op) - Tensor* properly_strided_output = convertOutputToCorrectStrides(output); - if (properly_strided_output->stmt()) { - block->append_stmt(properly_strided_output->stmt()); + Tensor properly_strided_output = convertOutputToCorrectStrides(output); + if (properly_strided_output.stmt()) { + block->append_stmt(properly_strided_output.stmt()); } // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - bufs_[output] = properly_strided_output->buf(); + bufs_[output] = properly_strided_output.buf(); const auto& tt = output->type()->expect(); auto sizes = *tt->sizes().concrete_sizes(); tensorOutputSizes_.push_back(sizes); diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 7b35e1e..a8a57b9 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -47,7 +47,7 @@ using ArgValue = c10::variant< IntList, ArgNone>; -using NNCLoweringFunction = std::function&, const std::vector&, const c10::optional&, @@ -123,7 +123,7 @@ struct TensorInfo { c10::ScalarType dtype; }; -TORCH_API Tensor* computeOperandValue( +TORCH_API Tensor computeOperandValue( c10::Symbol op, const std::vector& inputs, const std::vector& outputShape, @@ -209,7 +209,7 @@ class TORCH_API TensorExprKernel { const torch::jit::Value* v, const std::vector& axes); - Tensor* computeValue(const torch::jit::Value* v); + Tensor computeValue(const torch::jit::Value* v); void bindConstant(const torch::jit::Value* v); @@ -222,9 +222,9 @@ class TORCH_API TensorExprKernel { std::vector& outputs); BackendType inferBackendTypeFromDevice(at::Device device); - Tensor* bindInput(const torch::jit::Value* input); + Tensor bindInput(const torch::jit::Value* input); - Tensor* convertOutputToCorrectStrides(torch::jit::Value* v); + Tensor convertOutputToCorrectStrides(torch::jit::Value* v); // Captures the information for reduction operation nodes. 
struct ReductionInfo { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index d9d2073..1904999 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -47,14 +47,14 @@ LoopNest::LoopNest(StmtPtr stmt, std::unordered_set output_bufs) // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) LoopNest::LoopNest( - const std::vector& output_tensors, - const std::vector& tensors_to_compute) { + const std::vector& output_tensors, + const std::vector& tensors_to_compute) { initialize(output_tensors, tensors_to_compute); verify(root_stmt_); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -LoopNest::LoopNest(const std::vector& output_tensors) { +LoopNest::LoopNest(const std::vector& output_tensors) { initialize(output_tensors, output_tensors); verify(root_stmt_); } @@ -486,15 +486,15 @@ bool LoopNest::vectorize(ForPtr f) { } void LoopNest::initialize( - const std::vector& output_tensors, - const std::vector& tensors_to_compute) { + const std::vector& output_tensors, + const std::vector& tensors_to_compute) { for (auto t : output_tensors) { - output_bufs_.insert(t->buf()); + output_bufs_.insert(t.buf()); } std::vector loops; - for (Tensor* t : tensors_to_compute) { - StmtPtr loop = t->stmt(); + for (Tensor t : tensors_to_compute) { + StmtPtr loop = t.stmt(); if (loop->get_parent()) { std::cerr << "Error: creating a loopnest from already used Tensors\n"; loops = {}; @@ -2384,7 +2384,7 @@ void LoopNest::compressAllBuffers(StmtPtr stmt) { } } -std::vector LoopNest::getLoopStmtsFor(Tensor* t) const { +std::vector LoopNest::getLoopStmtsFor(Tensor t) const { StmtPtr cur_stmt = getLoopBodyFor(t); return getLoopStmtsFor(cur_stmt); } @@ -2407,8 +2407,8 @@ std::vector LoopNest::getLoopStmtsFor(StmtPtr s) const { return result; } -StmtPtr LoopNest::getLoopBodyFor(Tensor* t) const { - return getLoopBodyFor(t->buf()); +StmtPtr LoopNest::getLoopBodyFor(Tensor t) const { + return getLoopBodyFor(t.buf()); } StmtPtr LoopNest::getLoopBodyFor(BufPtr buf) const { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index c8cf2d8..42f072d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -27,11 +27,11 @@ class TORCH_API LoopNest { public: // A constructor for building a LoopNest from a list of Tensors LoopNest( - const std::vector& output_tensors, - const std::vector& tensors_to_compute); + const std::vector& output_tensors, + const std::vector& tensors_to_compute); // A convenience constructor for the case when all tensors are output tensors - LoopNest(const std::vector& output_tensors); + LoopNest(const std::vector& output_tensors); // A constructor for building a LoopNest from an Stmt and a list of output // buffers. @@ -45,10 +45,10 @@ class TORCH_API LoopNest { return root_stmt_; } - std::vector getLoopStmtsFor(Tensor*) const; + std::vector getLoopStmtsFor(Tensor) const; std::vector getLoopStmtsFor(BufPtr) const; std::vector getLoopStmtsFor(StmtPtr) const; - StmtPtr getLoopBodyFor(Tensor*) const; + StmtPtr getLoopBodyFor(Tensor) const; StmtPtr getLoopBodyFor(BufPtr) const; // Returns the For stmt indexed by 'indices' in the 'root' For stmt. 
@@ -547,8 +547,8 @@ class TORCH_API LoopNest { private: void initialize( - const std::vector& output_tensors, - const std::vector& tensors_to_compute); + const std::vector& output_tensors, + const std::vector& tensors_to_compute); StmtPtr insertAllocFree(StmtPtr stmt); const std::unordered_set getIntermediateBufs() const; diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp index c4af83a..51d323f 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp @@ -16,7 +16,7 @@ void assert_dims_constant(const BufHandle& buf) { using InitFunc = std::function&)>; -Tensor* conv2d_depthwise_static( +Tensor conv2d_depthwise_static( BufHandle input, BufHandle weight, const InitFunc& init_func, @@ -45,7 +45,7 @@ Tensor* conv2d_depthwise_static( auto OH = (H - R + 2 * pad) / stride + 1; auto OW = (W - S + 2 * pad) / stride + 1; - Tensor* conv = Reduce( + Tensor conv = Reduce( "conv2d_depthwise", {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, Sum(), @@ -83,7 +83,7 @@ Tensor* conv2d_depthwise_static( } else if (R == 3 && stride == 1 && pad == 1) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr main, peeled; - auto loops = nest.getAllLoopNestsWritingToBuf(conv->buf()); + auto loops = nest.getAllLoopNestsWritingToBuf(conv.buf()); main = loops[1][kLoopW]; nest.sliceHead(main, 1, &peeled, &main); nest.sliceTail(main, 1, &main, &peeled); @@ -92,10 +92,10 @@ Tensor* conv2d_depthwise_static( nest.sliceTail(main, 1, &main, &peeled); } - return new Tensor(conv->buf(), nest.root_stmt()); + return Tensor(conv.buf(), nest.root_stmt()); } -Tensor* conv2d_depthwise_dynamic( +Tensor conv2d_depthwise_dynamic( BufHandle input, BufHandle weight, const InitFunc& init_func, @@ -144,7 +144,7 @@ Tensor* conv2d_depthwise_dynamic( } // namespace -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -158,7 +158,7 @@ Tensor* conv2d_depthwise( return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, int stride, @@ -170,7 +170,7 @@ Tensor* conv2d_depthwise( return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -206,7 +206,7 @@ Tensor* conv2d_depthwise( groups); } -Tensor* conv2d_depthwise( +Tensor conv2d_depthwise( BufHandle input, BufHandle weight, ExprHandle N, diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h index 14612fb..4c2215b 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.h +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h @@ -7,7 +7,7 @@ namespace jit { namespace tensorexpr { // An API to compute 2D depthwise convolutions with bias. -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -16,14 +16,14 @@ TORCH_API Tensor* conv2d_depthwise( int groups); // An API to compute 2D depthwise convolutions without bias. 
-TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, int stride, int pad, int groups); -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, BufHandle bias, @@ -39,7 +39,7 @@ TORCH_API Tensor* conv2d_depthwise( ExprHandle pad, ExprHandle groups); -TORCH_API Tensor* conv2d_depthwise( +TORCH_API Tensor conv2d_depthwise( BufHandle input, BufHandle weight, ExprHandle N, diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp index 23cb455..581514c 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp +++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp @@ -5,7 +5,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeMatmul( +Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -44,13 +44,13 @@ Tensor* computeMatmul( }, {{size_a[1], "K"}}); } else { - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make(ResultBuf, "nnc_aten_matmul", {a, b}, {})); } } -Tensor* computeAddMM( +Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -59,7 +59,7 @@ Tensor* computeAddMM( dtype = Dtype(*outputType); } BufHandle ResultBuf("addmm", outputShape, dtype); - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h index 35b30f4..0b52ad6 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.h +++ b/torch/csrc/jit/tensorexpr/operators/matmul.h @@ -6,11 +6,11 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeMatmul( +Tensor computeMatmul( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); -Tensor* computeAddMM( +Tensor computeAddMM( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp index d96ebcd..610f928 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.cpp +++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp @@ -4,7 +4,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeBatchNorm( +Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h index 98d53b4..7f1412f 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.h +++ b/torch/csrc/jit/tensorexpr/operators/norm.h @@ -6,7 +6,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeBatchNorm( +Tensor computeBatchNorm( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp index c1f3f7f..fe5cb6d 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp +++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp @@ -19,7 +19,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeSum( +Tensor computeSum( const std::vector& inputs, const c10::optional& outputType) { std::vector axes; @@ -100,7 +100,7 @@ Tensor* computeSum( reductionDims); } -Tensor* computeMean( +Tensor computeMean( const std::vector& 
inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -120,13 +120,13 @@ Tensor* computeMean( mean_dims_expr.emplace_back(idx); } } - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, "nnc_aten_mean", {InputBuf}, mean_dims_expr)); } -Tensor* computeAdaptiveAvgPool2d( +Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType) { @@ -137,7 +137,7 @@ Tensor* computeAdaptiveAvgPool2d( BufHandle ResultBuf("adaptive_avgpool2d", outputShape, dtype); // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) auto out_size_param = c10::get(inputs[1]); - return new Tensor( + return Tensor( ResultBuf.node(), ExternalCall::make( ResultBuf, diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 4335d7b..d76bac6 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -6,14 +6,14 @@ namespace torch { namespace jit { namespace tensorexpr { -TORCH_API Tensor* computeSum( +TORCH_API Tensor computeSum( const std::vector& inputs, const c10::optional& outputType); -TORCH_API Tensor* computeMean( +TORCH_API Tensor computeMean( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); -TORCH_API Tensor* computeAdaptiveAvgPool2d( +TORCH_API Tensor computeAdaptiveAvgPool2d( const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType); diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.cpp b/torch/csrc/jit/tensorexpr/operators/softmax.cpp index d6cb6c0..c1c2872 100644 --- a/torch/csrc/jit/tensorexpr/operators/softmax.cpp +++ b/torch/csrc/jit/tensorexpr/operators/softmax.cpp @@ -6,7 +6,7 @@ namespace tensorexpr { using namespace torch::jit::tensorexpr; -Tensor* computeSoftmax( +Tensor computeSoftmax( const std::vector& inputs, const std::vector& outputShape, bool log_softmax) { @@ -111,48 +111,43 @@ Tensor* computeSoftmax( Compute("aten_softmax_exp", output_dims, [&](ParameterList& indices) { auto inp = tensorOrConstant( inputs[0], convert_indices_to_expr_handle(indices)); - return exp(inp - max->load(remove_softmax_dim_index(indices))); + return exp(inp - max.load(remove_softmax_dim_index(indices))); }); auto sum = Reduce( "aten_softmax_sum", non_softmax_dims, Sum(), [&](ParameterList& indices) { - return e->load(move_softmax_dim_index_to_pos(indices)); + return e.load(move_softmax_dim_index_to_pos(indices)); }, {output_dims[softmax_dim]}); if (!log_softmax) { auto result = Compute("aten_softmax", output_dims, [&](ParameterList& indices) { - return e->load(indices) / - sum->load(remove_softmax_dim_index(indices)); + return e.load(indices) / sum.load(remove_softmax_dim_index(indices)); }); - return new Tensor( - result->buf(), + return Tensor( + result.buf(), alloc(std::vector( - {max->stmt(), e->stmt(), sum->stmt(), result->stmt()}))); + {max.stmt(), e.stmt(), sum.stmt(), result.stmt()}))); } auto log_sum = Compute( "aten_softmax_log_sum", non_softmax_dims, [&](ParameterList& indices) { - return log(sum->load(indices)); + return log(sum.load(indices)); }); auto result = Compute("aten_log_softmax", output_dims, [&](ParameterList& indices) { auto inp = tensorOrConstant( inputs[0], convert_indices_to_expr_handle(indices)); auto non_softmax_indices = remove_softmax_dim_index(indices); - return inp - max->load(non_softmax_indices) - - log_sum->load(non_softmax_indices); + return inp - 
max.load(non_softmax_indices) - + log_sum.load(non_softmax_indices); }); - return new Tensor( - result->buf(), + return Tensor( + result.buf(), alloc(std::vector( - {max->stmt(), - e->stmt(), - sum->stmt(), - log_sum->stmt(), - result->stmt()}))); + {max.stmt(), e.stmt(), sum.stmt(), log_sum.stmt(), result.stmt()}))); } } // namespace tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.h b/torch/csrc/jit/tensorexpr/operators/softmax.h index 07ddd0f..b74a867 100644 --- a/torch/csrc/jit/tensorexpr/operators/softmax.h +++ b/torch/csrc/jit/tensorexpr/operators/softmax.h @@ -6,7 +6,7 @@ namespace torch { namespace jit { namespace tensorexpr { -Tensor* computeSoftmax( +Tensor computeSoftmax( const std::vector& inputs, const std::vector& outputShape, bool log_softmax); diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index 9df70f8..ea3902d 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -48,7 +48,7 @@ StmtPtr Tensor::constructStmt( return s; } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function&)>& body_func) { @@ -57,10 +57,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarVectorToVarHandleVector(args)).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function& body_func) { @@ -73,10 +73,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarHandle(args[0])).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function& @@ -89,10 +89,10 @@ Tensor* Compute( unpack_dim_args(dim_args, &dims, &args); ExprPtr body = body_func(VarHandle(args[0]), VarHandle(args[1])).node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function< @@ -108,10 +108,10 @@ Tensor* Compute( body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2])) .node(); BufPtr buf = alloc(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Compute( +Tensor Compute( const std::string& name, const std::vector& dim_args, const std::function(name, dims, body->dtype()); - return new Tensor(buf, args, body); + return Tensor(buf, args, body); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -149,7 +149,7 @@ Tensor* Reduce( reduce_args); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -163,17 +163,17 @@ Tensor* Reduce( reduce_args); } -Tensor* Reduce( +Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, - Tensor* tensor, + Tensor tensor, const std::vector& reduce_args) { return Reduce( name, dim_args, reducer, - [&](ParameterList& p) { return tensor->load(p); }, + [&](ParameterList& p) { return tensor.load(p); }, reduce_args); } diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index 3eb02c6..8d8ffe5 100644 --- 
a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -12,7 +12,7 @@ namespace torch { namespace jit { namespace tensorexpr { -class TORCH_API Tensor : KernelScopedObject { +class TORCH_API Tensor { public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) Tensor(BufPtr buf, const std::vector& args, ExprPtr body) @@ -42,9 +42,9 @@ class TORCH_API Tensor : KernelScopedObject { } template - inline ExprHandle load(const std::vector& args); + inline ExprHandle load(const std::vector& args) const; template - inline ExprHandle load(const Ts&... ts); + inline ExprHandle load(const Ts&... ts) const; private: StmtPtr constructStmt( @@ -134,22 +134,22 @@ class Placeholder { std::vector strides_; }; -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function< ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function& body_func); -TORCH_API Tensor* Compute( +TORCH_API Tensor Compute( const std::string& func_name, const std::vector& dim_args, const std::function&)>& body_func); @@ -179,7 +179,7 @@ inline void unpack_dim_args( // Handle reductions over a Reducer and a body_func which produces values. template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -207,7 +207,7 @@ Tensor* Reduce( .node(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) BufPtr func_result = alloc(func_name, dims, body->dtype()); - return new Tensor(func_result, vars, body); + return Tensor(func_result, vars, body); } // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -227,13 +227,12 @@ Tensor* Reduce( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ReduceOpPtr reduce_op = reducer(func_result, body, output_args, reduce_vars); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - Tensor* t = - new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); + Tensor t = Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op); return t; } template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -250,7 +249,7 @@ Tensor* Reduce( // Overload which allows inline lambda functions for the body_func. template -Tensor* Reduce( +Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, @@ -260,14 +259,14 @@ Tensor* Reduce( } // Overload for the common case of all dimensions of a Placeholder. -TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, const Placeholder& buffer, const std::vector& reduce_args); -TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& name, const std::vector& dim_args, const Reducer& reducer, @@ -276,22 +275,22 @@ TORCH_API Tensor* Reduce( // Overload for the common case of all dimensions of a prevously Computed // Tensor. 
-TORCH_API Tensor* Reduce( +TORCH_API Tensor Reduce( const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, - Tensor* tensor, + Tensor tensor, const std::vector& reduce_args); template -inline ExprHandle Tensor::load(const Ts&... ts) { +inline ExprHandle Tensor::load(const Ts&... ts) const { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector params({ExprHandle(ts)...}); return Load::make(BufHandle(this->buf()), params); } template -inline ExprHandle Tensor::load(const std::vector& args) { +inline ExprHandle Tensor::load(const std::vector& args) const { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector params(args.begin(), args.end()); return Load::make(BufHandle(this->buf()), params); diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp index 4e1618a..c380233 100644 --- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp +++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp @@ -185,9 +185,9 @@ void initTensorExprBindings(PyObject* module) { const std::vector& args, const ExprHandle& val) { return self.store(args, val); }) .def("data", [](Placeholder& self) { return BufHandle(self.data()); }); - py::class_>(te, "Tensor") - .def(py::init( - [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); })) + py::class_(te, "Tensor") + .def( + py::init([](BufHandle& b, StmtPtr s) { return Tensor(b.node(), s); })) .def( "load", [](Tensor& self, const std::vector& v) { @@ -268,7 +268,7 @@ void initTensorExprBindings(PyObject* module) { [](const std::string& func_name, const std::vector& dim_args, const Reducer& reducer, - Tensor* buffer, + Tensor buffer, const std::vector& reduce_args) { return Reduce(func_name, dim_args, reducer, buffer, reduce_args); }, @@ -380,7 +380,7 @@ void initTensorExprBindings(PyObject* module) { .def(py::init(&ExternalCall::make)); py::class_(te, "LoopNest") - .def(py::init&>()) + .def(py::init&>()) .def(py::init([](StmtPtr s, const std::vector& bufs) { std::unordered_set buf_nodes; for (auto& buf : bufs) { @@ -392,9 +392,7 @@ void initTensorExprBindings(PyObject* module) { .def("prepare_for_codegen", &LoopNest::prepareForCodegen) .def( "get_loop_body_for", - [](const LoopNest& self, Tensor* t) { - return self.getLoopBodyFor(t); - }, + [](const LoopNest& self, Tensor t) { return self.getLoopBodyFor(t); }, py::return_value_policy::reference) .def( "get_loop_body_for", @@ -404,7 +402,7 @@ void initTensorExprBindings(PyObject* module) { py::return_value_policy::reference) .def( "get_loops_for", - [](const LoopNest& self, Tensor* t) { + [](const LoopNest& self, Tensor t) { return self.getLoopStmtsFor(t); }, py::return_value_policy::reference) @@ -760,12 +758,12 @@ void initTensorExprBindings(PyObject* module) { py::class_(te, "BufferArg") .def(py::init()) - .def(py::init()) + .def(py::init()) .def(py::init()) .def(py::init()); py::implicitly_convertible(); - py::implicitly_convertible(); + py::implicitly_convertible(); py::implicitly_convertible(); py::implicitly_convertible(); -- 2.7.4
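For reference, a minimal call-site sketch of the API after this change, assuming the includes, the function name, the dimension sizes, and the particular ops shown here (they are illustrative, not taken from the patch); the patterns themselves — Compute/Reduce returning Tensor by value, LoopNest built from std::vector<Tensor>, member access via '.' instead of '->', and no KernelScope/KernelArena — mirror the hunks above.

#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>

using namespace torch::jit::tensorexpr;

void valueTypeTensorExample() {
  // No KernelScope/KernelArena is needed after this change.
  Placeholder A("A", kFloat, {64});

  // Compute returns Tensor by value (previously Tensor*).
  Tensor B = Compute("B", {{64, "i"}}, [&](const VarHandle& i) {
    return A.load(i) + FloatImm::make(1.0f);
  });

  // The Reduce overload taking a previously computed Tensor also takes and
  // returns Tensor by value now.
  Tensor S = Reduce("sum", {}, Sum(), B, {{64, "i"}});

  // LoopNest is constructed from std::vector<Tensor> rather than
  // std::vector<Tensor*>, and Tensor members are accessed with '.'.
  LoopNest nest({B, S});
  auto loops = nest.getLoopStmtsFor(B);
  (void)loops;
  (void)S.buf();
  nest.prepareForCodegen();
}

The same value semantics carry over to the operator helpers touched above (computeSum, computeMean, computeMatmul, computeSoftmax, conv2d_depthwise, and so on), which now return Tensor directly instead of a heap-allocated Tensor*.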